# Genero

Notebook para extraer un subconjunto con el eid y el género biológico (sexo genético) a partir de los datos de genomics.

In [1]:
# Import the required libraries for data analysis and visualization
import pandas as pd # Para manejar dataframes
import numpy as np # Para operaciones matemáticas
import os # Para manejar rutas
import matplotlib.pyplot as plt # Para graficar
import seaborn as sns # Para graficar
import re # Para quitar el patron '_0_0'
from sklearn.experimental import enable_iterative_imputer  # IterativeImputer
from sklearn.impute import IterativeImputer # IterativeImputer
from scipy import stats # Para outliers
import shap # Para SHAP
from sklearn.model_selection import train_test_split # Para dividir el conjunto de datos
from sklearn.ensemble import RandomForestClassifier # Para Random Forest
from sklearn.metrics import classification_report # Para evaluar
from sklearn.preprocessing import LabelEncoder # Para pasar variables categoricas a numericas

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory (relative to this notebook) that holds the raw data files
path = "../../main/data/UK_BIOBANK_DATA"

# Load the tab-separated genomics table
genomics_file = os.path.join(path, "genomics_data.tsv")
genomics_df = pd.read_csv(genomics_file, sep="\t")

# Preview the first rows
genomics_df.head()

Unnamed: 0,eid,f_22000_0_0,f_22001_0_0,f_22003_0_0,f_22004_0_0,f_22005_0_0,f_22006_0_0,f_22009_0_1,f_22009_0_2,f_22009_0_3,...,f_22024_0_0,f_22025_0_0,f_22026_0_0,f_22027_0_0,f_22028_0_0,f_22029_0_0,f_22030_0_0,f_22007_0_0,f_22008_0_0,f_22182_0_0
0,1000012,10.0,0.0,0.186874,0.18824,0.008535,1.0,-12.4522,2.97502,-1.52667,...,44.74,99.059,0.97,,1.0,1.0,1.0,SMP4_0010894A,G03,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,..."
1,1000021,-7.0,1.0,0.190092,0.191694,0.001877,1.0,-13.8955,6.01974,-2.01115,...,93.3443,99.477,0.9528,,1.0,1.0,1.0,SMP4_0008357,B08,"0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,1000035,-2.0,0.0,0.190206,0.191802,0.001883,1.0,-11.7626,4.23129,-3.09134,...,30.1373,99.595,0.96018,,1.0,1.0,1.0,SMP4_0008467,C11,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,..."
3,1000044,57.0,1.0,0.190249,0.191653,0.003119,1.0,-11.2151,4.37638,0.766213,...,29.34,99.58,0.98,,1.0,1.0,1.0,SMP4_0011008A,A03,"0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,..."
4,1000053,69.0,0.0,0.190025,0.191251,0.002626,1.0,-10.4144,2.19996,0.54522,...,22.18,99.572,0.97,,1.0,1.0,1.0,SMP4_0013058A,H12,"0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [4]:
# Load the table that describes each genomics field (Field, FieldID, ValueType, ...).
# skiprows=5 drops the first five lines of the file before parsing
# (presumably a preamble above the header row — TODO confirm).
chars_file = os.path.join(path, 'genomics_chars.tsv')
genomics_chars_df = pd.read_csv(chars_file, sep="\t", skiprows=5)

# Preview the first rows
genomics_chars_df.head()

Unnamed: 0,Field,FieldID,ValueType,Coding,Category,Participants,Items,Stability,Units,ItemType,Strata,Sexed,Instances,Array,Notes,Link,Path
0,Genotype measurement batch,22000,Categorical single,22000.0,100313,488127,488127,Accruing,,Data,Primary,Unisex,1,1,Genotyping was done using two closely related ...,http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=...,Genomics > Genotypes > Genotyping process and ...
1,Genetic sex,22001,Categorical single,9.0,100313,488127,488127,Complete,,Data,Primary,Unisex,1,1,Sex as determined from genotyping analysis. ...,http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=...,Genomics > Genotypes > Genotyping process and ...
2,Heterozygosity,22003,Continuous,,100313,488127,488127,Accruing,Fraction,Data,Derived,Unisex,1,1,Heterozygosity across a set of high-quality ma...,http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=...,Genomics > Genotypes > Genotyping process and ...
3,"Heterozygosity, PCA corrected",22004,Continuous,,100313,488127,488127,Ongoing,Fraction,Data,Derived,Unisex,1,1,Heterozygosity after adjusting for ancestry us...,http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=...,Genomics > Genotypes > Genotyping process and ...
4,Missingness,22005,Continuous,,100313,488127,488127,Accruing,Fraction,Data,Derived,Unisex,1,1,Missing rate of each sample based on a set of ...,http://biobank.ndph.ox.ac.uk/ukb/field.cgi?id=...,Genomics > Genotypes > Genotyping process and ...


In [5]:
# Load the tab-separated coding-keys table (coding_name, code, meaning, ...)
cbkeys_file = os.path.join(path, "genomics_cbkeys.tsv")
genomics_cbkeys_df = pd.read_csv(cbkeys_file, sep="\t")

# Preview the first rows
genomics_cbkeys_df.head()

Unnamed: 0,coding_name,code,meaning,concept,display_order,parent_code
0,9,0,Female,,1,
1,9,1,Male,,2,
2,1,1,Yes,,1,
3,22000,1000,BiLEVE,,1,
4,22000,-11,UKBiLEVEAX_b11,,2,1000.0


In [10]:
def rename_columns_with_field_names(df, genomincs_chars_df, exclude_cols=('eid',), disambiguate=False):
    """Rename ``f_<FieldID>_<n>_<m>`` columns of ``df`` to human-readable field names.

    The lookup table is built from ``genomincs_chars_df``: rows missing
    ``FieldID`` or ``Field`` are dropped, ``FieldID`` is cast to ``int`` and
    ``Field`` is stripped of surrounding whitespace. Neither input frame is
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed.
    genomincs_chars_df : pandas.DataFrame
        Metadata frame with at least the columns ``FieldID`` and ``Field``.
    exclude_cols : iterable of column labels, default ``('eid',)``
        Columns that are skipped silently (neither renamed nor reported).
    disambiguate : bool, default ``False``
        Several columns can share one FieldID (e.g. ``f_22009_0_1`` and
        ``f_22009_0_2``) and therefore receive the same field name, which
        yields duplicate column labels. When ``True``, every column whose
        field name is shared gets the two trailing numbers of its original
        label appended (``"<Field>_<n>_<m>"``). The default keeps the
        original behaviour (duplicate labels are possible).

    Returns
    -------
    pandas.DataFrame
        A new frame with the matched columns renamed.

    Notes
    -----
    A message is printed for every non-excluded column that either does not
    start with the ``f_<digits>_<digits>_<digits>`` pattern or whose FieldID
    has no (non-empty) name in the metadata; such columns keep their label.
    """
    # Work on a cleaned copy so the caller's metadata frame is left untouched.
    chars = genomincs_chars_df.dropna(subset=['FieldID', 'Field']).copy()
    chars['FieldID'] = chars['FieldID'].astype(int)
    chars['Field'] = chars['Field'].str.strip()
    fieldid_to_name = dict(zip(chars['FieldID'], chars['Field']))

    # Same prefix match as before; the two extra groups are only captured so
    # they can be reused as a suffix when disambiguating.
    column_pattern = re.compile(r'f_(\d+)_(\d+)_(\d+)')

    # First pass: resolve every column to (original label, field name, suffix).
    resolved = []
    for col in df.columns:
        if col in exclude_cols:
            continue

        # Non-string labels (e.g. integer columns) cannot match the pattern;
        # report them instead of letting the regex raise TypeError.
        match = column_pattern.match(col) if isinstance(col, str) else None
        if not match:
            print(f" No coincide con patrón: {col}")
            continue

        fieldid = int(match.group(1))
        new_name = fieldid_to_name.get(fieldid)
        if not new_name:
            print(f" FieldID {fieldid} encontrado en columna '{col}' no está en genomincs_chars_df")
            continue

        resolved.append((col, new_name, f"_{match.group(2)}_{match.group(3)}"))

    # Count how many columns map to each field name to detect collisions.
    name_counts = {}
    for _, new_name, _ in resolved:
        name_counts[new_name] = name_counts.get(new_name, 0) + 1

    # Second pass: build the rename mapping, adding the suffix only on request
    # and only where the plain field name would be ambiguous.
    rename_dict = {}
    for col, new_name, suffix in resolved:
        if disambiguate and name_counts[new_name] > 1:
            rename_dict[col] = new_name + suffix
        else:
            rename_dict[col] = new_name

    return df.rename(columns=rename_dict)

In [14]:
# Replace the f_<FieldID>_..._... column labels with the field names
# taken from the field-description table.
gendre_df = rename_columns_with_field_names(
    df=genomics_df,
    genomincs_chars_df=genomics_chars_df,
)

# Preview the renamed frame
gendre_df.head()

Unnamed: 0,eid,Genotype measurement batch,Genetic sex,Heterozygosity,"Heterozygosity, PCA corrected",Missingness,Genetic ethnic grouping,Genetic principal components,Genetic principal components.1,Genetic principal components.2,...,DNA concentration,"Affymetrix quality control metric """"""""Cluster.CR""""""""","Affymetrix quality control metric """"""""dQC""""""""",Outliers for heterozygosity or missing rate,Use in phasing Chromosomes 1-22,Use in phasing Chromosome X,Use in phasing Chromosome XY,Genotype measurement plate,Genotype measurement well,HLA imputation values
0,1000012,10.0,0.0,0.186874,0.18824,0.008535,1.0,-12.4522,2.97502,-1.52667,...,44.74,99.059,0.97,,1.0,1.0,1.0,SMP4_0010894A,G03,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,..."
1,1000021,-7.0,1.0,0.190092,0.191694,0.001877,1.0,-13.8955,6.01974,-2.01115,...,93.3443,99.477,0.9528,,1.0,1.0,1.0,SMP4_0008357,B08,"0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,1000035,-2.0,0.0,0.190206,0.191802,0.001883,1.0,-11.7626,4.23129,-3.09134,...,30.1373,99.595,0.96018,,1.0,1.0,1.0,SMP4_0008467,C11,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,..."
3,1000044,57.0,1.0,0.190249,0.191653,0.003119,1.0,-11.2151,4.37638,0.766213,...,29.34,99.58,0.98,,1.0,1.0,1.0,SMP4_0011008A,A03,"0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,..."
4,1000053,69.0,0.0,0.190025,0.191251,0.002626,1.0,-10.4144,2.19996,0.54522,...,22.18,99.572,0.97,,1.0,1.0,1.0,SMP4_0013058A,H12,"0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [15]:
# Numeric code -> label for the 'Genetic sex' column; any other value maps to NaN.
sex_labels = {0.0: 'Female', 1.0: 'Male'}

# Keep only the participant id and the genetic sex, with the sex as a text label.
gendre_sex_df = gendre_df[['eid', 'Genetic sex']].assign(
    **{'Genetic sex': lambda frame: frame['Genetic sex'].map(sex_labels)}
)

gendre_sex_df.head()

Unnamed: 0,eid,Genetic sex
0,1000012,Female
1,1000021,Male
2,1000035,Female
3,1000044,Male
4,1000053,Female


In [16]:
# (rows, columns) of the eid / genetic-sex table
gendre_sex_df.shape

(502180, 2)

In [19]:
# Persist the eid / genetic-sex table next to the raw data files.
# NOTE(review): with the default index=True the row index is also written as
# an unnamed first column; pass index=False if downstream readers do not
# need it — confirm before changing.
output_file = os.path.join(path, "genetic_sex.csv")
gendre_sex_df.to_csv(output_file)