# Notebook to target important medical values for IBD patients in the phenotype table

## Packages

In [381]:
# Import the required libraries for data analysis and visualization
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.experimental import enable_iterative_imputer  # IterativeImputer
from sklearn.impute import IterativeImputer
from scipy import stats


## Set paths

In [382]:
# Set the paths.
# The data directory can be overridden through the UKB_DATA_DIR environment
# variable, so the notebook is not tied to one user's home directory; when the
# variable is unset the original location is used unchanged.
path = os.environ.get(
    "UKB_DATA_DIR",
    "/Users/fjosesala/Documents/GitHub/Horizontal_UKB/main/data/UK_BIOBANK_DATA/",
)
# Relative output folders (presumably for figures and result tables; they are
# not used in the cells visible here — TODO confirm).
path_graphs = "outputs/graphs"
path_results = "outputs/results"

## Load data

In [383]:
# Load the input tables; every file is tab-separated and located under `path`.

def _read_tsv(file_name):
    """Read the tab-separated file `file_name` from the data directory `path`."""
    return pd.read_csv(os.path.join(path, file_name), sep="\t")

# Phenotypes
phenotypes_df = _read_tsv("phenotype_data.tsv")

# Phenotype chars (NOTE(review): the original cell tagged this line with "#change")
phenotypes_chars_df = _read_tsv("phenotype_chars.tsv")

# Phenotype keys
phenotypes_keys_df = _read_tsv('phenotype_cbkeys.tsv')

# Olink data
ibd0_df = _read_tsv("ibd0.tsv")



  ibd0_df= pd.read_csv(os.path.join(path, "ibd0.tsv"),sep="\t")


## Limpieza

In [384]:
# Keep 'eid' plus the columns whose name ends in '_0_0' (the "cohort 0" columns,
# in the words of the original comment).
# The full '_0_0' suffix is matched (not just '0_0') so that names such as
# '..._10_0' are not selected; the stripping regex below would not rename them.
selected_columns = ['eid'] + [col for col in phenotypes_df.columns if col.endswith('_0_0')]
phenotypes_df = phenotypes_df[selected_columns]

# Strip the '_0_0' suffix and the 'f_' prefix from the column names
phenotypes_df.columns = [re.sub(r'^f_', '', re.sub(r'_0_0$', '', col)) for col in phenotypes_df.columns]

# Rename the columns with the FieldID -> Field lookup taken from phenotypes_chars_df.
# A non-inplace rename is used so the result is always a fresh frame.
rename_dict = dict(zip(phenotypes_chars_df['FieldID'].astype(str), phenotypes_chars_df['Field']))
phenotypes_df = phenotypes_df.rename(columns=rename_dict)



### Eliminar NAs

In [385]:
# Attach the 'Disease' column of ibd0_df to phenotypes_df (left join on 'eid').
# A 'Disease' column left over from a previous run of this cell is dropped first;
# otherwise re-running would create 'Disease_x'/'Disease_y' and the column
# selection below would raise a KeyError.
phenotypes_df = (
    phenotypes_df
    .drop(columns=['Disease'], errors='ignore')
    .merge(ibd0_df[['eid', 'Disease']], on='eid', how='left')
)

# Reorder the columns so that 'Disease' comes second, right after 'eid'
phenotypes_df = phenotypes_df[['eid', 'Disease'] + [col for col in phenotypes_df.columns if col not in ['eid', 'Disease']]]

# Drop the rows without a 'Disease' value
phenotypes_df = phenotypes_df.dropna(subset=['Disease'])

# Drop the columns with more than 50% missing values.
# `thresh` is the minimum number of non-NA values a column needs in order to be
# kept; it is rounded up to an integer, which selects the same columns as the
# fractional value would.
threshold = int(np.ceil(0.5 * len(phenotypes_df)))
phenotypes_df = phenotypes_df.dropna(thresh=threshold, axis=1)




In [386]:
# Inspect the coding-key table (columns coding_name / code / meaning)
phenotypes_keys_df

Unnamed: 0,coding_name,code,meaning,concept,display_order,parent_code
0,7,1,Yes,,1,
1,7,0,No,,2,
2,100291,-1,Do not know,,1,
3,100291,-3,Prefer not to answer,,2,
4,100347,1,"Yes, on most or all days",,1,
...,...,...,...,...,...,...
338,1001,-1,Do not know,,21,
339,1001,-3,Prefer not to answer,,22,
340,100700,0,low,,1,
341,100700,1,moderate,,2,


In [387]:
# Complete-case version of the table: drop every row that has a missing value in any column
pheno_sinNAs = phenotypes_df.dropna(axis=0, how="any")
pheno_sinNAs

Unnamed: 0,eid,Disease,Current tobacco smoking,Past tobacco smoking,Smoking/smokers in household,Exposure to tobacco smoke at home,Exposure to tobacco smoke outside home,Cooked vegetable intake,Salad / raw vegetable intake,Fresh fruit intake,...,Ever smoked,Ethnic background,IPAQ activity group,Summed days activity,Summed minutes activity,At or above moderate/vigorous recommendation,At or above moderate/vigorous/walking recommendation,MET minutes per week for walking,MET minutes per week for moderate activity,MET minutes per week for vigorous activity
984,1009851,IBD,0.0,4.0,0.0,0.0,2.0,4.0,3.0,4.0,...,0.0,1003.0,2.0,8.0,180.0,0.0,1.0,2772.0,240.0,0.0
1181,1011821,IBD,0.0,4.0,0.0,0.0,0.0,4.0,4.0,2.0,...,0.0,1001.0,2.0,10.0,110.0,1.0,1.0,330.0,240.0,1440.0
1768,1017690,Control,0.0,4.0,1.0,0.0,0.0,2.0,1.0,2.0,...,0.0,1001.0,1.0,14.0,50.0,1.0,1.0,462.0,840.0,0.0
1856,1018575,Control,0.0,3.0,0.0,0.0,0.0,4.0,1.0,3.0,...,1.0,1001.0,2.0,13.0,380.0,1.0,1.0,2376.0,5040.0,320.0
1989,1019905,Control,0.0,4.0,0.0,0.0,0.0,2.0,2.0,1.0,...,0.0,1001.0,1.0,10.0,150.0,1.0,1.0,198.0,1920.0,240.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499951,6000162,Control,0.0,1.0,0.0,0.0,0.0,2.0,3.0,7.0,...,1.0,1001.0,1.0,7.0,30.0,0.0,1.0,693.0,0.0,0.0
500086,6001510,Control,0.0,2.0,0.0,0.0,0.0,3.0,6.0,2.0,...,1.0,1001.0,1.0,6.0,60.0,0.0,1.0,495.0,0.0,240.0
500822,6008871,Control,0.0,4.0,0.0,0.0,0.0,3.0,4.0,2.0,...,0.0,1001.0,1.0,12.0,135.0,1.0,1.0,924.0,720.0,560.0
501053,6011186,IBD,0.0,3.0,1.0,0.0,0.0,1.0,1.0,3.0,...,1.0,1001.0,1.0,7.0,50.0,0.0,1.0,594.0,80.0,0.0


### Imputar variable por variable

In [388]:
# Print a preview of the distinct values of every column (at most 10 per column)
divider = "-" * 40
for col in phenotypes_df.columns:
    preview = phenotypes_df[col].unique()[:10]  # first 10 distinct values, in order of appearance
    print(f"{col}: {preview}")
    print(divider)

eid: [1000298 1000767 1000952 1001301 1001358 1001662 1002418 1002677 1002943
 1002990]
----------------------------------------
Disease: ['Control' 'IBD' 'CD' 'UC']
----------------------------------------
Current tobacco smoking: [ 0.  1.  2. nan -3.]
----------------------------------------
Past tobacco smoking: [ 2.  4.  1.  3. nan -3.]
----------------------------------------
Smoking/smokers in household: [ 0.  1. nan  2. -3.]
----------------------------------------
Exposure to tobacco smoke at home: [ 0. -1.  1. nan 16.  2. 12. 21.  7. 40.]
----------------------------------------
Exposure to tobacco smoke outside home: [ 0. -1.  1. nan  2.  8.  5.  4.  3.  6.]
----------------------------------------
Cooked vegetable intake: [  6.   3.  -1.   2.   5.  10.   1.   4. -10.   7.]
----------------------------------------
Salad / raw vegetable intake: [  3.  -1. -10.   1.   0.   2.   5.  12.   4.   6.]
----------------------------------------
Fresh fruit intake: [  3.   2. -10.   0. 

In [389]:
# Print the mode of every feature column within each 'Disease' group
def _first_mode(values):
    """Return the first mode of `values`, or NaN when `values` has no mode."""
    candidates = values.mode()
    return np.nan if candidates.empty else candidates.iloc[0]

divider = "-" * 40
for col in phenotypes_df.columns:
    if col in ['eid', 'Disease']:
        continue  # identifier and grouping columns are not summarised
    moda = phenotypes_df.groupby('Disease')[col].agg(_first_mode)
    print(f"Moda de '{col}' por grupo de 'Disease':")
    print(moda)
    print(divider)

Moda de 'Current tobacco smoking' por grupo de 'Disease':
Disease
CD         0.0
Control    0.0
IBD        0.0
UC         0.0
Name: Current tobacco smoking, dtype: float64
----------------------------------------
Moda de 'Past tobacco smoking' por grupo de 'Disease':
Disease
CD         4.0
Control    4.0
IBD        4.0
UC         4.0
Name: Past tobacco smoking, dtype: float64
----------------------------------------
Moda de 'Smoking/smokers in household' por grupo de 'Disease':
Disease
CD         0.0
Control    0.0
IBD        0.0
UC         0.0
Name: Smoking/smokers in household, dtype: float64
----------------------------------------
Moda de 'Exposure to tobacco smoke at home' por grupo de 'Disease':
Disease
CD         0.0
Control    0.0
IBD        0.0
UC         0.0
Name: Exposure to tobacco smoke at home, dtype: float64
----------------------------------------
Moda de 'Exposure to tobacco smoke outside home' por grupo de 'Disease':
Disease
CD         0.0
Control    0.0
IBD        0.

In [364]:
# Stratified categorical imputation: within each class of the grouping column,
# missing values of a feature are replaced by the mode of that feature in the class.
def impute(df, group_col='Disease', exclude=('eid',)):
    """Fill the missing values of every feature column with its per-group mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing `group_col` and the columns to impute.
    group_col : str, default 'Disease'
        Column whose values define the groups; it is never imputed itself.
    exclude : iterable of str, default ('eid',)
        Additional columns to leave untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the missing values filled; `df` is not modified.
        A value stays missing when its group has no mode (every value of the
        group is missing) or when the row has no value in `group_col`.

    Notes
    -----
    The fill is vectorised (`Series.map` + `Series.fillna`) rather than a
    row-by-row `DataFrame.apply` per column. The result is the same for rows
    that have a group label, and a row with a missing label no longer raises
    a KeyError.
    """
    def first_mode(values):
        # Series.mode() ignores NaN; it is empty when the group has no valid value
        found = values.mode()
        return found.iloc[0] if not found.empty else np.nan

    if isinstance(exclude, str):
        exclude = (exclude,)  # a single column name, not an iterable of characters
    skipped = set(exclude) | {group_col}

    df = df.copy()
    for col in df.columns:
        if col in skipped:
            continue
        if not df[col].isna().any():
            continue  # nothing to fill in this column
        modas = df.groupby(group_col)[col].agg(first_mode)
        # Broadcast each group's mode to the rows of that group, then fill only the gaps
        fill_values = df[group_col].map(modas)
        df[col] = df[col].fillna(fill_values)
    return df


In [390]:
# Fill the remaining missing values with the per-'Disease' mode (see impute)
phenotypes_df_imputed = impute(phenotypes_df)



In [391]:
# Collapse 'Disease' into a binary 'Condition': the labels CD, UC and IBD
# become "IBD"; every other value becomes "Control"
ibd_labels = ["CD", "UC", "IBD"]
is_ibd = phenotypes_df_imputed["Disease"].isin(ibd_labels)
phenotypes_df_imputed["Condition"] = is_ibd.map({True: "IBD", False: "Control"})

# Reorder the columns: eid, Disease and Condition first, then all the others
leading = ['eid', 'Disease', "Condition"]
trailing = [col for col in phenotypes_df_imputed.columns if col not in leading]
phenotypes_df_imputed = phenotypes_df_imputed[leading + trailing]


In [392]:
# Inspect the imputed table, now carrying the Condition column
phenotypes_df_imputed

Unnamed: 0,eid,Disease,Condition,Current tobacco smoking,Past tobacco smoking,Smoking/smokers in household,Exposure to tobacco smoke at home,Exposure to tobacco smoke outside home,Cooked vegetable intake,Salad / raw vegetable intake,...,Ever smoked,Ethnic background,IPAQ activity group,Summed days activity,Summed minutes activity,At or above moderate/vigorous recommendation,At or above moderate/vigorous/walking recommendation,MET minutes per week for walking,MET minutes per week for moderate activity,MET minutes per week for vigorous activity
28,1000298,Control,Control,0.0,2.0,0.0,0.0,0.0,6.0,3.0,...,1.0,5.0,2.0,16.0,70.0,1.0,1.0,396.0,600.0,800.0
75,1000767,IBD,IBD,0.0,4.0,0.0,0.0,0.0,3.0,3.0,...,0.0,1001.0,1.0,10.0,70.0,1.0,1.0,297.0,600.0,160.0
94,1000952,Control,Control,0.0,4.0,1.0,-1.0,-1.0,-1.0,-1.0,...,0.0,3001.0,2.0,7.0,60.0,1.0,1.0,1386.0,0.0,0.0
129,1001301,Control,Control,0.0,1.0,0.0,0.0,0.0,3.0,-10.0,...,1.0,5.0,2.0,7.0,60.0,1.0,1.0,1386.0,0.0,0.0
134,1001358,Control,Control,0.0,3.0,0.0,0.0,1.0,3.0,1.0,...,1.0,1001.0,1.0,5.0,130.0,0.0,1.0,396.0,480.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502091,6021563,Control,Control,0.0,4.0,0.0,0.0,0.0,3.0,1.0,...,0.0,1001.0,2.0,7.0,60.0,1.0,1.0,1386.0,0.0,0.0
502101,6021667,Control,Control,0.0,4.0,1.0,0.0,0.0,2.0,2.0,...,0.0,1001.0,0.0,2.0,15.0,0.0,0.0,99.0,0.0,0.0
502123,6021888,Control,Control,0.0,4.0,0.0,0.0,3.0,5.0,3.0,...,0.0,1001.0,2.0,7.0,60.0,1.0,1.0,1386.0,0.0,0.0
502126,6021914,Control,Control,0.0,4.0,0.0,0.0,0.0,2.0,2.0,...,0.0,1001.0,2.0,7.0,60.0,1.0,1.0,1386.0,0.0,0.0


### Transformar
Categoricas pasarlas a one-hot encoding,

Ordinales pasarlas a numeros

Numericas limpiar


In [425]:
# Turn the coding identifiers of both lookup tables into strings and remove a
# trailing '.0', so that the two columns can be compared with each other
for _table, _column in ((phenotypes_chars_df, 'Coding'), (phenotypes_keys_df, 'coding_name')):
    _as_text = _table[_column].astype(str)
    _table[_column] = _as_text.str.replace(r'\.0$', '', regex=True)


In [426]:
# Split the imputed table into a numeric part and a categorical part.
# Columns to be treated as numeric:
numeric_cols = [
    "Exposure to tobacco smoke at home",
    "Exposure to tobacco smoke outside home",
    "Cooked vegetable intake",
    "Salad / raw vegetable intake",
    "Fresh fruit intake",
    "Dried fruit intake",
    "Bread intake",
    "Cereal intake",
    "Tea intake",
    "Water intake",
    "Average weekly red wine intake",
    "Average weekly champagne plus white wine intake",
    "Average weekly beer plus cider intake",
    "Average weekly spirits intake",
    "Average weekly fortified wine intake",
    "Summed days activity",
    "Summed minutes activity",
    "MET minutes per week for walking",
    "MET minutes per week for moderate activity",
    "MET minutes per week for vigorous activity"
]

# Only the listed columns that actually exist in the table are used
present_numeric = [col for col in numeric_cols if col in phenotypes_df_imputed.columns]

# Numeric frame: the participant id plus the numeric columns
pheno_numeric = phenotypes_df_imputed.loc[:, ['eid'] + present_numeric].copy()

# Categorical frame: everything that is not a numeric column
pheno_categoric = phenotypes_df_imputed.drop(columns=present_numeric)

In [428]:
# Work on a copy so the numeric codes in pheno_categoric stay intact while
# mapear() rewrites the columns of pheno_categoric_mapped
pheno_categoric_mapped = pheno_categoric.copy()

def mapear(nombre, df=None, chars_df=None, keys_df=None):
    """Replace the numeric codes of column `nombre` by their text meaning, in place.

    Parameters
    ----------
    nombre : str
        Exact name of the column to translate; it is looked up in the 'Field'
        column of the characteristics table.
    df : pandas.DataFrame, optional
        Frame whose column is rewritten. Defaults to the notebook-level
        `pheno_categoric_mapped`.
    chars_df : pandas.DataFrame, optional
        Table with 'Field' and 'Coding' columns. Defaults to `phenotypes_chars_df`.
    keys_df : pandas.DataFrame, optional
        Table with 'coding_name', 'code' and 'meaning' columns. Defaults to
        `phenotypes_keys_df`.

    Returns
    -------
    None
        The column is modified in place. Values with no entry in the code table
        become NaN (behaviour of `Series.map`).

    Notes
    -----
    When the field is unknown, or its coding has no entries in the key table,
    a warning is issued and the column is left unchanged. Previously these
    cases raised an IndexError or replaced the whole column with NaN.
    """
    import warnings  # local import: only needed for the skip notices below

    if df is None:
        df = pheno_categoric_mapped
    if chars_df is None:
        chars_df = phenotypes_chars_df
    if keys_df is None:
        keys_df = phenotypes_keys_df

    # 1. Find the coding identifier associated with the column
    codings = chars_df.loc[chars_df['Field'] == nombre, 'Coding']
    if codings.empty:
        warnings.warn(f"mapear: field {nombre!r} not found in the characteristics table; column left unchanged")
        return
    coding_code = codings.values[0]  # first match, as in the original lookup

    # 2. Build the code -> meaning dictionary for that coding
    mapping = keys_df.loc[keys_df['coding_name'] == coding_code, ['code', 'meaning']]
    if mapping.empty:
        warnings.warn(f"mapear: coding {coding_code!r} of field {nombre!r} has no entries in the key table; column left unchanged")
        return
    code_to_meaning = dict(zip(mapping['code'], mapping['meaning']))

    # 3. Replace the codes in the column
    df[nombre] = df[nombre].map(code_to_meaning)

In [429]:
# Translate every categorical column to text with mapear();
# the identifier and label columns are skipped
label_columns = ('eid', 'Disease', 'Condition')
for colname in pheno_categoric_mapped.columns:
    if colname in label_columns:
        continue
    mapear(colname)

#### Numericas


In [None]:
# Inspect the numeric frame and the field-characteristics table.
# NOTE(review): a notebook cell renders only its last expression, so as written
# just phenotypes_chars_df is shown; the bare pheno_numeric line displays nothing.
pheno_numeric
phenotypes_chars_df

Unnamed: 0,eid,Exposure to tobacco smoke at home,Exposure to tobacco smoke outside home,Cooked vegetable intake,Salad / raw vegetable intake,Fresh fruit intake,Dried fruit intake,Bread intake,Cereal intake,Tea intake,...,Average weekly red wine intake,Average weekly champagne plus white wine intake,Average weekly beer plus cider intake,Average weekly spirits intake,Average weekly fortified wine intake,Summed days activity,Summed minutes activity,MET minutes per week for walking,MET minutes per week for moderate activity,MET minutes per week for vigorous activity
28,1000298,0.0,0.0,6.0,3.0,3.0,1.0,10.0,7.0,2.0,...,6.0,1.0,1.0,0.0,0.0,16.0,70.0,396.0,600.0,800.0
75,1000767,0.0,0.0,3.0,3.0,2.0,0.0,10.0,4.0,1.0,...,4.0,2.0,1.0,0.0,0.0,10.0,70.0,297.0,600.0,160.0
94,1000952,-1.0,-1.0,-1.0,-1.0,-10.0,-1.0,10.0,-10.0,2.0,...,0.0,0.0,0.0,0.0,0.0,7.0,60.0,1386.0,0.0,0.0
129,1001301,0.0,0.0,3.0,-10.0,-10.0,-10.0,2.0,-10.0,1.0,...,0.0,0.0,0.0,0.0,0.0,7.0,60.0,1386.0,0.0,0.0
134,1001358,0.0,1.0,3.0,1.0,2.0,0.0,20.0,3.0,5.0,...,2.0,2.0,3.0,2.0,0.0,5.0,130.0,396.0,480.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502091,6021563,0.0,0.0,3.0,1.0,2.0,0.0,6.0,7.0,3.0,...,0.0,1.0,0.0,0.0,0.0,7.0,60.0,1386.0,0.0,0.0
502101,6021667,0.0,0.0,2.0,2.0,1.0,0.0,12.0,7.0,2.0,...,12.0,4.0,2.0,1.0,1.0,2.0,15.0,99.0,0.0,0.0
502123,6021888,0.0,3.0,5.0,3.0,3.0,1.0,1.0,7.0,0.0,...,2.0,0.0,0.0,3.0,0.0,7.0,60.0,1386.0,0.0,0.0
502126,6021914,0.0,0.0,2.0,2.0,6.0,0.0,5.0,7.0,7.0,...,6.0,9.0,1.0,4.0,0.0,7.0,60.0,1386.0,0.0,0.0


In [None]:
# Special codes found in the numeric columns:
#   -10 : less than one unit
#   -3  : prefers not to answer
#   -1  : does not know
#
# -10 is mapped to 0.5 and -1 / -3 are mapped to NaN in every column except 'eid';
# rows left with any missing value are then removed.
# NOTE(review): this runs after the mode imputation, so the rows holding -1 / -3
# are dropped here instead of being imputed — confirm this is intended.
value_columns = [col for col in pheno_numeric.columns if col != 'eid']
cleaned_values = pheno_numeric[value_columns].replace(-10, 0.5).replace([-1, -3], np.nan)
pheno_numeric[value_columns] = cleaned_values

pheno_numeric = pheno_numeric.dropna()


Unnamed: 0,eid,Exposure to tobacco smoke at home,Exposure to tobacco smoke outside home,Cooked vegetable intake,Salad / raw vegetable intake,Fresh fruit intake,Dried fruit intake,Bread intake,Cereal intake,Tea intake,...,Average weekly red wine intake,Average weekly champagne plus white wine intake,Average weekly beer plus cider intake,Average weekly spirits intake,Average weekly fortified wine intake,Summed days activity,Summed minutes activity,MET minutes per week for walking,MET minutes per week for moderate activity,MET minutes per week for vigorous activity
28,1000298,0.0,0.0,6.0,3.0,3.0,1.0,10.0,7.0,2.0,...,6.0,1.0,1.0,0.0,0.0,16.0,70.0,396.0,600.0,800.0
75,1000767,0.0,0.0,3.0,3.0,2.0,0.0,10.0,4.0,1.0,...,4.0,2.0,1.0,0.0,0.0,10.0,70.0,297.0,600.0,160.0
129,1001301,0.0,0.0,3.0,0.5,0.5,0.5,2.0,0.5,1.0,...,0.0,0.0,0.0,0.0,0.0,7.0,60.0,1386.0,0.0,0.0
134,1001358,0.0,1.0,3.0,1.0,2.0,0.0,20.0,3.0,5.0,...,2.0,2.0,3.0,2.0,0.0,5.0,130.0,396.0,480.0,80.0
165,1001662,0.0,1.0,2.0,0.0,0.0,0.0,2.0,5.0,10.0,...,0.0,0.0,9.0,0.0,0.0,8.0,145.0,891.0,200.0,400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502091,6021563,0.0,0.0,3.0,1.0,2.0,0.0,6.0,7.0,3.0,...,0.0,1.0,0.0,0.0,0.0,7.0,60.0,1386.0,0.0,0.0
502101,6021667,0.0,0.0,2.0,2.0,1.0,0.0,12.0,7.0,2.0,...,12.0,4.0,2.0,1.0,1.0,2.0,15.0,99.0,0.0,0.0
502123,6021888,0.0,3.0,5.0,3.0,3.0,1.0,1.0,7.0,0.0,...,2.0,0.0,0.0,3.0,0.0,7.0,60.0,1386.0,0.0,0.0
502126,6021914,0.0,0.0,2.0,2.0,6.0,0.0,5.0,7.0,7.0,...,6.0,9.0,1.0,4.0,0.0,7.0,60.0,1386.0,0.0,0.0


#### Categoricas

In [432]:
# Inspect the categorical frame after the codes were translated to text
pheno_categoric_mapped

Unnamed: 0,eid,Disease,Condition,Current tobacco smoking,Past tobacco smoking,Smoking/smokers in household,Oily fish intake,Non-oily fish intake,Processed meat intake,Poultry intake,...,"Medication for pain relief, constipation, heartburn",Vitamin and mineral supplements,Mineral and other dietary supplements,Smoking status,Alcohol drinker status,Ever smoked,Ethnic background,IPAQ activity group,At or above moderate/vigorous recommendation,At or above moderate/vigorous/walking recommendation
28,1000298,Control,Control,No,Smoked occasionally,No,Once a week,Once a week,Less than once a week,Once a week,...,Aspirin,Multivitamins +/- minerals,Fish oil (including cod liver oil),Previous,Current,Yes,Chinese,high,Yes,Yes
75,1000767,IBD,IBD,No,I have never smoked,No,2-4 times a week,Once a week,Once a week,2-4 times a week,...,Ibuprofen (e.g. Nurofen),None of the above,Glucosamine,Never,Current,No,British,moderate,Yes,Yes
94,1000952,Control,Control,No,I have never smoked,"Yes, one household member smokes",Do not know,Never,Never,Never,...,None of the above,None of the above,None of the above,Never,Never,No,Indian,high,Yes,Yes
129,1001301,Control,Control,No,Smoked on most or all days,No,Less than once a week,Once a week,Less than once a week,2-4 times a week,...,None of the above,None of the above,None of the above,Previous,Current,Yes,Chinese,high,Yes,Yes
134,1001358,Control,Control,No,Just tried once or twice,No,Less than once a week,Less than once a week,2-4 times a week,2-4 times a week,...,None of the above,None of the above,None of the above,Never,Current,Yes,British,moderate,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502091,6021563,Control,Control,No,I have never smoked,No,Once a week,Once a week,Once a week,Less than once a week,...,Aspirin,None of the above,None of the above,Never,Current,No,British,high,Yes,Yes
502101,6021667,Control,Control,No,I have never smoked,"Yes, one household member smokes",Less than once a week,Once a week,Less than once a week,Once a week,...,None of the above,None of the above,None of the above,Never,Current,No,British,low,No,No
502123,6021888,Control,Control,No,I have never smoked,No,Once a week,Once a week,Less than once a week,Less than once a week,...,None of the above,None of the above,Fish oil (including cod liver oil),Never,Current,No,British,high,Yes,Yes
502126,6021914,Control,Control,No,I have never smoked,No,Less than once a week,Less than once a week,Less than once a week,Once a week,...,Paracetamol,Vitamin D,Calcium,Never,Current,No,British,high,Yes,Yes


In [434]:
# Print a preview of the distinct values of every mapped column (at most 10 per column)
divider = "-" * 40
for col in pheno_categoric_mapped.columns:
    preview = pheno_categoric_mapped[col].unique()[:10]  # first 10 distinct values, in order of appearance
    print(f"{col}: {preview}")
    print(divider)

eid: [1000298 1000767 1000952 1001301 1001358 1001662 1002418 1002677 1002943
 1002990]
----------------------------------------
Disease: ['Control' 'IBD' 'CD' 'UC']
----------------------------------------
Condition: ['Control' 'IBD']
----------------------------------------
Current tobacco smoking: ['No' 'Yes, on most or all days' 'Only occasionally'
 'Prefer not to answer']
----------------------------------------
Past tobacco smoking: ['Smoked occasionally' 'I have never smoked' 'Smoked on most or all days'
 'Just tried once or twice' 'Prefer not to answer']
----------------------------------------
Smoking/smokers in household: ['No' 'Yes, one household member smokes'
 'Yes, more than one household member smokes' 'Prefer not to answer']
----------------------------------------
Oily fish intake: ['Once a week' '2-4 times a week' 'Do not know' 'Less than once a week'
 'Never' '5-6 times a week' 'Once or more daily' 'Prefer not to answer']
----------------------------------------
Non-

##### Categoricas Nominales

In [None]:
# Nominal categorical variables
pheno_nominal = [
    "eid",
    'Disease',
    'Condition',
    'Current tobacco smoking',
    'Past tobacco smoking',
    'Smoking/smokers in household',
    'Milk type used',
    'Spread type',
    'Bread type',
    'Cereal type',
    'Coffee type',
    'Alcohol usually taken with meals',
    'Never eat eggs, dairy, wheat, sugar',
    'Ethnic background',
    'Smoking status',
    'Alcohol drinker status',
    'Ever smoked',
    'Vascular/heart problems diagnosed by doctor',
    'Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor',
    'Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones',
    'Medication for pain relief, constipation, heartburn',
    'Vitamin and mineral supplements',
    'Mineral and other dietary supplements'
]

# Build the frame of nominal categorical variables.
# 'eid' is already part of pheno_nominal, so it is skipped inside the
# comprehension; otherwise the 'eid' column would be selected twice.
pheno_categorical_nominal = pheno_categoric_mapped[
    ['eid'] + [col for col in pheno_nominal if col != 'eid' and col in pheno_categoric_mapped.columns]
].copy()


##### Categoricas Ordinales
 

In [None]:
# Ordinal categorical variables
categorical_ordinal = [
    "eid",
    'Oily fish intake',
    'Non-oily fish intake',
    'Processed meat intake',
    'Poultry intake',
    'Beef intake',
    'Lamb/mutton intake',
    'Pork intake',
    'Cheese intake',
    'Salt added to food',
    'Hot drink temperature',
    'Major dietary changes in the last 5 years',
    'Variation in diet',
    'Alcohol intake frequency.',
    'Alcohol intake versus 10 years previously',
    'Overall health rating',
    'Long-standing illness, disability or infirmity',
    'Falls in the last year',
    'Weight change compared with 1 year ago',
    'Ever had bowel cancer screening',
    'Diabetes diagnosed by doctor',
    'Cancer diagnosed by doctor',
    'Fractured/broken bones in last 5 years',
    'Other serious medical condition/disability diagnosed by doctor',
    'Taking other prescription medications',
    'IPAQ activity group',
    'At or above moderate/vigorous recommendation',
    'At or above moderate/vigorous/walking recommendation'
]


# Build the frame of ordinal categorical variables.
# 'eid' is already part of categorical_ordinal, so it is skipped inside the
# comprehension; otherwise the 'eid' column would be selected twice.
pheno_categorical_ordinal = pheno_categoric_mapped[
    ['eid'] + [col for col in categorical_ordinal if col != 'eid' and col in pheno_categoric_mapped.columns]
].copy()

pheno_categorical_ordinal



## Modelado

### Lasso

### Random Forest


### XGBoost