# Importación de librerías

In [None]:
# Data-handling and plotting libraries.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Colab helper used to mount Google Drive; the dataset and output paths used
# further down all live under this mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


# Carga del dataset

In [None]:
# --- 2. Load the dataset from Google Drive ---
import os

import pandas as pd

# Folder on the mounted Drive that contains the processed data files.
path = "/content/drive/MyDrive/TFG/DATASETS/NSCH/datos procesados/"

# Full path of the CSV to load; the file is parsed with ';' as field separator.
file_path = os.path.join(path, "data_processed-vars_2.csv")
df = pd.read_csv(file_path, sep=";")

# Resumen del dataset

In [None]:
# Number of rows and columns
n_filas, n_columnas = df.shape
print(f"Número de filas: {n_filas}")
print(f"Número de columnas: {n_columnas}")

# Age range of the participants
if "sc_age_years" in df.columns:
    edad_min = df["sc_age_years"].min()
    edad_max = df["sc_age_years"].max()
    print(f"Rango de edad de los participantes: {edad_min} - {edad_max} años")
else:
    # Report the column name that was actually looked up, so the message can
    # be matched against the dataset header.
    print("La columna 'sc_age_years' no se encuentra en el dataset.")

# ADHD diagnosis (count and percentage per answer)
if "k2q31a" in df.columns:
    conteo_diagnostico = df["k2q31a"].value_counts()
    porcentaje_diagnostico = df["k2q31a"].value_counts(normalize=True) * 100

    print("\nDistribución de diagnóstico:")
    # Both series share the same index (the distinct answers), so each answer
    # can be used to look up its count and its percentage.
    for valor in conteo_diagnostico.index:
        print(f"{valor}: {conteo_diagnostico[valor]} casos ({porcentaje_diagnostico[valor]:.2f}%)")
else:
    # Same as above: name the real column instead of a generic alias.
    print("La columna 'k2q31a' no se encuentra en el dataset.")

Número de filas: 55162
Número de columnas: 82
Rango de edad de los participantes: 0 - 17 años

Distribución de diagnóstico:
No: 49101 casos (89.01%)
Yes: 5648 casos (10.24%)
No valid response: 413 casos (0.75%)


# División del dataset en train y test

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is reproducible across runs.
RANDOM_STATE = 42

# Hold out 20% of the rows as test; the remaining 80% become train.
# NOTE(review): the split is not stratified on the target column — confirm the
# resulting class proportions are acceptable for the downstream notebooks.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Guardado de los datasets

In [None]:
# Destination files on Google Drive for each split.
processed_path_train = '/content/drive/MyDrive/Colab Notebooks/Proyecto-TFG/data/train.csv'
processed_path_test = '/content/drive/MyDrive/Colab Notebooks/Proyecto-TFG/data/test.csv'

# --- 8. Save the train and test splits to Google Drive for the next notebook ---
# index=False keeps the DataFrame row index out of the CSV files.
train_df.to_csv(processed_path_train, index=False)
test_df.to_csv(processed_path_test, index=False)
print(f"✅ Dataset train guardado en: {processed_path_train}")
# Fixed copy-paste slip: this line reports the test file, not the train file.
print(f"✅ Dataset test guardado en: {processed_path_test}")

✅ Dataset train guardado en: /content/drive/MyDrive/Colab Notebooks/Proyecto-TFG/data/train.csv
✅ Dataset train guardado en: /content/drive/MyDrive/Colab Notebooks/Proyecto-TFG/data/test.csv


In [None]:
# Compare the distribution of the target variable (k2q31a) in the original
# dataset and in each split, to check that the split kept similar proportions.
if 'k2q31a' in df.columns:
    # One report per dataset: relative frequencies, absolute counts, blank line.
    for nombre, datos in (("original", df), ("train", train_df), ("test", test_df)):
        print(f"Distribución por var.objetivo (k2q31a) en {nombre}:")
        print(datos['k2q31a'].value_counts(normalize=True))
        print(datos['k2q31a'].value_counts())
        print()



Distribución por var.objetivo (k2q31a) en original:
k2q31a
No                   0.890124
Yes                  0.102389
No valid response    0.007487
Name: proportion, dtype: float64
k2q31a
No                   49101
Yes                   5648
No valid response      413
Name: count, dtype: int64

Distribución por var.objetivo (k2q31a) en train:
k2q31a
No                   0.890435
Yes                  0.102404
No valid response    0.007161
Name: proportion, dtype: float64
k2q31a
No                   39294
Yes                   4519
No valid response      316
Name: count, dtype: int64

Distribución por var.objetivo (k2q31a) en test:
k2q31a
No                   0.888879
Yes                  0.102329
No valid response    0.008792
Name: proportion, dtype: float64
k2q31a
No                   9807
Yes                  1129
No valid response      97
Name: count, dtype: int64



In [None]:
# 5. Basic checks on the train split
print("\nResumen de datos en TRAIN:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
train_df.info()
# Summary statistics for every column, numeric and non-numeric alike.
display(train_df.describe(include='all'))


Resumen de datos en TRAIN:
<class 'pandas.core.frame.DataFrame'>
Index: 44129 entries, 28031 to 15795
Data columns (total 82 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   hhlanguage      44129 non-null  object
 1   sc_age_years    44129 non-null  int64 
 2   sc_sex          44129 non-null  object
 3   birth_yr        44129 non-null  object
 4   a1_age          44129 non-null  object
 5   memorycond      44129 non-null  object
 6   allergies       44129 non-null  object
 7   k2q40a          44129 non-null  object
 8   autoimmune      44129 non-null  object
 9   headache        44129 non-null  object
 10  k2q33a          44129 non-null  object
 11  k2q33b          44129 non-null  object
 12  k2q32a          44129 non-null  object
 13  k2q32b          44129 non-null  object
 14  k2q34a          44129 non-null  object
 15  k2q34b          44129 non-null  object
 16  k2q31a          44129 non-null  object
 17  k2q31b          44129 n

None

Unnamed: 0,hhlanguage,sc_age_years,sc_sex,birth_yr,a1_age,memorycond,allergies,k2q40a,autoimmune,headache,...,sc_k2q22,agepos4,sc_racer,family_r,birthwt_l,birthwt,birth_yr_f,higrade,fpl_i1,k2q40a.1
count,44129,44129.0,44129,44129.0,44129.0,44129,44129,44129,44129,44129,...,42120,42120,42120,42120,42120,42120,42120,42120,42120,44129
unique,4,,2,20.0,54.0,4,3,3,3,3,...,3,5,3,9,3,4,4,3,350,3
top,English,,Male,2021.0,38.0,No,No,No,No,No,...,No,Only child,White alone,"Two biogical/adoptive parents, currently married",No,Not low birth weight,No data quality concerns,More than high school,400 or more,No
freq,40157,,22629,3682.0,2113.0,23178,31286,39635,43488,42586,...,36462,16366,32224,28596,36947,36947,40400,37043,18453,39635
mean,,8.341544,,,,,,,,,...,,,,,,,,,,
std,,5.313594,,,,,,,,,...,,,,,,,,,,
min,,0.0,,,,,,,,,...,,,,,,,,,,
25%,,4.0,,,,,,,,,...,,,,,,,,,,
50%,,8.0,,,,,,,,,...,,,,,,,,,,
75%,,13.0,,,,,,,,,...,,,,,,,,,,
