In [None]:
# STEP 0: Download and access

#colab + drive
from google.colab import drive
drive.mount('/content/drive')

#Download from Kaggle
from google.colab import files
files.upload()  # Upload the kaggle.json here

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c widsdatathon2025
!unzip -q widsdatathon2025.zip -d /content/drive/MyDrive/WiDS/wids_data


In [6]:
# 📦 STEP 1: IMPORTS
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

In [15]:
# 📁 STEP 2: LOAD RAW DATA
# Both working folders sit under the mounted Google Drive root.
drive_root = "/content/drive/MyDrive"
base_path_data = f"{drive_root}/WiDS/wids_data"  # where the competition archive was extracted
base_path_dir = f"{drive_root}/WiDS Datathon"    # where the prepared arrays and transformers are saved

In [8]:
# Locate the individual training files
# All four inputs live in the TRAIN_NEW folder of the extracted data.
train_dir = f"{base_path_data}/TRAIN_NEW"
labels_path = f"{train_dir}/TRAINING_SOLUTIONS.xlsx"
cat_path = f"{train_dir}/TRAIN_CATEGORICAL_METADATA_new.xlsx"
quant_path = f"{train_dir}/TRAIN_QUANTITATIVE_METADATA_new.xlsx"
connectome_path = f"{train_dir}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv"

In [9]:
# Read files
# The labels and the two metadata tables are Excel workbooks; the
# connectome matrices come as a CSV. Read order matches the path order.
labels, df_cat, df_quant = (
    pd.read_excel(path) for path in (labels_path, cat_path, quant_path)
)
df_connectome = pd.read_csv(connectome_path)

In [10]:
# 🔗 STEP 3: MERGE
# Join the four tables on participant_id, starting from the labels.
# merge() defaults to an inner join, so only participants present in
# every table survive into `full`.
full = (
    labels
    .merge(df_cat, on='participant_id')
    .merge(df_quant, on='participant_id')
    .merge(df_connectome, on='participant_id')
)

In [11]:
# 🧠 STEP 4: FEATURE SPLIT
# Partition the merged frame into three pieces:
#   X_brain     - columns whose name contains 'throw'
#                 (presumably the connectome matrix entries; confirm against the CSV header)
#   X_non_brain - every remaining column except the targets and participant_id
#   y           - the two target columns
brain_columns = [col for col in full.columns if 'throw' in col]
target_cols = ['ADHD_Outcome', 'Sex_F']

# Build the exclusion set once. Concatenating the lists inside the
# comprehension rebuilt and linearly scanned them for every column, which
# is quadratic in the number of columns. The resulting list and its
# order are unchanged.
excluded_columns = set(brain_columns) | set(target_cols) | {'participant_id'}
non_brain_columns = [col for col in full.columns if col not in excluded_columns]

X_brain = full[brain_columns]
X_non_brain = full[non_brain_columns]
y = full[target_cols]

In [13]:
# ⚙️ STEP 5: PREPROCESSING PIPELINE
# Group the non-brain features by dtype. Columns whose dtype is in neither
# list are not passed to any transformer and are therefore dropped by the
# ColumnTransformer (its default remainder is 'drop').
numerical_cols = list(X_non_brain.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X_non_brain.select_dtypes(include=['object', 'category']).columns)

# Numeric columns: fill gaps with the column median (no scaling is applied).
numeric_imputer = SimpleImputer(strategy='median')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numeric_imputer, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Process non-brain features
X_non_brain_processed = preprocessor.fit_transform(X_non_brain)

# Standardise the brain features; the fitted scaler is kept so the same
# transformation can be reapplied later.
brain_scaler = StandardScaler().fit(X_brain)
X_brain_scaled = brain_scaler.transform(X_brain)

In [16]:
# 🧪 STEP 6: SAVE ASSETS
# Persist the processed feature matrices, the targets and the fitted
# transformers to base_path_dir.

# The output folder is not created anywhere else in the notebook, and
# writing into a missing directory raises FileNotFoundError, so create it
# (and any missing parents) up front. exist_ok makes re-runs harmless.
os.makedirs(base_path_dir, exist_ok=True)

# Save X, y, and fitted transformers
np.save(base_path_dir + "/X_brain_scaled.npy", X_brain_scaled)
np.save(base_path_dir + "/X_non_brain_processed.npy", X_non_brain_processed)
np.save(base_path_dir + "/y.npy", y.to_numpy())

joblib.dump(preprocessor, base_path_dir + "/final_preprocessor.pkl")
joblib.dump(brain_scaler, base_path_dir + "/brain_scaler.pkl")

print("✅ Data preparation complete and saved.")

✅ Data preparation complete and saved.
