# Install and Load Libraries/Data

In [None]:
import pandas as pd #type: ignore
import numpy as np #type: ignore
from sklearn.pipeline import Pipeline #type: ignore
from sklearn.compose import ColumnTransformer #type: ignore
from sklearn.impute import SimpleImputer #type: ignore
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder #type: ignore
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split #type: ignore
from sklearn.metrics import accuracy_score, classification_report #type: ignore
from sklearn.base import BaseEstimator, TransformerMixin #type: ignore
import joblib #type: ignore
import os
import pyarrow.parquet as pq #type: ignore
from glob import glob

In [2]:
# Paths to data folders
train_csv_path = './data/train.csv'
test_csv_path = './data/test.csv'
series_train_path = './data/series_train.parquet'
series_test_path = './data/series_test.parquet'

# Load the CSV files
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Training Imputation - Data without Parquet Information and Target Variable 

In [3]:
# Display Entire List of Missing Values
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0] 
print(missing_values.to_string()) 

CGAS-Season                               1405
CGAS-CGAS_Score                           1539
Physical-Season                            650
Physical-BMI                               938
Physical-Height                            933
Physical-Weight                            884
Physical-Waist_Circumference              3062
Physical-Diastolic_BP                     1006
Physical-HeartRate                         993
Physical-Systolic_BP                      1006
Fitness_Endurance-Season                  2652
Fitness_Endurance-Max_Stage               3217
Fitness_Endurance-Time_Mins               3220
Fitness_Endurance-Time_Sec                3220
FGC-Season                                 614
FGC-FGC_CU                                1638
FGC-FGC_CU_Zone                           1678
FGC-FGC_GSND                              2886
FGC-FGC_GSND_Zone                         2898
FGC-FGC_GSD                               2886
FGC-FGC_GSD_Zone                          2897
FGC-FGC_PU   

In [4]:
features = train_df.drop(columns=['id', 'sii'])
target = train_df['sii']

In [5]:
# Numerical Columns
numerical_cols = [
    'Basic_Demos-Age',
    'CGAS-CGAS_Score',
    'Physical-BMI',
    'Physical-Height',
    'Physical-Weight',
    'Physical-Waist_Circumference',
    'Physical-Diastolic_BP',
    'Physical-HeartRate',
    'Physical-Systolic_BP',
    'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins',
    'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU',
    'FGC-FGC_GSND',
    'FGC-FGC_GSD',
    'FGC-FGC_PU',
    'FGC-FGC_SRL',
    'FGC-FGC_SRR',
    'FGC-FGC_TL',
    'BIA-BIA_Activity_Level_num',
    'BIA-BIA_BMC',
    'BIA-BIA_BMI',
    'BIA-BIA_BMR',
    'BIA-BIA_DEE',
    'BIA-BIA_ECW',
    'BIA-BIA_FFM',
    'BIA-BIA_FFMI',
    'BIA-BIA_FMI',
    'BIA-BIA_Fat',
    'BIA-BIA_ICW',
    'BIA-BIA_LDM',
    'BIA-BIA_LST',
    'BIA-BIA_SMM',
    'BIA-BIA_TBW',
    'PAQ_A-PAQ_A_Total',
    'PAQ_C-PAQ_C_Total',
    'PCIAT-PCIAT_Total',
    'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T',
    'PCIAT-PCIAT_01',
    'PCIAT-PCIAT_02',
    'PCIAT-PCIAT_03',
    'PCIAT-PCIAT_04',
    'PCIAT-PCIAT_05',
    'PCIAT-PCIAT_06',
    'PCIAT-PCIAT_07',
    'PCIAT-PCIAT_08',
    'PCIAT-PCIAT_09',
    'PCIAT-PCIAT_10',
    'PCIAT-PCIAT_11',
    'PCIAT-PCIAT_12',
    'PCIAT-PCIAT_13',
    'PCIAT-PCIAT_14',
    'PCIAT-PCIAT_15',
    'PCIAT-PCIAT_16',
    'PCIAT-PCIAT_17',
    'PCIAT-PCIAT_18',
    'PCIAT-PCIAT_19',
    'PCIAT-PCIAT_20'
]

# Ordinal Categorical Columns
ordinal_categorical_cols = [
    'Basic_Demos-Sex',  # 0=Male, 1=Female (binary, can be treated as ordinal)
    'FGC-FGC_CU_Zone',  # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_GSND_Zone',  # 1=Weak, 2=Normal, 3=Strong
    'FGC-FGC_GSD_Zone',   # 1=Weak, 2=Normal, 3=Strong
    'FGC-FGC_PU_Zone',    # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_SRL_Zone',   # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_SRR_Zone',   # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_TL_Zone',    # 0=Needs Improvement, 1=Healthy Fitness Zone
    'BIA-BIA_Activity_Level_num',  # 1=Very Light, 2=Light, 3=Moderate, 4=Heavy, 5=Exceptional
    'BIA-BIA_Frame_num',           # 1=Small, 2=Medium, 3=Large
    'PreInt_EduHx-computerinternet_hoursday'  # 0=Less than 1h/day, 1=Around 1h/day, 2=Around 2hs/day, 3=More than 3hs/day
]

# Nominal Categorical Columns
nominal_categorical_cols = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'PCIAT-Season',
    'SDS-Season',
    'PreInt_EduHx-Season'
]


In [6]:
# Custom transformer to select columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]

In [7]:
# Numerical Pipeline
numerical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=numerical_cols)),
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())                  # Scale features
])

# Ordinal Categorical Pipeline
ordinal_categorical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=ordinal_categorical_cols)),
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))  # Handle unknown categories
])

# Nominal Categorical Pipeline
nominal_categorical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=nominal_categorical_cols)),
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    ('encoder', OneHotEncoder(handle_unknown='ignore'))     # Handle unknown categories
])

# Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('ord_cat', ordinal_categorical_pipeline, ordinal_categorical_cols),
    ('nom_cat', nominal_categorical_pipeline, nominal_categorical_cols)
])

In [8]:
# Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('ord_cat', ordinal_categorical_pipeline, ordinal_categorical_cols),
    ('nom_cat', nominal_categorical_pipeline, nominal_categorical_cols)
])

In [9]:
# Split the data into known and missing 'sii'
train_known = train_df[train_df['sii'].notnull()].copy()
train_missing = train_df[train_df['sii'].isnull()].copy()

# Features and Target for Known 'sii'
X_known = train_known.drop(columns=['id', 'sii'])
y_known = train_known['sii']

# Features for Missing 'sii'
X_missing = train_missing.drop(columns=['id', 'sii'])

In [10]:
# Split known data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_known, y_known, test_size=0.8, random_state=22, stratify=y_known
)

In [11]:
# Create a preprocessing pipeline
feature_preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the training split
X_train_processed = feature_preprocessing_pipeline.fit_transform(X_train_split)

# Transform the validation split
X_val_processed = feature_preprocessing_pipeline.transform(X_val_split)

In [12]:
# Train the Random Forest classifier on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=22, class_weight='balanced')
rf.fit(X_train_processed, y_train_split)

# Predict on the validation split
y_val_pred = rf.predict(X_val_processed)

# Evaluate performance
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred))

Validation Accuracy: 0.9762448606669712
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      1275
         1.0       0.97      0.98      0.98       584
         2.0       0.92      0.94      0.93       303
         3.0       1.00      0.15      0.26        27

    accuracy                           0.98      2189
   macro avg       0.97      0.77      0.79      2189
weighted avg       0.98      0.98      0.97      2189



In [13]:
# Fit the preprocessing pipeline on the entire known data
X_known_full = X_known
y_known_full = y_known

X_known_full_processed = feature_preprocessing_pipeline.fit_transform(X_known_full)

In [14]:
# Retrain the Random Forest classifier on the entire known data
rf_full = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_full.fit(X_known_full_processed, y_known_full)

In [15]:
# Transform the missing features using the fitted pipeline
X_missing_processed = feature_preprocessing_pipeline.transform(X_missing)

In [16]:
# Predict 'sii' for missing data
sii_pred = rf_full.predict(X_missing_processed)

# Assign predictions to missing 'sii'
train_missing['sii'] = sii_pred

In [17]:
# Combine the known and imputed data
train_df_imputed = pd.concat([train_known, train_missing], ignore_index=True)

In [18]:
# Create imputers
num_imputer = SimpleImputer(strategy='mean')
ord_cat_imputer = SimpleImputer(strategy='most_frequent')
nom_cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit imputers on known data
num_imputer.fit(train_known[numerical_cols])
ord_cat_imputer.fit(train_known[ordinal_categorical_cols])
nom_cat_imputer.fit(train_known[nominal_categorical_cols])

# Impute numerical columns
train_df_imputed[numerical_cols] = num_imputer.transform(train_df_imputed[numerical_cols])

# Impute ordinal categorical columns
train_df_imputed[ordinal_categorical_cols] = ord_cat_imputer.transform(train_df_imputed[ordinal_categorical_cols])

# Impute nominal categorical columns
train_df_imputed[nominal_categorical_cols] = nom_cat_imputer.transform(train_df_imputed[nominal_categorical_cols])

In [27]:
train_df_imputed.to_csv('./data/train_imputed.csv', index=False)

In [19]:
# Verify that there are no missing 'sii' values
print("Missing 'sii' after imputation:", train_df_imputed['sii'].isnull().sum())

# Display Entire List of Missing Values
missing_values = train_df_imputed.isnull().sum()
missing_values = missing_values[missing_values > 0] 
print(missing_values.to_string()) 

Missing 'sii' after imputation: 0
Series([], )


In [20]:
# Save the preprocessing pipeline
joblib.dump(feature_preprocessing_pipeline, 'feature_preprocessing_pipeline.pkl')

# Save the trained RF classifier
joblib.dump(rf_full, 'RF_classifier.pkl')

['RF_classifier.pkl']

# Joining CSV and Parquet Files

In [21]:
# Aggregation function for actigraphy data

def aggregate_actigraphy_data(series_path):
    participant_data = []
    
    for participant_folder in glob(os.path.join(series_path, 'id=*')):
        participant_id = os.path.basename(participant_folder).split('=')[1]
        parquet_file_path = os.path.join(participant_folder, 'part-0.parquet')
        
        if os.path.isfile(parquet_file_path):
            # Load the parquet file
            df = pq.read_table(parquet_file_path).to_pandas()
            
            # Aggregate metrics for each column
            agg_data = {
                'id': participant_id,
                
                # X, Y, Z columns: mean, std, min, max
                'mean_x': df['X'].mean(),
                'std_x': df['X'].std(),
                'min_x': df['X'].min(),
                'max_x': df['X'].max(),
                
                'mean_y': df['Y'].mean(),
                'std_y': df['Y'].std(),
                'min_y': df['Y'].min(),
                'max_y': df['Y'].max(),
                
                'mean_z': df['Z'].mean(),
                'std_z': df['Z'].std(),
                'min_z': df['Z'].min(),
                'max_z': df['Z'].max(),
                
                # ENMO and Angle-Z columns: mean, std, min, max
                'mean_enmo': df['enmo'].mean(),
                'std_enmo': df['enmo'].std(),
                'min_enmo': df['enmo'].min(),
                'max_enmo': df['enmo'].max(),
                
                'mean_anglez': df['anglez'].mean(),
                'std_anglez': df['anglez'].std(),
                'min_anglez': df['anglez'].min(),
                'max_anglez': df['anglez'].max(),
                
                # Non-wear flag: ratio of non-wear time
                'non_wear_ratio': df['non-wear_flag'].mean(),
                
                # Light and Battery Voltage columns: mean, std, min, max
                'mean_light': df['light'].mean(),
                'std_light': df['light'].std(),
                'min_light': df['light'].min(),
                'max_light': df['light'].max(),
                
                'mean_battery_voltage': df['battery_voltage'].mean(),
                'std_battery_voltage': df['battery_voltage'].std(),
                'min_battery_voltage': df['battery_voltage'].min(),
                'max_battery_voltage': df['battery_voltage'].max(),
                
                # Time of Day, Weekday, Quarter: mode (most frequent)
                'mode_time_of_day': df['time_of_day'].mode()[0] if not df['time_of_day'].mode().empty else np.nan,
                'mode_weekday': df['weekday'].mode()[0] if not df['weekday'].mode().empty else np.nan,
                'mode_quarter': df['quarter'].mode()[0] if not df['quarter'].mode().empty else np.nan,
                
                # Relative date to PCIAT: mean, min, max
                'mean_relative_date_PCIAT': df['relative_date_PCIAT'].mean(),
                'min_relative_date_PCIAT': df['relative_date_PCIAT'].min(),
                'max_relative_date_PCIAT': df['relative_date_PCIAT'].max()
            }
            
            # Append the aggregated data for this participant
            participant_data.append(agg_data)
    
    # Convert list of dicts to DataFrame
    return pd.DataFrame(participant_data)

# Aggregate actigraphy data for training and testing
aggregated_train_actigraphy = aggregate_actigraphy_data(series_train_path)
aggregated_test_actigraphy = aggregate_actigraphy_data(series_test_path)

# Merge with the imputed training data
merged_train_df = pd.merge(train_df_imputed, aggregated_train_actigraphy, on='id', how='left')

# Process and Merge Test Data

In [22]:
# Ensure all required columns are present in test_df
required_cols = numerical_cols + ordinal_categorical_cols + nominal_categorical_cols
missing_cols = set(required_cols) - set(test_df.columns)

for col in missing_cols:
    test_df[col] = np.nan

In [23]:
# Transform the test features using the fitted pipeline
X_test = test_df.drop(columns=['id'])
X_test_processed = feature_preprocessing_pipeline.transform(X_test)

# Retrieve feature names after transformation
preprocessor_named_steps = feature_preprocessing_pipeline.named_steps['preprocessor']

# Get numerical features after scaling
num_features = numerical_cols

# Get ordinal categorical features after encoding
ord_cat_features = ordinal_categorical_cols

# Get nominal categorical features after one-hot encoding
# To get the OneHotEncoder feature names:
nom_cat_pipeline = preprocessor_named_steps.named_transformers_['nom_cat']
nom_cat_features = nom_cat_pipeline.named_steps['encoder'].get_feature_names_out(nominal_categorical_cols)

# Combine all feature names
all_features = list(num_features) + list(ord_cat_features) + list(nom_cat_features)

# Create the processed test DataFrame
processed_test_df = pd.DataFrame(X_test_processed, columns=all_features)

# Add 'id' back to the processed test DataFrame
processed_test_df['id'] = test_df['id'].values

# Merge with aggregated test actigraphy data
merged_test_df = pd.merge(processed_test_df, aggregated_test_actigraphy, on='id', how='left')

In [24]:
merged_train_df.to_csv('./data/merged_train.csv', index=False)
merged_test_df.to_csv('./data/merged_test.csv', index=False)

In [25]:
# Display the first few rows of the merged training data
merged_train_df.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,mean_battery_voltage,std_battery_voltage,min_battery_voltage,max_battery_voltage,mode_time_of_day,mode_weekday,mode_quarter,mean_relative_date_PCIAT,min_relative_date_PCIAT,max_relative_date_PCIAT
0,00008ff9,Fall,5.0,0.0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,,,
1,000fd460,Summer,9.0,0.0,Spring,65.159266,Fall,14.03559,48.0,46.0,...,,,,,,,,,,
2,00105258,Summer,10.0,1.0,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,,,,,,,
3,00115b9f,Winter,9.0,0.0,Fall,71.0,Summer,18.292347,56.0,81.6,...,4053.579102,112.404045,3824.0,4188.5,58915000000000.0,5.0,3.0,53.201683,41.0,85.0
4,001f3379,Spring,13.0,1.0,Winter,50.0,Summer,22.279952,59.5,112.2,...,3838.189453,155.573868,3098.166748,4175.0,0.0,3.0,3.0,79.435593,68.0,91.0


In [26]:
# Display the first few rows of the merged test data
merged_test_df.head()

Unnamed: 0,Basic_Demos-Age,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,...,mean_battery_voltage,std_battery_voltage,min_battery_voltage,max_battery_voltage,mode_time_of_day,mode_weekday,mode_quarter,mean_relative_date_PCIAT,min_relative_date_PCIAT,max_relative_date_PCIAT
0,-1.528487,-1.296014,-0.476635,-1.39205,-0.88136,1.618361e-15,0.0,-1.082896e-15,-8.672395e-16,8.526156e-16,...,,,,,,,,,,
1,-0.361407,0.0,-1.07906,-1.110744,-0.995576,-2.106935,0.39973,-0.9009683,0.2973259,8.526156e-16,...,,,,,,,,,,
2,-0.069637,0.534609,-0.5251,0.08480655,-0.291242,1.618361e-15,-0.362392,0.9278811,-0.007806868,-0.001313215,...,,,,,,,,,,
3,-0.361407,0.534609,-0.176658,0.01448003,-0.148472,1.618361e-15,-0.743453,1.156487,-0.007806868,0.9586466,...,4053.579102,112.404045,3824.0,4188.5,58915000000000.0,5.0,3.0,53.201683,41.0,85.0
4,2.264525,0.0,0.0,-9.993998e-16,0.0,1.618361e-15,0.0,-1.082896e-15,-8.672395e-16,8.526156e-16,...,,,,,,,,,,
