# Install and Load Libraries/Data

In [1]:
import pandas as pd #type: ignore
import numpy as np #type: ignore
from sklearn.pipeline import Pipeline #type: ignore
from sklearn.compose import ColumnTransformer #type: ignore
from sklearn.impute import SimpleImputer, KNNImputer #type: ignore
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder #type: ignore
from sklearn.ensemble import RandomForestClassifier #type: ignore
from sklearn.model_selection import train_test_split #type: ignore
from sklearn.metrics import accuracy_score, classification_report #type: ignore
from sklearn.base import BaseEstimator, TransformerMixin #type: ignore
import joblib #type: ignore
import os
import pyarrow.parquet as pq #type: ignore
from glob import glob

In [2]:
# Paths to data folders
train_csv_path = './data/train.csv'
test_csv_path = './data/test.csv'
series_train_path = './data/series_train.parquet'
series_test_path = './data/series_test.parquet'

# Load the CSV files
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [3]:
# Drop rows where 'sii' is NaN
train_df = train_df[train_df['sii'].notna()].copy()

# Imputing Missing Values for Non-Parquet Training Data

In [4]:
# Display Entire List of Missing Values
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0] 
print(missing_values.to_string()) 

CGAS-Season                                394
CGAS-CGAS_Score                            394
Physical-Season                            141
Physical-BMI                               209
Physical-Height                            206
Physical-Weight                            164
Physical-Waist_Circumference              2253
Physical-Diastolic_BP                      258
Physical-HeartRate                         250
Physical-Systolic_BP                       258
Fitness_Endurance-Season                  1476
Fitness_Endurance-Max_Stage               2005
Fitness_Endurance-Time_Mins               2008
Fitness_Endurance-Time_Sec                2008
FGC-Season                                  89
FGC-FGC_CU                                 817
FGC-FGC_CU_Zone                            852
FGC-FGC_GSND                              1864
FGC-FGC_GSND_Zone                         1872
FGC-FGC_GSD                               1865
FGC-FGC_GSD_Zone                          1872
FGC-FGC_PU   

In [5]:
features = train_df.drop(columns=['id', 'sii'])
target = train_df['sii']

In [6]:
# Numerical Columns
numerical_cols = [
    'Basic_Demos-Age',
    'CGAS-CGAS_Score',
    'Physical-BMI',
    'Physical-Height',
    'Physical-Weight',
    'Physical-Waist_Circumference',
    'Physical-Diastolic_BP',
    'Physical-HeartRate',
    'Physical-Systolic_BP',
    'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins',
    'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU',
    'FGC-FGC_GSND',
    'FGC-FGC_GSD',
    'FGC-FGC_PU',
    'FGC-FGC_SRL',
    'FGC-FGC_SRR',
    'FGC-FGC_TL',
    'BIA-BIA_BMC',
    'BIA-BIA_BMI',
    'BIA-BIA_BMR',
    'BIA-BIA_DEE',
    'BIA-BIA_ECW',
    'BIA-BIA_FFM',
    'BIA-BIA_FFMI',
    'BIA-BIA_FMI',
    'BIA-BIA_Fat',
    'BIA-BIA_ICW',
    'BIA-BIA_LDM',
    'BIA-BIA_LST',
    'BIA-BIA_SMM',
    'BIA-BIA_TBW',
    'PAQ_A-PAQ_A_Total',
    'PAQ_C-PAQ_C_Total',
    'PCIAT-PCIAT_Total',
    'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T',
    'PCIAT-PCIAT_01',
    'PCIAT-PCIAT_02',
    'PCIAT-PCIAT_03',
    'PCIAT-PCIAT_04',
    'PCIAT-PCIAT_05',
    'PCIAT-PCIAT_06',
    'PCIAT-PCIAT_07',
    'PCIAT-PCIAT_08',
    'PCIAT-PCIAT_09',
    'PCIAT-PCIAT_10',
    'PCIAT-PCIAT_11',
    'PCIAT-PCIAT_12',
    'PCIAT-PCIAT_13',
    'PCIAT-PCIAT_14',
    'PCIAT-PCIAT_15',
    'PCIAT-PCIAT_16',
    'PCIAT-PCIAT_17',
    'PCIAT-PCIAT_18',
    'PCIAT-PCIAT_19',
    'PCIAT-PCIAT_20'
]

# Ordinal Categorical Columns
ordinal_categorical_cols = [
    'Basic_Demos-Sex',  # 0=Male, 1=Female (binary, can be treated as ordinal)
    'FGC-FGC_CU_Zone',  # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_GSND_Zone',  # 1=Weak, 2=Normal, 3=Strong
    'FGC-FGC_GSD_Zone',   # 1=Weak, 2=Normal, 3=Strong
    'FGC-FGC_PU_Zone',    # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_SRL_Zone',   # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_SRR_Zone',   # 0=Needs Improvement, 1=Healthy Fitness Zone
    'FGC-FGC_TL_Zone',    # 0=Needs Improvement, 1=Healthy Fitness Zone
    'BIA-BIA_Activity_Level_num',  # 1=Very Light, 2=Light, 3=Moderate, 4=Heavy, 5=Exceptional
    'BIA-BIA_Frame_num',           # 1=Small, 2=Medium, 3=Large
    'PreInt_EduHx-computerinternet_hoursday'  # 0=Less than 1h/day, 1=Around 1h/day, 2=Around 2hs/day, 3=More than 3hs/day
]

# Nominal Categorical Columns
nominal_categorical_cols = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'PCIAT-Season',
    'SDS-Season',
    'PreInt_EduHx-Season'
]


In [7]:
# Custom transformer to select columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]

In [8]:
# Numerical Pipeline
numerical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=numerical_cols)),
    ('imputer', KNNImputer(n_neighbors=7, weights='uniform')),
    ('scaler', StandardScaler())
])

# Ordinal Categorical Pipeline
ordinal_categorical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=ordinal_categorical_cols)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal Categorical Pipeline
nominal_categorical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=nominal_categorical_cols)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)) # set drop='first' to avoid dummy variable trap for LR and Skorch NN Regressor
])

# Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('ord_cat', ordinal_categorical_pipeline, ordinal_categorical_cols),
    ('nom_cat', nominal_categorical_pipeline, nominal_categorical_cols)
])

In [9]:
# Split the data into known and missing 'sii' - Comment out if running code with dropped missing sii
#train_known = train_df[train_df['sii'].notnull()].copy()
#train_missing = train_df[train_df['sii'].isnull()].copy()

# Features and Target for Known 'sii'
#X_known = train_known.drop(columns=['id', 'sii'])
#y_known = train_known['sii']
#X_missing = train_missing.drop(columns=['id', 'sii'])

In [10]:
# Split known data into training and validation sets - Comment out if running code with dropped missing sii
#X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
#    X_known, y_known, test_size=0.8, random_state=22, stratify=y_known
#)

In [11]:
# Separate features and target - Uncomment if running code with dropped missing sii
X = train_df.drop(columns=['id', 'sii'])
y = train_df['sii'].astype(np.int64) 

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X, y, test_size=0.2, random_state=22, stratify=y
)

In [12]:
# Create preprocessing pipeline
feature_preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the training split
X_train_processed = feature_preprocessing_pipeline.fit_transform(X_train_split)

# Transform the validation split
X_val_processed = feature_preprocessing_pipeline.transform(X_val_split)

In [13]:
# Train the Random Forest classifier on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=22, class_weight='balanced')
rf.fit(X_train_processed, y_train_split)

# Predict on the validation split
y_val_pred = rf.predict(X_val_processed)

# Evaluate performance
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred))

Validation Accuracy: 0.9927007299270073
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       319
           1       1.00      0.99      1.00       146
           2       0.96      1.00      0.98        76
           3       1.00      0.57      0.73         7

    accuracy                           0.99       548
   macro avg       0.99      0.89      0.93       548
weighted avg       0.99      0.99      0.99       548



In [14]:
# Fit the preprocessing pipeline on the entire known data - Comment out if running code with dropped missing sii
#X_known_full_processed = feature_preprocessing_pipeline.fit_transform(X_known)

In [15]:
# Fit the preprocessing pipeline on the entire data - Uncomment if running code with dropped missing sii
X_full_processed = feature_preprocessing_pipeline.fit_transform(X)

In [16]:
# Retrain the Random Forest classifier on the entire known data - Comment out if running code with dropped missing sii
#rf_full = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
#rf_full.fit(X_known_full_processed, y_known)

In [17]:
# Retrain the Random Forest classifier on the entire data - Uncomment if running code with dropped missing sii
rf_full = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_full.fit(X_full_processed, y)

In [18]:
# Predict on missing data # Comment out if running code with dropped missing sii
#X_missing_processed = feature_preprocessing_pipeline.transform(X_missing)
#sii_pred = rf_full.predict(X_missing_processed)
#train_missing['sii'] = sii_pred

In [19]:
# Combine the known and imputed data # Comment out if running code with dropped missing sii
#train_df_imputed = pd.concat([train_known, train_missing], ignore_index=True)

In [20]:
# Verify that there are no missing 'sii' values # Comment out if running code with dropped missing sii
#print("Missing 'sii' after imputation:", train_df_imputed['sii'].isnull().sum())

In [21]:
# Export the imputed dataset
#train_df_imputed.to_csv('./data/train_imputed_knn.csv', index=False)
#print("Imputed dataset exported successfully.")

In [22]:
# Separate features and target from the imputed training data - Comment out if running code with dropped missing sii
#X_train_full = train_df_imputed.drop(columns=['id', 'sii'])
#y_train_full = train_df_imputed['sii'].astype(np.int64)

# Use the same feature preprocessing pipeline to transform the training data 
# Since the pipeline was fitted on X_known_full earlier, we can use it directly - Comment out if running code with dropped missing sii
#X_train_full_processed = feature_preprocessing_pipeline.transform(X_train_full)

# Retrieve feature names after transformation
preprocessor_named_steps = feature_preprocessing_pipeline.named_steps['preprocessor']

# Get numerical features
num_features = numerical_cols

# Get ordinal categorical features
ord_cat_features = ordinal_categorical_cols

# Get nominal categorical features after one-hot encoding
nom_cat_pipeline = preprocessor_named_steps.named_transformers_['nom_cat']
nom_cat_features = nom_cat_pipeline.named_steps['encoder'].get_feature_names_out(nominal_categorical_cols)

# Combine all feature names #
all_features = list(num_features) + list(ord_cat_features) + list(nom_cat_features)

# Create the processed training DataFrame - Comment out if running code with dropped missing sii
#processed_train_df = pd.DataFrame(X_train_full_processed, columns=all_features)
# Uncomment if running code with dropped missing sii
processed_train_df = pd.DataFrame(X_full_processed, columns=all_features)

# Add 'sii' and 'id' back to the processed training DataFrame - Comment if running code with dropped missing sii
#processed_train_df['sii'] = y_train_full.values
#processed_train_df['id'] = train_df_imputed['id'].values
# Uncomment if running code with dropped missing sii
processed_train_df['sii'] = y.values
processed_train_df['id'] = train_df['id'].values

In [23]:
# Combine the known and imputed data
processed_train_df.to_csv('./data/processed_train_df_drop_one_no_target.csv', index=False)

In [24]:
# Save the preprocessing pipeline
#joblib.dump(feature_preprocessing_pipeline, 'feature_preprocessing_pipeline_non_parquet.pkl')

# Save the trained RF classifier
#joblib.dump(rf_full, 'RF_classifier_sii_imputation.pkl')

# Joining CSV with Parquet Files and Imputing Missing Values for Parquet Training Data

In [116]:
# Aggregation function for actigraphy data

def aggregate_actigraphy_data(series_path):
    participant_data = []
    
    for participant_folder in glob(os.path.join(series_path, 'id=*')):
        participant_id = os.path.basename(participant_folder).split('=')[1]
        parquet_file_path = os.path.join(participant_folder, 'part-0.parquet')
        
        if os.path.isfile(parquet_file_path):
            # Load the parquet file
            df = pq.read_table(parquet_file_path).to_pandas()
            
            # Aggregate metrics for each column
            agg_data = {
                'id': participant_id,
                
                # X, Y, Z columns: mean, std, min, max
                'mean_x': df['X'].mean(),
                'std_x': df['X'].std(),
                'min_x': df['X'].min(),
                'max_x': df['X'].max(),
                
                'mean_y': df['Y'].mean(),
                'std_y': df['Y'].std(),
                'min_y': df['Y'].min(),
                'max_y': df['Y'].max(),
                
                'mean_z': df['Z'].mean(),
                'std_z': df['Z'].std(),
                'min_z': df['Z'].min(),
                'max_z': df['Z'].max(),
                
                # ENMO and Angle-Z columns: mean, std, min, max
                'mean_enmo': df['enmo'].mean(),
                'std_enmo': df['enmo'].std(),
                'min_enmo': df['enmo'].min(),
                'max_enmo': df['enmo'].max(),
                
                'mean_anglez': df['anglez'].mean(),
                'std_anglez': df['anglez'].std(),
                'min_anglez': df['anglez'].min(),
                'max_anglez': df['anglez'].max(),
                
                # Non-wear flag: ratio of non-wear time
                'non_wear_ratio': df['non-wear_flag'].mean(),
                
                # Light and Battery Voltage columns: mean, std, min, max
                'mean_light': df['light'].mean(),
                'std_light': df['light'].std(),
                'min_light': df['light'].min(),
                'max_light': df['light'].max(),
                
                'mean_battery_voltage': df['battery_voltage'].mean(),
                'std_battery_voltage': df['battery_voltage'].std(),
                'min_battery_voltage': df['battery_voltage'].min(),
                'max_battery_voltage': df['battery_voltage'].max(),
                
                # Time of Day, Weekday, Quarter: mode (most frequent)
                'mode_time_of_day': df['time_of_day'].mode()[0] if not df['time_of_day'].mode().empty else np.nan,
                'mode_weekday': df['weekday'].mode()[0] if not df['weekday'].mode().empty else np.nan,
                'mode_quarter': df['quarter'].mode()[0] if not df['quarter'].mode().empty else np.nan,
                
                # Relative date to PCIAT: mean, min, max
                'mean_relative_date_PCIAT': df['relative_date_PCIAT'].mean(),
                'min_relative_date_PCIAT': df['relative_date_PCIAT'].min(),
                'max_relative_date_PCIAT': df['relative_date_PCIAT'].max()
            }
            
            # Append the aggregated data for this participant
            participant_data.append(agg_data)
    
    # Convert list of dicts to DataFrame
    return pd.DataFrame(participant_data)

# Aggregate actigraphy data for training and testing
aggregated_train_actigraphy = aggregate_actigraphy_data(series_train_path)
aggregated_test_actigraphy = aggregate_actigraphy_data(series_test_path)

In [117]:
# Define numerical and categorical columns in actigraphy data
actigraphy_numerical_cols = [
    'mean_x', 'std_x', 'min_x', 'max_x',
    'mean_y', 'std_y', 'min_y', 'max_y',
    'mean_z', 'std_z', 'min_z', 'max_z',
    'mean_enmo', 'std_enmo', 'min_enmo', 'max_enmo',
    'mean_anglez', 'std_anglez', 'min_anglez', 'max_anglez',
    'non_wear_ratio',
    'mean_light', 'std_light', 'min_light', 'max_light',
    'mean_battery_voltage', 'std_battery_voltage', 'min_battery_voltage', 'max_battery_voltage',
    'mean_relative_date_PCIAT', 'min_relative_date_PCIAT', 'max_relative_date_PCIAT'
]

actigraphy_categorical_cols = ['mode_time_of_day', 'mode_weekday', 'mode_quarter']

# Impute numerical columns with mean
num_imputer = SimpleImputer(strategy='mean')
aggregated_train_actigraphy[actigraphy_numerical_cols] = num_imputer.fit_transform(
    aggregated_train_actigraphy[actigraphy_numerical_cols]
)
aggregated_test_actigraphy[actigraphy_numerical_cols] = num_imputer.transform(
    aggregated_test_actigraphy[actigraphy_numerical_cols]
)

# Impute categorical columns with most frequent
cat_imputer = SimpleImputer(strategy='most_frequent')
aggregated_train_actigraphy[actigraphy_categorical_cols] = cat_imputer.fit_transform(
    aggregated_train_actigraphy[actigraphy_categorical_cols]
)
aggregated_test_actigraphy[actigraphy_categorical_cols] = cat_imputer.transform(
    aggregated_test_actigraphy[actigraphy_categorical_cols]
)

# Merge with the imputed training data
merged_train_df = pd.merge(processed_train_df, aggregated_train_actigraphy, on='id', how='left')

There will still be nulls at this point after this imputation because anyone who does not have actigraphy data will not be affected. Since this whole section of code will mostly be replaced, I am going to impute missing actigraphy data after merging, which is DEFINITELY wrong - but does fill in the NANs and retains all participants so that the end result is a training CSV file with no NANs (which is the end goal of this file along with any feature engineering/selection) - Ethan

In [119]:
# Combine actigraphy columns
actigraphy_cols = actigraphy_numerical_cols + actigraphy_categorical_cols

# Impute missing actigraphy data
# Impute numerical actigraphy columns
num_imputer_actigraphy = SimpleImputer(strategy='mean')
merged_train_df[actigraphy_numerical_cols] = num_imputer_actigraphy.fit_transform(
    merged_train_df[actigraphy_numerical_cols]
)

# Impute categorical actigraphy columns
cat_imputer_actigraphy = SimpleImputer(strategy='most_frequent')
merged_train_df[actigraphy_categorical_cols] = cat_imputer_actigraphy.fit_transform(
    merged_train_df[actigraphy_categorical_cols]
)

# Verify that there are no missing values
missing_values = merged_train_df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print("Missing values in merged training data after imputing actigraphy data:")
print(missing_values.to_string())

Missing values in merged training data after imputing actigraphy data:
Series([], )


# Process and Merge Test Data

In [25]:
# Ensure all required columns are present in test_df
required_cols = numerical_cols + ordinal_categorical_cols + nominal_categorical_cols
missing_cols = set(required_cols) - set(test_df.columns)

for col in missing_cols:
    test_df[col] = np.nan

In [26]:
# Transform the test features using the fitted pipeline
X_test = test_df.drop(columns=['id'])
X_test_processed = feature_preprocessing_pipeline.transform(X_test)

# Create the processed test DataFrame
processed_test_df = pd.DataFrame(X_test_processed, columns=all_features)

# Add 'id' back to the processed test DataFrame
processed_test_df['id'] = test_df['id'].values

In [27]:
processed_test_df.to_csv('./data/processed_test_df_drop_one_no_target.csv', index=False)

In [28]:
# Impute missing actigraphy data in test data
#merged_test_df[actigraphy_numerical_cols] = num_imputer_actigraphy.transform(
#    merged_test_df[actigraphy_numerical_cols]
#)
#merged_test_df[actigraphy_categorical_cols] = cat_imputer_actigraphy.transform(
#    merged_test_df[actigraphy_categorical_cols]
#)

In [29]:
processed_train_df.shape

(2736, 104)

In [30]:
processed_test_df.shape

(20, 103)