In [9]:
import pandas as pd  # type: ignore
import numpy as np  # type: ignore
from sklearn.pipeline import Pipeline  # type: ignore
from sklearn.compose import ColumnTransformer  # type: ignore
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer  # noqa
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder  # type: ignore
from sklearn.ensemble import RandomForestClassifier  # type: ignore
from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.metrics import accuracy_score, classification_report  # type: ignore
from sklearn.base import BaseEstimator, TransformerMixin  # type: ignore

# Paths to data folders
train_csv_path = 'data/train.csv'
test_csv_path = 'data/test.csv'

# Load the CSV files
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Display Entire List of Missing Values
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values.to_string())

# Column definitions
numerical_cols = [
    'Basic_Demos-Age', 'CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
    'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU', 'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_PU', 'FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_TL',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW',
    'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST',
    'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 'PCIAT-PCIAT_Total',
    'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03',
    'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09',
    'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15',
    'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20'
]

ordinal_categorical_cols = [
    'Basic_Demos-Sex', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU_Zone',
    'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL_Zone', 'BIA-BIA_Frame_num',
    'PreInt_EduHx-computerinternet_hoursday'
]

nominal_categorical_cols = [
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season',
    'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season',
    'PreInt_EduHx-Season'
]

# Validate column presence in train_df
for col_list, col_type in [
    (numerical_cols, "Numerical"),
    (ordinal_categorical_cols, "Ordinal"),
    (nominal_categorical_cols, "Nominal")
]:
    for col in col_list:
        if col not in train_df.columns:
            print(f"Missing {col_type} column: {col}")

# Custom transformer to select columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns]

# Iterative Imputer for All Data Types
class IterativeImputerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, random_state=42, max_iter=10, initial_strategy='mean'):
        self.random_state = random_state
        self.max_iter = max_iter
        self.initial_strategy = initial_strategy
        self.iterative_imputer = None
        self.dummies_columns = None

    def fit(self, X, y=None):
        # Encode data and store the columns
        X_encoded = pd.get_dummies(X)
        self.dummies_columns = X_encoded.columns

        # Fit the iterative imputer on the encoded data
        self.iterative_imputer = IterativeImputer(
            random_state=self.random_state,
            max_iter=self.max_iter,
            initial_strategy=self.initial_strategy
        )
        self.iterative_imputer.fit(X_encoded)
        return self

    def transform(self, X):
        if self.iterative_imputer is None:
            raise RuntimeError("IterativeImputerTransformer must be fitted before calling transform.")
        
        # Encode data and align to stored columns
        X_encoded = pd.get_dummies(X)
        X_encoded = X_encoded.reindex(columns=self.dummies_columns, fill_value=0)
        
        # Impute missing values
        X_imputed = self.iterative_imputer.transform(X_encoded)
        
        # Return a DataFrame with the original index
        return pd.DataFrame(X_imputed, columns=self.dummies_columns, index=X.index)

# Numerical Pipeline
numerical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=numerical_cols)),
    ('imputer', IterativeImputerTransformer(random_state=42, max_iter=10, initial_strategy='mean')),
    ('scaler', StandardScaler())
])

# Ordinal Categorical Pipeline
ordinal_categorical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=ordinal_categorical_cols)),
    ('imputer', IterativeImputerTransformer(random_state=42, max_iter=10, initial_strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Nominal Categorical Pipeline
nominal_categorical_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(columns=nominal_categorical_cols)),
    ('imputer', IterativeImputerTransformer(random_state=42, max_iter=10, initial_strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine Numerical and Categorical Pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('ord_cat', ordinal_categorical_pipeline, ordinal_categorical_cols),
    ('nom_cat', nominal_categorical_pipeline, nominal_categorical_cols)
])

# Continue with splitting, preprocessing, and modeling


# Split the data into known and missing 'sii'
train_known = train_df[train_df['sii'].notnull()].copy()
train_missing = train_df[train_df['sii'].isnull()].copy()

# Features and Target for Known 'sii'
X_known = train_known.drop(columns=['id', 'sii'])
y_known = train_known['sii']

# Features for Missing 'sii'
X_missing = train_missing.drop(columns=['id', 'sii'])

# Split known data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_known, y_known, test_size=0.8, random_state=22, stratify=y_known
)

# Create a preprocessing pipeline
feature_preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the training split
X_train_processed = feature_preprocessing_pipeline.fit_transform(X_train_split)

# Transform the validation split
X_val_processed = feature_preprocessing_pipeline.transform(X_val_split)

# Train the Random Forest classifier on the training split
rf = RandomForestClassifier(n_estimators=100, random_state=22, class_weight='balanced')
rf.fit(X_train_processed, y_train_split)

# Predict on the validation split
y_val_pred = rf.predict(X_val_processed)

# Evaluate performance
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred))

# Fit the preprocessing pipeline on the entire known data
X_known_full = X_known
y_known_full = y_known

X_known_full_processed = feature_preprocessing_pipeline.fit_transform(X_known_full)

# Retrain the Random Forest classifier on the entire known data
rf_full = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_full.fit(X_known_full_processed, y_known_full)

# Transform the missing features using the fitted pipeline
X_missing_processed = feature_preprocessing_pipeline.transform(X_missing)

# Predict 'sii' for missing data
sii_pred = rf_full.predict(X_missing_processed)

# Assign predictions to missing 'sii'
train_missing['sii'] = sii_pred

# Combine the known and imputed data
train_df_imputed = pd.concat([train_known, train_missing], ignore_index=True)

# Save the fully imputed data to CSV
train_df_imputed.to_csv('data/train_imputed_iterative.csv', index=False)

# Verify that there are no missing 'sii' values
print("Missing 'sii' after imputation:", train_df_imputed['sii'].isnull().sum())


CGAS-Season                               1405
CGAS-CGAS_Score                           1539
Physical-Season                            650
Physical-BMI                               938
Physical-Height                            933
Physical-Weight                            884
Physical-Waist_Circumference              3062
Physical-Diastolic_BP                     1006
Physical-HeartRate                         993
Physical-Systolic_BP                      1006
Fitness_Endurance-Season                  2652
Fitness_Endurance-Max_Stage               3217
Fitness_Endurance-Time_Mins               3220
Fitness_Endurance-Time_Sec                3220
FGC-Season                                 614
FGC-FGC_CU                                1638
FGC-FGC_CU_Zone                           1678
FGC-FGC_GSND                              2886
FGC-FGC_GSND_Zone                         2898
FGC-FGC_GSD                               2886
FGC-FGC_GSD_Zone                          2897
FGC-FGC_PU   



Validation Accuracy: 0.9698492462311558
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      1275
         1.0       0.94      0.99      0.96       584
         2.0       0.92      0.88      0.90       303
         3.0       1.00      0.19      0.31        27

    accuracy                           0.97      2189
   macro avg       0.96      0.76      0.79      2189
weighted avg       0.97      0.97      0.97      2189





Missing 'sii' after imputation: 0
