In [2]:
# %load_ext cudf.pandas
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns

In [3]:
# Load the raw train / test tables once from disk.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# TODO: delete this
# Independent working copies of the same data. `DataFrame.copy()` is deep by
# default, so mutating `train` / `test` never touches `train_df` / `test_df`,
# and the CSV files are not parsed a second time.
train = train_df.copy()
test = test_df.copy()


In [4]:
# Display names for the four target classes.
# NOTE(review): presumably indexed by the integer `sii` value (0-3) — confirm.
target_labels = [
    'None',
    'Mild',
    'Moderate',
    'Severe',
]

In [5]:
# Columns that exist in the training table but are absent from the test table.
columns_not_in_test = set(train_df.columns).difference(test_df.columns)
columns_not_in_test

{'PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20',
 'PCIAT-PCIAT_Total',
 'PCIAT-Season',
 'sii'}

In [6]:
# Fraction of training rows whose `sii` label is missing
# (the mean of a boolean mask equals count-of-True / number-of-rows).
train_df['sii'].isna().mean()

np.float64(0.3090909090909091)

The test set does not contain any of the PCIAT columns (nor the `sii` label itself), and about 31% of the `sii` labels are missing in the training data.

In [7]:
# Aggregation name -> human-readable column label for the summary table.
score_labels = {
    'min': 'Minimum PCIAT total Score',
    'max': 'Maximum total PCIAT Score',
    'mean': 'Average total PCIAT Score',
}

# Min / max / mean of the PCIAT total score within each `sii` class.
pciat_aggs = (
    train_df.groupby('sii')['PCIAT-PCIAT_Total']
    .agg(list(score_labels))
    .rename(columns=score_labels)
)
pciat_aggs

Unnamed: 0_level_0,Minimum PCIAT total Score,Maximum total PCIAT Score,Average total PCIAT Score
sii,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.0,30.0,13.929737
1.0,31.0,49.0,38.726027
2.0,50.0,79.0,60.65873
3.0,80.0,93.0,85.911765


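The PCIAT total scores align exactly with the classes of the Severity Impairment Index (`sii`): the observed totals are 0–30 for `sii` = 0, 31–49 for `sii` = 1, 50–79 for `sii` = 2, and 80–93 for `sii` = 3, with no overlap between classes.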

In [9]:
# Column names PCIAT-PCIAT_01 .. PCIAT-PCIAT_20
pciat_columns = [f'PCIAT-PCIAT_{item_no:02d}' for item_no in range(1, 21)]

# Row-wise sum of those columns (missing values count as 0), stored as a
# temporary helper column on train_df.
item_scores = train_df[pciat_columns].fillna(0)
train_df['PCIAT_summed_total'] = item_scores.sum(axis=1)

# Compare the computed sum with the provided total (a missing total also counts
# as 0). The final expression is True only when every row matches.
reported_total = train_df['PCIAT-PCIAT_Total'].fillna(0)
is_calculated_sum_equal_to_total_column = train_df['PCIAT_summed_total'].eq(reported_total)
is_calculated_sum_equal_to_total_column.sum() == train_df.shape[0]

np.True_

The number of records where the sum of PCIAT-PCIAT_01 through PCIAT-PCIAT_20 equals PCIAT-PCIAT_Total (treating missing values as 0) is equal to the number of records in the dataset. This shows that PCIAT-PCIAT_Total is an exact linear combination (the plain sum) of other features in this dataset, so it is redundant and we can drop it from the dataset.

In [10]:
# Remove the temporary helper column used for the sum check.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
train_df = train_df.drop(columns=['PCIAT_summed_total'], errors='ignore')

We use KNN imputation to fill in the missing values of all numeric columns, including `sii` (which we then need to round to the nearest integer).

In [24]:
from sklearn.impute import KNNImputer

# Impute every float64 / int64 column of `train` (this includes the `sii`
# target) using the 5 nearest neighbouring rows.
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputer = KNNImputer(n_neighbors=5)
train[numeric_cols] = imputer.fit_transform(train[numeric_cols])

# Imputed values are neighbour averages and can be fractional, so round `sii`
# to the nearest integer class and cap it at the highest class (3).
train['sii'] = train['sii'].round().astype(int).clip(upper=3)
train


Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5.0,0.0,Winter,51.0,Fall,16.877316,46.00,50.80,...,4.0,2.0,4.0,55.0,,48.4,62.2,Fall,3.0,2
1,000fd460,Summer,9.0,0.0,,70.0,Fall,14.035590,48.00,46.00,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0
2,00105258,Summer,10.0,1.0,Fall,71.0,Fall,16.648696,56.50,75.60,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0
3,00115b9f,Winter,9.0,0.0,Fall,71.0,Summer,18.292347,56.00,81.60,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1
4,0016bb22,Spring,18.0,1.0,Summer,73.2,,24.695872,63.69,148.16,...,1.6,1.6,0.8,51.6,,42.8,59.2,,2.4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13.0,0.0,Spring,60.0,Fall,16.362460,59.50,82.40,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1
3956,ffa9794a,Winter,10.0,0.0,,58.6,Spring,18.764678,53.50,76.40,...,0.8,0.8,0.4,22.4,,38.6,54.8,Winter,0.0,0
3957,ffcd4dbd,Fall,11.0,0.0,Spring,68.0,Winter,21.441500,60.00,109.80,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1
3958,ffed1dd5,Spring,13.0,0.0,Spring,70.0,Winter,12.235895,70.70,87.00,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0


# Exploratory Data Analysis

In [320]:
# Drop every column whose name contains "PCIAT" (case-insensitive) before
# looking at correlations.
pciat_mask = train_df.columns.str.contains('PCIAT', case=False)
columns_to_exclude = train_df.columns[pciat_mask]
df_excluded = train_df.drop(columns=columns_to_exclude)

# Pairwise correlations between a small hand-picked set of numeric features.
numeric_features = df_excluded.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_features[
    ['Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical-BMI', 'Physical-Waist_Circumference']
].corr()
correlation_matrix

# TODO: render `correlation_matrix` as a heatmap, e.g.
#   sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='Blues', cbar=True)
# and also look at the BIA-BIA_FFM / BIA-BIA_SMM correlation.



Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,Physical-BMI,Physical-Waist_Circumference
Basic_Demos-Age,1.0,0.064004,0.492112,0.609601
Basic_Demos-Sex,0.064004,1.0,0.025793,-0.020013
Physical-BMI,0.492112,0.025793,1.0,0.892149
Physical-Waist_Circumference,0.609601,-0.020013,0.892149,1.0


In [None]:
# TODO: Explore more heat maps. Remove sii from the correlation matrix
# also, maybe remove columns that are largely null
# Preview of the float64 / int64 columns that remain after excluding PCIAT columns.
numeric_dtypes = ['float64', 'int64']
df_excluded.select_dtypes(include=numeric_dtypes)

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,PCIAT_total
0,5,0,51.0,16.877316,46.0,50.8,,,,,...,38.9177,19.5413,32.6909,,,,,3.0,2.0,55.0
1,9,0,,14.035590,48.0,46.0,22.0,75.0,70.0,122.0,...,39.4497,15.4107,27.0552,,2.340,46.0,64.0,0.0,0.0,0.0
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,...,,,,,2.170,38.0,54.0,2.0,0.0,28.0
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,...,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0,1.0,44.0
4,18,1,,,,,,,,,...,,,,1.04,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,13,0,60.0,16.362460,59.5,82.4,,71.0,70.0,104.0,...,66.2889,29.7790,52.8320,,3.260,35.0,50.0,1.0,1.0,32.0
3956,10,0,,18.764678,53.5,76.4,27.0,60.0,78.0,118.0,...,,,,,2.340,,,0.0,,0.0
3957,11,0,68.0,21.441500,60.0,109.8,,79.0,99.0,116.0,...,71.3903,28.7792,54.4630,,2.729,56.0,77.0,0.0,1.0,31.0
3958,13,0,70.0,12.235895,70.7,87.0,,59.0,61.0,113.0,...,86.2475,45.4340,67.9038,,3.300,33.0,47.0,1.0,0.0,19.0
