# Import Module

In [134]:
import pandas as pd
import os
import numpy as np
from scipy import stats
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data

In [None]:
train_df_path = 'child-mind-institute-problematic-internet-use/train.csv'
train_df = pd.read_csv(train_df_path)
test_df_path = 'child-mind-institute-problematic-internet-use/test.csv'
test_df = pd.read_csv(test_df_path)
train_parquet_path = "child-mind-institute-problematic-internet-use/series_train.parquet"
test_parquet_path = "child-mind-institute-problematic-internet-use/series_test.parquet"

# Preprocess Training Data

In [136]:
def Preprocess_Training_Data(train_df: pd.DataFrame) -> pd.DataFrame: 
    return_df = train_df.copy()

    age_masks = {
        '5-7': (return_df['Basic_Demos-Age'] >= 5) & (return_df['Basic_Demos-Age'] <= 7),
        '8-10': (return_df['Basic_Demos-Age'] >= 8) & (return_df['Basic_Demos-Age'] <= 10),
        '11-13': (return_df['Basic_Demos-Age'] >= 11) & (return_df['Basic_Demos-Age'] <= 13),
        '14-17': (return_df['Basic_Demos-Age'] >= 14) & (return_df['Basic_Demos-Age'] <= 17),
        '18-22': (return_df['Basic_Demos-Age'] >= 18) & (return_df['Basic_Demos-Age'] <= 22),
    }

    sex_masks = {
        '1': return_df['Basic_Demos-Sex'] == 1,
        '0': return_df['Basic_Demos-Sex'] == 0,
    }

    season_features = ['CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'SDS-Season']
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            for season_feature in season_features:
                if return_df.loc[age_mask & sex_mask, season_feature].value_counts().empty:
                    most_season = return_df[season_feature].value_counts().index[0]
                    return_df.loc[sex_mask & age_mask & return_df[season_feature].isna(), season_feature] = most_season
                else:
                    most_season = return_df.loc[age_mask & sex_mask, season_feature].value_counts().index[0]
                    return_df.loc[sex_mask & age_mask & return_df[season_feature].isna(), season_feature] = most_season
    
    # Physical Measures --------------------------------------------------------------------------------------------------- #
    physical_measures = ['Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP', 'Physical-Systolic_BP', 'Physical-HeartRate']
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            for physical_measure in physical_measures:
                median = np.nan
                if return_df.loc[age_mask & sex_mask, physical_measure].notna().any():
                    median = return_df.loc[age_mask & sex_mask, physical_measure].median()
                elif return_df.loc[sex_mask, physical_measure].notna().any():
                    median = return_df.loc[sex_mask, physical_measure].median()
                else:
                    median = return_df[physical_measure].median()
                return_df.loc[sex_mask & age_mask & return_df[physical_measure].isna(), physical_measure] = median

    # Children's Global Assessment Scale ---------------------------------------------------------------------------------- #
    return_df.loc[(return_df['CGAS-CGAS_Score']>100)|(return_df['CGAS-CGAS_Score']<0), 'CGAS-CGAS_Score'] = pd.NA
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            median = np.nan
            if return_df.loc[age_mask & sex_mask, 'CGAS-CGAS_Score'].notna().any():
                median = return_df.loc[age_mask & sex_mask, 'CGAS-CGAS_Score'].median()
            elif return_df.loc[sex_mask, 'CGAS-CGAS_Score'].notna().any():
                median = return_df.loc[sex_mask, 'CGAS-CGAS_Score'].median()
            else:
                median = return_df['CGAS-CGAS_Score'].median()
            return_df.loc[sex_mask & age_mask & return_df['CGAS-CGAS_Score'].isna(), 'CGAS-CGAS_Score'] = median
    
    # FitnessGram Vitals and Treadmill ------------------------------------------------------------------------------------ #
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            mean_max_stage = np.nan
            if return_df.loc[age_mask & sex_mask, 'Fitness_Endurance-Max_Stage'].notna().any():
                mean_max_stage = return_df.loc[age_mask & sex_mask, 'Fitness_Endurance-Max_Stage'].median()
            elif return_df.loc[sex_mask, 'Fitness_Endurance-Max_Stage'].notna().any():
                mean_max_stage = return_df.loc[sex_mask, 'Fitness_Endurance-Max_Stage'].median()
            else:
                mean_max_stage = return_df['Fitness_Endurance-Max_Stage'].median()
            return_df.loc[sex_mask & age_mask & return_df['Fitness_Endurance-Max_Stage'].isna(), 'Fitness_Endurance-Max_Stage'] = mean_max_stage
            mean_sec = np.nan
            if return_df.loc[age_mask & sex_mask, 'Fitness_Endurance-Time_Sec'].notna().any():
                mean_sec = return_df.loc[age_mask & sex_mask, 'Fitness_Endurance-Time_Sec'].median()
            elif return_df.loc[sex_mask, 'Fitness_Endurance-Time_Sec'].notna().any():
                mean_sec = return_df.loc[sex_mask, 'Fitness_Endurance-Time_Sec'].median()
            else:
                mean_sec = return_df['Fitness_Endurance-Time_Sec'].median()
            return_df.loc[sex_mask & age_mask & return_df['Fitness_Endurance-Time_Sec'].isna(), 'Fitness_Endurance-Time_Sec'] = mean_sec

    # FitnessGram Child --------------------------------------------------------------------------------------------------- #
    two_categorical_fgc_features = ['FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_TL']
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items(): 
            for fgc_feature in two_categorical_fgc_features:
                zone = fgc_feature+'_Zone'
                mean = np.nan
                if return_df.loc[age_mask & sex_mask & (return_df[zone]==1), fgc_feature].notna().any():
                    mean = return_df.loc[age_mask & sex_mask & (return_df[zone]==1), fgc_feature].mean()
                else:
                    mean = return_df.loc[return_df[zone]==1, fgc_feature].mean()
                return_df.loc[sex_mask & age_mask & return_df[fgc_feature].isna(), fgc_feature] = mean
                return_df.loc[sex_mask & age_mask & return_df[zone].isna(), zone] = 1

    three_categorical_fgc_features = ['FGC-FGC_GSND', 'FGC-FGC_GSD']
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            for fgc_feature in three_categorical_fgc_features:
                zone = fgc_feature+'_Zone'
                mean = np.nan
                if return_df.loc[age_mask & sex_mask & (return_df[zone]==2), fgc_feature].notna().any():
                    mean = return_df.loc[age_mask & sex_mask & (return_df[zone]==2), fgc_feature].mean()
                else:
                    mean = return_df.loc[return_df[zone]==2, fgc_feature].mean()
                return_df.loc[sex_mask & age_mask & return_df[fgc_feature].isna(), fgc_feature] = mean
                return_df.loc[sex_mask & age_mask & return_df[zone].isna(), zone] = 2
    
    # Sleep Disturbance Scale --------------------------------------------------------------------------------------------- #
    return_df.loc[(return_df['SDS-SDS_Total_Raw']>100)|(return_df['SDS-SDS_Total_Raw']<0), 'SDS-SDS_Total_Raw'] = pd.NA
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            mean_sds = np.nan
            if return_df.loc[age_mask & sex_mask, 'SDS-SDS_Total_Raw'].notna().any():
                mean_sds = return_df.loc[age_mask & sex_mask, 'SDS-SDS_Total_Raw'].mean()
            elif return_df.loc[sex_mask, 'SDS-SDS_Total_Raw'].notna().any():
                mean_sds = return_df.loc[sex_mask, 'SDS-SDS_Total_Raw'].mean()
            else:
                mean_sds = return_df['SDS-SDS_Total_Raw'].mean()
            return_df.loc[sex_mask & age_mask & return_df['SDS-SDS_Total_Raw'].isna(), 'SDS-SDS_Total_Raw'] = mean_sds
    
    # Bio-electric Impedance Analysis ------------------------------------------------------------------------------------- #
    numerical_bia_features = ['BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW']
    for range_label, age_mask in age_masks.items():
        for sex_label, sex_mask in sex_masks.items():
            for bia_features in numerical_bia_features:
                mean = np.nan
                if return_df.loc[age_mask & sex_mask, bia_features].notna().any():
                    mean = return_df.loc[age_mask & sex_mask, bia_features].mean()
                elif return_df.loc[sex_mask, bia_features].notna().any():
                    mean = return_df.loc[sex_mask, bia_features].mean()
                else:
                    mean = return_df[bia_features].mean()
                return_df.loc[sex_mask & age_mask & return_df[bia_features].isna(), bia_features] = mean
    return_df.loc[return_df['BIA-BIA_Activity_Level_num'].isna(), 'BIA-BIA_Activity_Level_num'] = 3
    return_df.loc[return_df['BIA-BIA_Frame_num'].isna(), 'BIA-BIA_Frame_num'] = 2

    # Physical Activity Questionnaire ------------------------------------------------------------------------------------- #
    age_masks_PAQ = {
        '5-13': (return_df['Basic_Demos-Age'] >= 5) & (return_df['Basic_Demos-Age'] <= 13),
        '14-22': (return_df['Basic_Demos-Age'] >= 14) & (return_df['Basic_Demos-Age'] <= 22)
    }
    return_df['PAQ_Total'] = return_df['PAQ_A-PAQ_A_Total'].combine_first(return_df['PAQ_C-PAQ_C_Total'])
    return_df['PAQ_Season'] = return_df['PAQ_A-Season'].combine_first(return_df['PAQ_C-Season'])
    for range_label, age_mask in age_masks_PAQ.items():
        for sex_label, sex_mask in sex_masks.items():
            mean_paq = return_df.loc[age_mask & sex_mask, 'PAQ_Total'].mean()
            return_df.loc[sex_mask & age_mask & return_df['PAQ_Total'].isna(), 'PAQ_Total'] = mean_paq
            most_paq_season = return_df.loc[age_mask & sex_mask, 'PAQ_Season'].value_counts().index[0]
            return_df.loc[sex_mask & age_mask & return_df['PAQ_Season'].isna(), 'PAQ_Season'] = most_paq_season
    
    return_df.drop(['Basic_Demos-Enroll_Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-SDS_Total_T', 'Fitness_Endurance-Time_Mins', 'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday'], axis=1, inplace=True)
    return_df.drop(['PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10',
       'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20'], axis=1, inplace=True)
        
    return return_df

# Preprocess Training Parquet

In [137]:
def Preprocess_Training_Parquet(train_parquet_path) -> pd.DataFrame:
    def Preprocess_Parquet(parquet: pd.DataFrame) -> pd.DataFrame:
        data = parquet.copy()
        data['timestamp'] = pd.to_datetime(data['relative_date_PCIAT'], unit='D') + pd.to_timedelta(data['time_of_day'])
        data = data[data['non-wear_flag'] == 0]
        hour = pd.to_datetime(data['time_of_day']).dt.hour
        time_masks = {
            'morning': (hour >= 6) & (hour < 12),
            'afternoon': (hour >= 12) & (hour < 17),
            'evening': (hour >= 17) & (hour < 22),
            'night': (hour >= 22) | (hour < 6)
        }
        
        features = dict()
        # season -----------------------------------------------------------------------------------------------#
        features.update({
            'actigraphy_season': data['quarter'].value_counts().index[0]
        })
        # weekend vs weekday -----------------------------------------------------------------------------------#
        proportions = data['weekday'].isin([6, 7]).value_counts(normalize=True).to_dict()
        features.update({
            'weekend/weekday': proportions.get(True, 0)
        })
        # time base data ---------------------------------------------------------------------------------------#
        for period, mask in time_masks.items():
            features.update({
                f'{period}_enmo_mean': data.loc[mask, 'enmo'].mean(),
                f'{period}_enmo_std': data.loc[mask, 'enmo'].std(),
                f'{period}_anglez_std': data.loc[mask, 'anglez'].std(),
            })
        # sleep quality ----------------------------------------------------------------------------------------#
        sleep_hours = time_masks['night']
        features.update({
            'sleep_disruption_count': len(data.loc[sleep_hours & (data['enmo'] > data['enmo'].mean() + 2 * data['enmo'].std())]),
            'light_exposure_during_sleep': data.loc[sleep_hours, 'light'].mean(),
            'light_exposure_during_sleep_std': data.loc[sleep_hours, 'light'].std(),
            'light_exposure_during_sleep_disruption': len(data.loc[sleep_hours & (data['light'] > data['light'].mean() + 2 * data['light'].std())]),
            'sleep_position_changes': len(data.loc[sleep_hours & (abs(data['anglez'].diff()) > 60)]),
            'good_sleep_cycle': int(data.loc[sleep_hours, 'light'].mean() < 50)
        })
        # non-wear metrices ------------------------------------------------------------------------------------#
        features.update({
            'wear_consistency': data.loc[mask, 'enmo'].std(),
        })
        # arm movement -----------------------------------------------------------------------------------------#
        bins = [-90, -60, -30, 0, 30, 60, 90]
        labels = ['anglez -90 to -60', 'anglez -60 to -30', 'anglez -30 to 0', 'anglez 0 to 30', 'anglez 30 to 60', 'anglez 60 to 90']
        interval_proportion_dict = pd.cut(data['anglez'], bins=bins, labels=labels, right=False).value_counts(normalize=True).sort_index().to_dict()
        features.update(interval_proportion_dict)

        return pd.DataFrame([features])
    
    ids = os.listdir(train_parquet_path)
    data_frames = []
    for file_id in tqdm(ids, desc="Preprocess Training Parquet"):
        result = Preprocess_Parquet(pd.read_parquet(os.path.join(train_parquet_path, file_id, 'part-0.parquet')))
        result['id'] = file_id.split('=')[1]
        data_frames.append(result)

    return pd.concat(data_frames, ignore_index=True)

# Preprocessing

In [138]:
post_train_data = Preprocess_Training_Data(train_df)
post_train_parquet = Preprocess_Training_Parquet(train_parquet_path)
merged_train_df = pd.merge(post_train_data, post_train_parquet, on='id', how='left')
merged_train_df['with_parquet'] = merged_train_df['actigraphy_season'].notna()

Preprocess Training Parquet: 100%|██████████| 996/996 [02:37<00:00,  6.32it/s]
  return pd.concat(data_frames, ignore_index=True)


## train post_train_data

In [139]:
# sns.pairplot(post_train_data['Basic_Demos-Age','Basic_Demos-Sex']],hue='sii')
import helpers.data_mining_helpers as dmh
post_train_data.isnull().apply(lambda x: dmh.check_missing_values(x)).T

Unnamed: 0,0,1
id,The amoung of missing records is:,0
Basic_Demos-Age,The amoung of missing records is:,0
Basic_Demos-Sex,The amoung of missing records is:,0
CGAS-Season,The amoung of missing records is:,0
CGAS-CGAS_Score,The amoung of missing records is:,0
Physical-Season,The amoung of missing records is:,0
Physical-BMI,The amoung of missing records is:,0
Physical-Height,The amoung of missing records is:,0
Physical-Weight,The amoung of missing records is:,0
Physical-Waist_Circumference,The amoung of missing records is:,0


In [140]:
post_train_data.dropna(inplace=True)
post_train_data.isnull().apply(lambda x: dmh.check_missing_values(x)).T

Unnamed: 0,0,1
id,The amoung of missing records is:,0
Basic_Demos-Age,The amoung of missing records is:,0
Basic_Demos-Sex,The amoung of missing records is:,0
CGAS-Season,The amoung of missing records is:,0
CGAS-CGAS_Score,The amoung of missing records is:,0
Physical-Season,The amoung of missing records is:,0
Physical-BMI,The amoung of missing records is:,0
Physical-Height,The amoung of missing records is:,0
Physical-Weight,The amoung of missing records is:,0
Physical-Waist_Circumference,The amoung of missing records is:,0


In [141]:
post_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            2736 non-null   object 
 1   Basic_Demos-Age               2736 non-null   int64  
 2   Basic_Demos-Sex               2736 non-null   int64  
 3   CGAS-Season                   2736 non-null   object 
 4   CGAS-CGAS_Score               2736 non-null   float64
 5   Physical-Season               2736 non-null   object 
 6   Physical-BMI                  2736 non-null   float64
 7   Physical-Height               2736 non-null   float64
 8   Physical-Weight               2736 non-null   float64
 9   Physical-Waist_Circumference  2736 non-null   float64
 10  Physical-Diastolic_BP         2736 non-null   float64
 11  Physical-HeartRate            2736 non-null   float64
 12  Physical-Systolic_BP          2736 non-null   float64
 13  Fitness_

In [142]:
post_train_data_value = post_train_data.select_dtypes(exclude=['object'])
post_train_data_value

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,sii,PAQ_Total
0,5,0,51.0,16.877316,46.0,50.8,23.0,66.0,86.0,110.0,...,1.0,24.434900,8.89536,38.917700,19.541300,32.690900,55.0,41.257576,2.0,2.658115
1,9,0,65.0,14.035590,48.0,46.0,22.0,75.0,70.0,122.0,...,1.0,21.035200,14.97400,39.449700,15.410700,27.055200,0.0,46.000000,0.0,2.340000
2,10,1,71.0,16.648696,56.5,75.6,25.0,65.0,94.0,117.0,...,2.0,34.361827,26.41625,69.139954,38.859023,62.874586,28.0,38.000000,0.0,2.170000
3,9,0,71.0,18.292347,56.0,81.6,25.0,60.0,97.0,117.0,...,2.0,30.404100,16.77900,58.933800,26.479800,45.996600,44.0,31.000000,1.0,2.451000
5,13,1,50.0,22.279952,59.5,112.2,28.0,60.0,73.0,102.0,...,2.0,32.914100,20.90200,79.698200,35.380400,63.126500,34.0,40.000000,1.0,4.110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,8,0,65.0,17.139810,52.5,67.2,25.0,60.0,65.0,112.0,...,1.0,25.711800,15.81500,49.330100,20.264500,36.718100,22.0,41.000000,0.0,3.440000
3954,7,1,66.0,13.927006,48.5,46.6,23.0,65.0,75.0,105.0,...,1.0,20.057200,15.14000,42.818500,18.093700,30.045300,33.0,48.000000,1.0,2.486920
3955,13,0,60.0,16.362460,59.5,82.4,29.0,71.0,70.0,104.0,...,1.0,33.370900,17.97970,66.288900,29.779000,52.832000,32.0,35.000000,1.0,3.260000
3957,11,0,68.0,21.441500,60.0,109.8,29.0,79.0,99.0,116.0,...,2.0,33.980500,21.34030,71.390300,28.779200,54.463000,31.0,56.000000,1.0,2.729000


In [143]:
from sklearn.model_selection import train_test_split
x_data_value = post_train_data_value.drop('sii',axis=1)
y_data_value = post_train_data_value['sii']
x_train, x_val, y_train, y_val = train_test_split(x_data_value,y_data_value,test_size=0.2,random_state=101)


In [144]:
from sklearn.tree import DecisionTreeClassifier
dtree_data_value = DecisionTreeClassifier()
dtree_data_value.fit(x_train,y_train)

In [145]:
predictions_data = dtree_data_value.predict(x_val)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
accuracy = accuracy_score(y_val, predictions_data)
print(f"DecisionTreeClassifier Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_val,predictions_data))
print(confusion_matrix(y_val,predictions_data))

DecisionTreeClassifier Accuracy: 100.00%
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       322
         1.0       1.00      1.00      1.00       138
         2.0       1.00      1.00      1.00        80
         3.0       1.00      1.00      1.00         8

    accuracy                           1.00       548
   macro avg       1.00      1.00      1.00       548
weighted avg       1.00      1.00      1.00       548

[[322   0   0   0]
 [  0 138   0   0]
 [  0   0  80   0]
 [  0   0   0   8]]


In [146]:
from sklearn.ensemble import RandomForestClassifier

rf_data_value = RandomForestClassifier(n_estimators=100)
rf_data_value.fit(x_train,y_train)

rfc_pred = rf_data_value.predict(x_val)

accuracy = accuracy_score(y_val, rfc_pred)
print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")
print(confusion_matrix(y_val,rfc_pred))
print(classification_report(y_val,rfc_pred))

RandomForestClassifier Accuracy: 97.45%
[[322   0   0   0]
 [  0 138   0   0]
 [  0   6  74   0]
 [  0   2   6   0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       322
         1.0       0.95      1.00      0.97       138
         2.0       0.93      0.93      0.93        80
         3.0       0.00      0.00      0.00         8

    accuracy                           0.97       548
   macro avg       0.72      0.73      0.72       548
weighted avg       0.96      0.97      0.97       548



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [190]:
test_columns=test_df.columns.tolist()
test_data=test_df.copy()
columns=x_train.columns.tolist()
for i in columns:
     if i not in test_columns:
        print(i)
for i in test_data:
     if i not in columns:
        test_data=test_data.drop(i,axis=1)
             
test_y_path = 'child-mind-institute-problematic-internet-use/sample_submission.csv'
test_data_y = pd.read_csv(test_y_path).drop('id',axis=1).values.tolist()

## test 沒有PCIAT-PCIAT_Total,PAQ_Total 所以前面訓練要再做一次

In [184]:
from sklearn.model_selection import train_test_split
x_data_value = post_train_data_value.drop(['sii','PCIAT-PCIAT_Total','PAQ_Total'],axis=1)
y_data_value = post_train_data_value['sii']
x_train, x_val, y_train, y_val = train_test_split(x_data_value,y_data_value,test_size=0.2,random_state=101)


In [191]:
from sklearn.tree import DecisionTreeClassifier
dtree_data_value = DecisionTreeClassifier()
dtree_data_value.fit(x_train,y_train)

predictions_data = dtree_data_value.predict(x_val)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
accuracy = accuracy_score(y_val, predictions_data)
print(f"DecisionTreeClassifier Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_val,predictions_data))
print(confusion_matrix(y_val,predictions_data))

DecisionTreeClassifier Accuracy: 50.00%
              precision    recall  f1-score   support

         0.0       0.66      0.66      0.66       322
         1.0       0.30      0.33      0.32       138
         2.0       0.20      0.16      0.18        80
         3.0       0.25      0.25      0.25         8

    accuracy                           0.50       548
   macro avg       0.35      0.35      0.35       548
weighted avg       0.50      0.50      0.50       548

[[213  77  29   3]
 [ 72  46  20   0]
 [ 37  27  13   3]
 [  1   2   3   2]]


In [186]:
from sklearn.ensemble import RandomForestClassifier

rf_data_value = RandomForestClassifier(n_estimators=100)
rf_data_value.fit(x_train,y_train)

rfc_pred = rf_data_value.predict(x_val)

accuracy = accuracy_score(y_val, rfc_pred)
print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")
print(confusion_matrix(y_val,rfc_pred))
print(classification_report(y_val,rfc_pred))

RandomForestClassifier Accuracy: 60.77%
[[287  29   6   0]
 [ 89  43   6   0]
 [ 52  25   3   0]
 [  0   6   2   0]]
              precision    recall  f1-score   support

         0.0       0.67      0.89      0.77       322
         1.0       0.42      0.31      0.36       138
         2.0       0.18      0.04      0.06        80
         3.0       0.00      0.00      0.00         8

    accuracy                           0.61       548
   macro avg       0.32      0.31      0.30       548
weighted avg       0.52      0.61      0.55       548



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [192]:
# test
predictions_data = dtree_data_value.predict(test_data)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
accuracy = accuracy_score(test_data_y, predictions_data)
print(f"DecisionTreeClassifier Accuracy: {accuracy * 100:.2f}%")
print(classification_report(test_data_y,predictions_data))
print(confusion_matrix(test_data_y,predictions_data))

DecisionTreeClassifier Accuracy: 20.00%
              precision    recall  f1-score   support

           0       0.11      0.20      0.14         5
           1       0.33      0.40      0.36         5
           2       0.20      0.20      0.20         5
           3       0.00      0.00      0.00         5

    accuracy                           0.20        20
   macro avg       0.16      0.20      0.18        20
weighted avg       0.16      0.20      0.18        20

[[1 1 3 0]
 [2 2 1 0]
 [3 1 1 0]
 [3 2 0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## train post_train_parquet

In [207]:
post_test_parquet = Preprocess_Training_Parquet(test_parquet_path)
post_test_parquet.columns==post_train_parquet.columns


Preprocess Training Parquet: 100%|██████████| 2/2 [00:00<00:00, 11.68it/s]


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [208]:
post_train_parquet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 996 entries, 0 to 995
Data columns (total 28 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   actigraphy_season                       996 non-null    int8   
 1   weekend/weekday                         996 non-null    float64
 2   morning_enmo_mean                       993 non-null    float32
 3   morning_enmo_std                        993 non-null    float32
 4   morning_anglez_std                      993 non-null    float32
 5   afternoon_enmo_mean                     993 non-null    float32
 6   afternoon_enmo_std                      993 non-null    float32
 7   afternoon_anglez_std                    993 non-null    float32
 8   evening_enmo_mean                       996 non-null    float32
 9   evening_enmo_std                        996 non-null    float32
 10  evening_anglez_std                      996 non-null    float3

In [236]:
ids=post_train_parquet['id'].values.tolist()
t_parquet=[]
for i in ids:
    t_parquet.append(train_df[train_df['id']==i]['sii'].values[0])

train_parquet=pd.concat([post_train_parquet,pd.DataFrame({'sii': t_parquet})], axis=1)

In [238]:
from sklearn.model_selection import train_test_split
x_data_parquet = post_train_parquet.drop(['id'],axis=1)
y_data_parquet = train_parquet['sii']
x_train, x_val, y_train, y_val = train_test_split(x_data_parquet,y_data_parquet,test_size=0.2,random_state=101)

In [239]:
from sklearn.tree import DecisionTreeClassifier
dtree_data_value = DecisionTreeClassifier()
dtree_data_value.fit(x_train,y_train)

predictions_data = dtree_data_value.predict(x_val)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
accuracy = accuracy_score(y_val, predictions_data)
print(f"DecisionTreeClassifier Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_val,predictions_data))
print(confusion_matrix(y_val,predictions_data))

DecisionTreeClassifier Accuracy: 47.50%
              precision    recall  f1-score   support

         0.0       0.60      0.62      0.61       111
         1.0       0.33      0.30      0.31        57
         2.0       0.28      0.30      0.29        30
         3.0       0.00      0.00      0.00         2

    accuracy                           0.47       200
   macro avg       0.30      0.30      0.30       200
weighted avg       0.47      0.47      0.47       200

[[69 25 17  0]
 [33 17  6  1]
 [12  9  9  0]
 [ 1  1  0  0]]


In [240]:
from sklearn.ensemble import RandomForestClassifier

rf_data_value = RandomForestClassifier(n_estimators=100)
rf_data_value.fit(x_train,y_train)

rfc_pred = rf_data_value.predict(x_val)

accuracy = accuracy_score(y_val, rfc_pred)
print(f"RandomForestClassifier Accuracy: {accuracy * 100:.2f}%")
print(confusion_matrix(y_val,rfc_pred))
print(classification_report(y_val,rfc_pred))

RandomForestClassifier Accuracy: 55.50%
[[100  11   0   0]
 [ 47  10   0   0]
 [ 25   4   1   0]
 [  1   1   0   0]]
              precision    recall  f1-score   support

         0.0       0.58      0.90      0.70       111
         1.0       0.38      0.18      0.24        57
         2.0       1.00      0.03      0.06        30
         3.0       0.00      0.00      0.00         2

    accuracy                           0.56       200
   macro avg       0.49      0.28      0.25       200
weighted avg       0.58      0.56      0.47       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
