In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

# Categorical features

In [36]:
df_train= pd.read_excel("TRAIN_CATEGORICAL_METADATA_new.xlsx")
df_test = pd.read_excel("TEST_CATEGORICAL.xlsx")
outcome= pd.read_excel("TRAINING_SOLUTIONS.xlsx")

In [37]:
outcome

Unnamed: 0,participant_id,ADHD_Outcome,Sex_F
0,UmrK0vMLopoR,1,1
1,CPaeQkhcjg7d,1,0
2,Nb4EetVPm3gs,1,0
3,p4vPhVu91o4b,1,1
4,M09PXs7arQ5E,1,1
...,...,...,...
1208,Atx7oub96GXS,0,0
1209,groSbUfkQngM,0,1
1210,zmxGvIrOD0bt,0,1
1211,rOmWFuJCud5G,0,0


In [38]:
df_train.set_index('participant_id', inplace=True)
df_test.set_index('participant_id', inplace=True)

In [40]:
#Replace values that are not reported to NaN

df_train.loc[(df_train.PreInt_Demos_Fam_Child_Race == 10) | (df_train.PreInt_Demos_Fam_Child_Race == 11), 'PreInt_Demos_Fam_Child_Race'] = np.nan
df_train.loc[df_train.PreInt_Demos_Fam_Child_Ethnicity == 3, 'PreInt_Demos_Fam_Child_Ethnicity'] = np.nan
df_test.loc[(df_test.PreInt_Demos_Fam_Child_Race == 10) | (df_test.PreInt_Demos_Fam_Child_Race == 11), 'PreInt_Demos_Fam_Child_Race'] = np.nan
df_test.loc[df_test.PreInt_Demos_Fam_Child_Ethnicity == 3, 'PreInt_Demos_Fam_Child_Ethnicity'] = np.nan

In [41]:
seed = 41
X_train, X_val, y_train, y_val = train_test_split(df_train, outcome, test_size=0.2, random_state=seed)

In [None]:
def cat_feature_engineering(data):
    ''' This function performs feature engineering on categorical data'''

    # Mapping occupation to categories
    occupation_mapping = {
        0: "manual_labor", 5: "manual_labor", 10: "manual_labor", 15: "manual_labor", 20: "manual_labor",
        25: "skilled_labor", 30: "skilled_labor", 35: "skilled_labor",
        40: "intellectual_labor", 45: "intellectual_labor"
    }

    # Mapping education levels to categories
    education_mapping = {
        3: 0,  # Low education
        6: 0,
        9: 0,
        12: 1, # Medium education
        15: 1,
        18: 2, # High education
        21: 2
    }

    for edu_col in ["Barratt_Barratt_P1_Edu", "Barratt_Barratt_P2_Edu"]:
        data[edu_col+'_map'] = data[edu_col].map(education_mapping)

    # Áp dụng mapping
    for occ_col in ["Barratt_Barratt_P1_Occ", "Barratt_Barratt_P2_Occ"]:
        data[occ_col+'_map'] = data[occ_col].map(occupation_mapping)

    # Chênh lệch học vấn và nghề nghiệp giữa P1 và P2
    data['Edu_Diff'] = abs(data['Barratt_Barratt_P1_Edu'] - data['Barratt_Barratt_P2_Edu'])
    data['Occ_Diff'] = abs(data['Barratt_Barratt_P1_Occ'] - data['Barratt_Barratt_P2_Occ'])

    # Tổng mức độ học vấn và nghề nghiệp
    data['Edu_Total'] = data['Barratt_Barratt_P1_Edu'] + data['Barratt_Barratt_P2_Edu']
    data['Occ_Total'] = data['Barratt_Barratt_P1_Occ'] + data['Barratt_Barratt_P2_Occ']

    # Tạo đặc trưng occupation_education kết hợp
    data['P1_EduOcc'] = data['Barratt_Barratt_P1_Edu_map'].astype(str) + "_" + data['Barratt_Barratt_P1_Occ_map'].astype(str)
    data['P2_EduOcc'] = data['Barratt_Barratt_P2_Edu_map'].astype(str) + "_" + data['Barratt_Barratt_P2_Occ_map'].astype(str)

    # Mức độ giống nhau về học vấn và nghề nghiệp
    data['Same_Edu'] = (data['Barratt_Barratt_P1_Edu'] == data['Barratt_Barratt_P2_Edu']).astype(int)
    data['Same_Occ'] = (data['Barratt_Barratt_P1_Occ'] == data['Barratt_Barratt_P2_Occ']).astype(int)

    # Frequency Encoding cho occupation và education
    for col in ['Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
                'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']:
        freq = data[col].value_counts(normalize=True)
        data[f'{col}_freq'] = data[col].map(freq)

    # 6. One-hot encoding cho P1_EduOcc và P2_EduOcc
    #data = pd.get_dummies(data, columns=['P1_EduOcc', 'P2_EduOcc'], drop_first=True)

    return data

def cat_encode(data):
    ''' This function performs encoding on categorical data using OneHotEncoder and OrdinalEncoder'''
    
    # Ordinal Encoding for occupation groups
    occ_category_mapping = {"manual_labor": 0, "skilled_labor": 1, "intellectual_labor": 2}
    for occ_col in ["Barratt_Barratt_P1_Occ_map", "Barratt_Barratt_P2_Occ_map"]:
        data[occ_col] = data[occ_col].map(occ_category_mapping)

    # Chọn các cột cần điền giá trị thiếu
    target_cols = ['Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site',
       'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
       'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu',
       'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu',
       'Barratt_Barratt_P2_Occ', 'Barratt_Barratt_P1_Edu_map',
       'Barratt_Barratt_P2_Edu_map', 'P1_EduOcc', 'P2_EduOcc']

    # One-Hot Encoding cho các cột còn lại
    oh = OneHotEncoder(handle_unknown='ignore')
    oh_data = oh.fit_transform(data[target_cols])
    oh_columns = oh.get_feature_names_out(target_cols)
    oh_df = pd.DataFrame(oh_data.toarray(), columns=oh_columns, index=data.index)
    data = pd.concat([data.drop(columns=target_cols), oh_df], axis=1)
    return data

In [None]:
#Combine dataset for feature engineering
encode_data= pd.concat([X_train, X_val, df_test], axis=0)

In [44]:
#Apply feature engineering and encoding
data_encoded = cat_feature_engineering(encode_data)
data_encoded = cat_encode(data_encoded)

In [45]:
data_encoded.head()

Unnamed: 0_level_0,Barratt_Barratt_P1_Occ_map,Barratt_Barratt_P2_Occ_map,Edu_Diff,Occ_Diff,Edu_Total,Occ_Total,Same_Edu,Same_Occ,Barratt_Barratt_P1_Edu_freq,Barratt_Barratt_P1_Occ_freq,...,P2_EduOcc_1.0_nan,P2_EduOcc_1.0_skilled_labor,P2_EduOcc_2.0_intellectual_labor,P2_EduOcc_2.0_manual_labor,P2_EduOcc_2.0_nan,P2_EduOcc_2.0_skilled_labor,P2_EduOcc_nan_intellectual_labor,P2_EduOcc_nan_manual_labor,P2_EduOcc_nan_nan,P2_EduOcc_nan_skilled_labor
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a7qT3UHHKevQ,1.0,1.0,0.0,5.0,24.0,65.0,1,0,0.069287,0.133333,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UVwstnvYU3bM,1.0,0.0,9.0,25.0,33.0,45.0,0,0,0.426382,0.178451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3jfYEZ9nAKKr,1.0,1.0,0.0,0.0,42.0,50.0,1,1,0.426382,0.061953,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
tuHEwUBGYRCO,2.0,2.0,0.0,0.0,42.0,80.0,1,1,0.426382,0.133333,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
z1FVD7d0BGpV,2.0,0.0,0.0,45.0,36.0,45.0,1,0,0.343771,0.181145,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
data_encoded.isnull().sum()

Barratt_Barratt_P1_Occ_map           32
Barratt_Barratt_P2_Occ_map          264
Edu_Diff                            236
Occ_Diff                            270
Edu_Total                           236
                                   ... 
P2_EduOcc_2.0_skilled_labor           0
P2_EduOcc_nan_intellectual_labor      0
P2_EduOcc_nan_manual_labor            0
P2_EduOcc_nan_nan                     0
P2_EduOcc_nan_skilled_labor           0
Length: 119, dtype: int64

In [47]:
X_train=data_encoded.iloc[:X_train.shape[0], :]
X_val=data_encoded.iloc[X_train.shape[0]:X_train.shape[0]+X_val.shape[0], :]
df_test=data_encoded.iloc[X_train.shape[0]+X_val.shape[0]:, :]

In [None]:
def cat_imputer(data):
    ''' This function imputes missing values in categorical data using KNN and most_frequent strategy'''
    target_cols = [col for col in data.columns if data[col].isnull().sum() > 0]
    # Áp dụng KNN Imputer
    imputer = KNNImputer(n_neighbors=5)
    data[target_cols] = imputer.fit_transform(data[target_cols].astype(float))

    # Danh sách các cột có missing values cần điền
    missing_cols = [col for col in data.columns if data[col].isnull().sum() > 0]

    # Áp dụng SimpleImputer (most_frequent) cho các cột missing
    if missing_cols:
        imputer_simple = SimpleImputer(strategy='most_frequent')
        data[missing_cols] = imputer_simple.fit_transform(data[missing_cols])
    return data

In [49]:
X_train=cat_imputer(X_train)
X_val=cat_imputer(X_val)
df_test=cat_imputer(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[target_cols] = imputer.fit_transform(data[target_cols].astype(float))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[target_cols] = imputer.fit_transform(data[target_cols].astype(float))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[target_cols] = imputer.fit_transform(data[target_c

In [52]:
X_train.isnull().sum().sum(), X_val.isnull().sum().sum(), df_test.isnull().sum().sum()

(np.int64(0), np.int64(0), np.int64(0))

# Numerical features

In [None]:
#Predefined functions used in transforming numerical features

apq_col = ['APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP']
sdq_col = ['SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems',
        'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
        'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial']

def preprocessing_train(df):
    ''' This function is used to fill missing values in training data.
    '''
    for i in apq_col:
        mode_value = df[i].mode()[0]
        df[i] = df[i].fillna(mode_value)
    df['EHQ_EHQ_Total'] = df['EHQ_EHQ_Total'].fillna(df['EHQ_EHQ_Total'].mode()[0])
    df['MRI_Track_Age_at_Scan'] = df['MRI_Track_Age_at_Scan'].replace(0, np.nan)
    df['MRI_Track_Age_at_Scan'] = df['MRI_Track_Age_at_Scan'].fillna(df['MRI_Track_Age_at_Scan'].mean())
    df['ColorVision_CV_Score'] = df['ColorVision_CV_Score'].fillna(df['ColorVision_CV_Score'].mode()[0])
    df['Pass_ColorVision_CV_Score'] = df['ColorVision_CV_Score'] >= 10
    df['Pass_ColorVision_CV_Score'] = df['Pass_ColorVision_CV_Score'].astype(int)
    df.drop(columns=['ColorVision_CV_Score'], inplace=True)
    for i in sdq_col:
        df[i] = df[i].fillna(df[i].median())
    return df

def preprocessing_test(df):
    ''' This function is used to fill missing values in test data'''

    df['EHQ_EHQ_Total'] = df['EHQ_EHQ_Total'].fillna(df['EHQ_EHQ_Total'].mode()[0])
    df['EHQ_EHQ_Total'] = df['EHQ_EHQ_Total'].clip(lower=-100, upper=100)
    for i in apq_col:
        mode_value_test = df[i].mode()[0]
        df[i] = df[i].fillna(mode_value_test)
    df['ColorVision_CV_Score'] = df['ColorVision_CV_Score'].fillna(df['ColorVision_CV_Score'].mode()[0])
    df['Pass_ColorVision_CV_Score'] = df['ColorVision_CV_Score'] >= 10
    df['Pass_ColorVision_CV_Score'] = df['Pass_ColorVision_CV_Score'].astype(int)
    df.drop(columns=['ColorVision_CV_Score'], inplace=True)
    for i in sdq_col:
        df[i] = df[i].fillna(df[i].median())
    return df

def age_group(x):
    ''' This function is used to classify age groups based on the age at scan.
        Returns a string representing the age group.
    '''
    if x < 8:
        return 'young_child'
    elif x < 12:
        return 'child'
    elif x < 16:
        return 'teen'
    else:
        return 'young_adult'

def classify_handedness(score):
    ''' This function classifies handedness based on the EHQ_EHQ_Total score.
        Returns "Left", "Right", or "Middle" based on the score.
    '''
    if score <= -28:
        return "Left"
    elif score >= 48:
        return "Right"
    else:
        return "Middle"

def FE_quant(df):
    ''' This function performs feature engineering on the quantitative features of the dataset.'''

    scaler = StandardScaler()
    parenting_scaled = scaler.fit_transform(df[apq_col])
    kmeans = KMeans(n_clusters=3, random_state=42)
    df['Parenting_Type'] = kmeans.fit_predict(parenting_scaled)
    # df[apq_col] = MinMaxScaler(feature_range=(0, 10)).fit_transform(df[apq_col])
    df['APQ_Positive'] = df[['APQ_P_APQ_P_PM', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_PP']].mean(axis=1)
    df['APQ_Negative'] = df[['APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID']].mean(axis=1)
    df['SDQ_SDQ_Prosocial_Inverted'] = 10 - df['SDQ_SDQ_Prosocial']  # Vì max = 10
    sdq_problem_cols = [col for col in df.columns if col.startswith("SDQ_") and "Prosocial" not in col]
    df['SDQ_Total_Problems'] = df[sdq_problem_cols].sum(axis=1)
    df['ParentingStyle_vs_Problems'] = df['APQ_Negative'] * df['SDQ_Total_Problems']
    df['Positive_to_Negative_APQ_Ratio'] = df['APQ_Positive'] / (df['APQ_Negative'] + 1e-5)
    df['MRI_Age_Group'] = df['MRI_Track_Age_at_Scan'].apply(age_group)
    df['MRI_Age_Group'] = OneHotEncoder().fit_transform(df[['MRI_Age_Group']]).toarray() 
    df["Handedness"] = df["EHQ_EHQ_Total"].apply(classify_handedness)
    df["Handedness"] = OneHotEncoder().fit_transform(df[["Handedness"]]).toarray()
    

    return df

In [46]:
train_quant= pd.read_excel("TRAIN_QUANTITATIVE_METADATA_new.xlsx", index_col=0)
test_quant= pd.read_excel("TEST_QUANTITATIVE_METADATA.xlsx", index_col=0)

In [None]:
X_train_q, X_val_q, y_train_q, y_val_q = train_test_split(train_quant, outcome, test_size=0.2, random_state=seed)

X_train_q = preprocessing_train(X_train_q)
X_val_q = preprocessing_train(X_val_q)
X_train_q = FE_quant(X_train_q)
X_val_q = FE_quant(X_val_q)

test_quant = preprocessing_test(test_quant)
test_quant = FE_quant(test_quant)

In [48]:
X_train_q

Unnamed: 0_level_0,EHQ_EHQ_Total,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,SDQ_SDQ_Difficulties_Total,SDQ_SDQ_Emotional_Problems,...,Pass_ColorVision_CV_Score,Parenting_Type,APQ_Positive,APQ_Negative,SDQ_SDQ_Prosocial_Inverted,SDQ_Total_Problems,ParentingStyle_vs_Problems,Positive_to_Negative_APQ_Ratio,MRI_Age_Group,Handedness
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a7qT3UHHKevQ,80.00,3.0,8.0,46.0,17.0,15.0,28.0,0.0,16.0,6.0,...,1,0,29.666667,9.333333,1.0,54.0,504.000000,3.178568,1.0,0.0
UVwstnvYU3bM,61.14,3.0,13.0,39.0,18.0,22.0,23.0,1.0,9.0,0.0,...,1,2,28.000000,11.333333,1.0,31.0,351.333333,2.470586,0.0,0.0
3jfYEZ9nAKKr,44.47,3.0,16.0,39.0,15.0,15.0,24.0,1.0,14.0,7.0,...,1,2,26.000000,11.333333,1.0,51.0,578.000000,2.294116,1.0,0.0
tuHEwUBGYRCO,60.00,5.0,17.0,37.0,18.0,21.0,20.0,3.0,11.0,0.0,...,1,2,26.000000,13.333333,3.0,38.0,506.666667,1.949999,1.0,0.0
z1FVD7d0BGpV,53.34,4.0,13.0,49.0,16.0,10.0,27.0,0.0,0.0,0.0,...,1,0,28.666667,11.000000,0.0,0.0,0.000000,2.606058,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fKQY0gay4y9E,17.81,3.0,6.0,38.0,17.0,20.0,30.0,0.0,7.0,2.0,...,1,0,29.333333,8.666667,0.0,21.0,182.000000,3.384611,0.0,0.0
Bm6FfgcqV5IC,93.34,5.0,12.0,37.0,19.0,18.0,24.0,4.0,12.0,3.0,...,1,2,26.333333,12.000000,2.0,42.0,504.000000,2.194443,1.0,0.0
Dg89spvEvIPB,80.00,3.0,7.0,49.0,12.0,14.0,27.0,0.0,1.0,0.0,...,1,0,30.000000,7.333333,0.0,3.0,22.000000,4.090904,0.0,0.0
XC6aisOM0WzL,100.00,3.0,15.0,36.0,18.0,14.0,23.0,1.0,8.0,2.0,...,0,2,24.333333,12.000000,1.0,32.0,384.000000,2.027776,1.0,0.0


In [None]:
#Merging categorical and quantitative features
X_train_of =pd.merge(X_train, X_train_q, on='participant_id', how='left')
X_val_of =pd.merge(X_val, X_val_q, on='participant_id', how='left')
test_of =pd.merge(df_test, test_quant, on='participant_id', how='left')
X_train_of

Unnamed: 0_level_0,Barratt_Barratt_P1_Occ_map,Barratt_Barratt_P2_Occ_map,Edu_Diff,Occ_Diff,Edu_Total,Occ_Total,Same_Edu,Same_Occ,Barratt_Barratt_P1_Edu_freq,Barratt_Barratt_P1_Occ_freq,...,Pass_ColorVision_CV_Score,Parenting_Type,APQ_Positive,APQ_Negative,SDQ_SDQ_Prosocial_Inverted,SDQ_Total_Problems,ParentingStyle_vs_Problems,Positive_to_Negative_APQ_Ratio,MRI_Age_Group,Handedness
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a7qT3UHHKevQ,1.0,1.0,0.0,5.0,24.0,65.0,1,0,0.069287,0.133333,...,1,0,29.666667,9.333333,1.0,54.0,504.000000,3.178568,1.0,0.0
UVwstnvYU3bM,1.0,0.0,9.0,25.0,33.0,45.0,0,0,0.426382,0.178451,...,1,2,28.000000,11.333333,1.0,31.0,351.333333,2.470586,0.0,0.0
3jfYEZ9nAKKr,1.0,1.0,0.0,0.0,42.0,50.0,1,1,0.426382,0.061953,...,1,2,26.000000,11.333333,1.0,51.0,578.000000,2.294116,1.0,0.0
tuHEwUBGYRCO,2.0,2.0,0.0,0.0,42.0,80.0,1,1,0.426382,0.133333,...,1,2,26.000000,13.333333,3.0,38.0,506.666667,1.949999,1.0,0.0
z1FVD7d0BGpV,2.0,0.0,0.0,45.0,36.0,45.0,1,0,0.343771,0.181145,...,1,0,28.666667,11.000000,0.0,0.0,0.000000,2.606058,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fKQY0gay4y9E,1.0,1.0,9.0,5.0,33.0,65.0,0,0,0.069287,0.133333,...,1,0,29.333333,8.666667,0.0,21.0,182.000000,3.384611,0.0,0.0
Bm6FfgcqV5IC,1.0,1.0,0.0,10.0,36.0,60.0,1,0,0.343771,0.178451,...,1,2,26.333333,12.000000,2.0,42.0,504.000000,2.194443,1.0,0.0
Dg89spvEvIPB,0.0,1.0,3.0,35.0,39.0,35.0,0,0,0.426382,0.233670,...,1,0,30.000000,7.333333,0.0,3.0,22.000000,4.090904,0.0,1.0
XC6aisOM0WzL,0.0,1.0,0.0,15.0,24.0,55.0,1,0,0.069287,0.022896,...,0,2,24.333333,12.000000,1.0,32.0,384.000000,2.027776,1.0,0.0


In [105]:
X_train_of.isnull().sum().sum(), X_val_of.isnull().sum().sum(), test_of.isnull().sum().sum()

(np.int64(0), np.int64(0), np.int64(0))

In [204]:
X_train_of.to_csv("X_train_of.csv", index=True)
X_val_of.to_csv("X_val_of.csv", index=True)
test_of.to_csv("test_of.csv", index=True)