In [1]:
# Import thư viện cần thiết
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import optuna
from sklearn.ensemble import VotingClassifier

In [2]:
# Đọc dữ liệu
train_file = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
test_file = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'

df = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

In [3]:
# Xử lý cột ID
id_column = df_test['id']
df_test = df_test.drop(columns=['id'])
df = df.drop(columns=['id'])

In [4]:
# Xử lý cột mục tiêu
target = df.pop('sii')
df = df[target.notna()]
target = target[target.notna()]

In [5]:
# encode object type
for column in df.columns:
    if df[column].dtype == object:
        df[column], _ = pd.factorize(df[column])
for column in df_test.columns:
    if df_test[column].dtype == object:
        df_test[column], _ = pd.factorize(df_test[column])

In [6]:
# Điền giá trị thiếu
df.fillna(df.median(), inplace=True)
df_test.fillna(df_test.median(), inplace=True)

In [7]:
# Đồng bộ hóa cột giữa df và df_test
#common_columns = df.columns.intersection(df_test.columns)
common_columns = ['SDS-SDS_Total_Raw', 'Basic_Demos-Age', 'SDS-SDS_Total_T', 'Physical-Height',
                  'Physical-Weight', 'PreInt_EduHx-computerinternet_hoursday', 'Physical-BMI',
                  'Physical-HeartRate', 'Physical-Systolic_BP', 'CGAS-CGAS_Score',
                  'Physical-Diastolic_BP', 'PAQ_C-PAQ_C_Total', 'FGC-FGC_CU',
                  'BIA-BIA_LDM', 'BIA-BIA_DEE', 'BIA-BIA_ICW', 'BIA-BIA_FFMI',
                  'BIA-BIA_BMC', 'BIA-BIA_LST', 'BIA-BIA_ECW', 'BIA-BIA_SMM',
                  'FGC-FGC_SRR', 'BIA-BIA_FFM', 'BIA-BIA_BMR', 'BIA-BIA_Fat',
                  'FGC-FGC_SRL', 'BIA-BIA_FMI', 'BIA-BIA_TBW', 'FGC-FGC_TL',
                  'CGAS-Season', 'FGC-FGC_PU', 'FGC-Season', 'BIA-BIA_BMI',
                  'FGC-FGC_GSD', 'FGC-FGC_GSND', 'SDS-Season', 'Physical-Season',
                  'PAQ_C-Season', 'PreInt_EduHx-Season', 'Basic_Demos-Enroll_Season',
                  'Fitness_Endurance-Time_Sec', 'Fitness_Endurance-Season']
df = df[common_columns]
df_test = df_test[common_columns]
df

Unnamed: 0,SDS-SDS_Total_Raw,Basic_Demos-Age,SDS-SDS_Total_T,Physical-Height,Physical-Weight,PreInt_EduHx-computerinternet_hoursday,Physical-BMI,Physical-HeartRate,Physical-Systolic_BP,CGAS-CGAS_Score,...,BIA-BIA_BMI,FGC-FGC_GSD,FGC-FGC_GSND,SDS-Season,Physical-Season,PAQ_C-Season,PreInt_EduHx-Season,Basic_Demos-Enroll_Season,Fitness_Endurance-Time_Sec,Fitness_Endurance-Season
0,39.0,5,55.0,46.0,50.8,3.0,16.877316,81.0,114.0,51.0,...,16.8792,20.8,19.4,-1,0,-1,0,0,28.0,-1
1,46.0,9,64.0,48.0,46.0,0.0,14.035590,70.0,122.0,65.0,...,14.0371,20.8,19.4,0,0,0,1,1,28.0,-1
2,38.0,10,54.0,56.5,75.6,2.0,16.648696,94.0,117.0,71.0,...,17.8506,14.7,10.2,0,0,1,1,1,33.0,0
3,31.0,9,45.0,56.0,81.6,0.0,18.292347,97.0,117.0,71.0,...,18.2943,20.8,19.4,1,1,2,2,2,37.0,1
5,40.0,13,56.0,59.5,112.2,0.0,22.279952,73.0,102.0,50.0,...,30.1865,17.9,16.5,1,1,3,3,3,28.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,41.0,8,58.0,52.5,67.2,2.0,17.139810,65.0,112.0,65.0,...,17.1417,20.8,19.4,0,0,0,0,0,28.0,-1
3954,48.0,7,67.0,48.5,46.6,0.0,13.927006,75.0,105.0,65.0,...,13.6457,20.8,19.4,1,1,-1,1,1,28.0,-1
3955,35.0,13,50.0,59.5,82.4,1.0,16.362460,70.0,104.0,60.0,...,16.3642,19.9,18.0,2,0,2,0,0,28.0,-1
3957,56.0,11,77.0,60.0,109.8,0.0,21.441500,99.0,116.0,68.0,...,21.4438,15.8,18.5,2,2,2,0,0,28.0,-1


In [8]:
# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(df)
X_test = scaler.transform(df_test)
y = target.values

In [9]:
# Cân bằng dữ liệu với SMOTE
smote = SMOTE(k_neighbors=5)
X, y = smote.fit_resample(X, y)
X, y

(array([[-0.18555297, -1.52848742, -0.19473238, ..., -1.35846744,
          0.00998159, -0.79349522],
        [ 0.52547355, -0.36140681,  0.52255859, ..., -0.47436494,
          0.00998159, -0.79349522],
        [-0.28712819, -0.06963666, -0.27443138, ..., -0.47436494,
          0.55617413, -0.12405244],
        ...,
        [ 0.37792749,  1.70299244,  0.39476611, ...,  0.47642531,
          0.00998159, -0.79349522],
        [ 1.85275922,  0.53740302,  1.91754907, ..., -0.40315869,
          0.00998159, -0.79349522],
        [ 0.47127186,  0.65651979,  0.51898688, ..., -1.35846744,
          0.00998159,  0.57539144]]),
 array([2., 0., 0., ..., 3., 3., 3.]))

In [10]:
#optuna: params tunning
#def objective(
#     trial: optuna.Trial,
# ) -> float:
#     """Objective function for optuna optimisation."""
#     params = {
#         "boosting_type": "gbdt",
#         "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.3, step=0.01),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 45, step=10),
#         "max_depth": 10,
#         "max_leave": 100,
#         "bagging_fraction" : trial.suggest_float("bagging_fraction", 0.5, 0.9, step=0.1),
#         "bagginf_freq" : trial.suggest_int("bagging_freq", 1, 3, step=1),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 0.9, step=0.1),
#        "verbose" : -1,
#     }
    
     #kfold
#     kappas = []
#     kf = StratifiedKFold(n_splits=5, shuffle=True)
#     for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#         X_train, X_val = X[train_idx], X[val_idx]
#         y_train, y_val = y[train_idx], y[val_idx]

#         train_data = lgb.Dataset(X_train, y_train)
#         val_data = lgb.Dataset(X_val, y_val)
#         model_lgb = lgb.train(params, train_data)

    
#         y_pred = np.round(model_lgb.predict(X_val), 0)
#         kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
#         kappas.append(kappa)
#     return np.mean(kappas)

     #single fit
     # classifier = LGBMClassifier(**params)
     # classifier.fit(X_train, y_train)
     # y_pred = classifier.predict(X_val)
     # return cohen_kappa_score(y_val, y_pred, weights='quadratic')
#objective_func = lambda trial: objective(
#     trial,
#)

 # Run the optimisation
#study = optuna.create_study(direction='maximize')
#study.optimize(objective_func, n_trials=50)

 # Get the best hyperparameters
#print(study.best_params)

In [11]:
params = {
    "boosting_type" : "gbdt", 'verbose' : -1, 'max_bin': 255,
    'learning_rate': 0.27, 'min_data_in_leaf': 5, 'bagging_fraction': 0.7,
    'bagging_freq': 3, 'feature_fraction': 0.7
}
#LGBM default 0.8778305601985833 base
#model = LGBMClassifier(verbose=-1)

In [12]:
# #8:2 split training
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# #lgb train
# train_data = lgb.Dataset(X_train, y_train)
# val_data = lgb.Dataset(X_val, y_val)
# model_lgb = lgb.train(params, train_data)

# #Skit fit
# # model_lgb = LGBMClassifier(**params)
# # model_lgb.fit(X_train, y_train)

# y_pred = np.round(model_lgb.predict(X_val), 0)
# kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
# print(kappa)

In [13]:
#kfold train
model_lgb = LGBMClassifier(**params)

kappas = []
kf = StratifiedKFold(n_splits=5, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # train_data = lgb.Dataset(X_train, y_train)
    # val_data = lgb.Dataset(X_val, y_val)
    # model_lgb = lgb.train(params, train_data)

    # y_pred = np.round(model_lgb.predict(X_val), 0)
    # y_pred = np.where(y_pred == -0. , 0. , y_pred)

    model_lgb.fit(X_train, y_train)
    y_pred = model_lgb.predict(X_val)
    # print(y_pred)
    kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
    kappas.append(kappa)

print (np.mean(kappas))

0.8995744380854542


In [14]:
# Huấn luyện mô hình Random Forest
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)
kappa = cohen_kappa_score(y_val, y_pred_rf, weights='quadratic')
print(kappa)

0.9077148629035421


In [15]:
#RF Kfold
#for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#    X_train, X_val = X[train_idx], X[val_idx]
#    y_train, y_val = y[train_idx], y[val_idx]

    # train_data = lgb.Dataset(X_train, y_train)
    # val_data = lgb.Dataset(X_val, y_val)
    # model_lgb = lgb.train(params, train_data)

    # y_pred = np.round(model_lgb.predict(X_val), 0)
    # y_pred = np.where(y_pred == -0. , 0. , y_pred)

#    rf_model.fit(X_train, y_train)
#    y_pred = rf_model.predict(X_val)
    # print(y_pred)
#    kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
#    kappas.append(kappa)

#print (np.mean(kappas))

In [16]:
# def objective_weights(
#     trial: optuna.Trial,
# ) -> float:
#     params = {
#         "weight_lgb" : trial.suggest_float("weight_lgb", 1.0, 2.0, step=0.1),
#         "weight_rf" : trial.suggest_float("weight_rf", 1.0, 2.0, step=0.1)
#     }
#     weight_lgb = params["weight_lgb"]
#     weight_rf = params["weight_rf"]

#     voting_clf = VotingClassifier(
#         estimators=[
#         ('lightgbm', model_lgb),
#         ('random_forest', rf_model)
#         ],weights=[weight_lgb, weight_rf]
#     )

#     voting_clf.fit(X_train, y_train)
#     y_pred_ens = voting_clf.predict(X_val)
#     kappa = cohen_kappa_score(y_val, y_pred_ens, weights='quadratic')
#     return kappa

# objective_func = lambda trial: objective_weights(
#     trial,
# )

# study = optuna.create_study(direction='maximize')
# study.optimize(objective_func, n_trials=50)

# print(study.best_params)

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
voting_clf = VotingClassifier(
    estimators=[
    ('lightgbm', model_lgb),
    ('random_forest', rf_model)
    ],weights=[1.0, 1.0]
)

voting_clf.fit(X_train, y_train)
y_pred_ens = voting_clf.predict(X_val)
kappa = cohen_kappa_score(y_val, y_pred_ens, weights='quadratic')
print(kappa)

0.8910361091744536


In [18]:
test = pd.read_csv(test_file)
test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,32.6909,,,,,,,,Fall,3.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,Summer,1.04,,,,,,,
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,63.1265,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,...,47.2211,,,Winter,3.67,Winter,27.0,40.0,Fall,3.0
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,50.4767,,,Fall,1.27,,,,Fall,2.0
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,,,Summer,2.0
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,


In [19]:
df_test

Unnamed: 0,SDS-SDS_Total_Raw,Basic_Demos-Age,SDS-SDS_Total_T,Physical-Height,Physical-Weight,PreInt_EduHx-computerinternet_hoursday,Physical-BMI,Physical-HeartRate,Physical-Systolic_BP,CGAS-CGAS_Score,...,BIA-BIA_BMI,FGC-FGC_GSD,FGC-FGC_GSND,SDS-Season,Physical-Season,PAQ_C-Season,PreInt_EduHx-Season,Basic_Demos-Enroll_Season,Fitness_Endurance-Time_Sec,Fitness_Endurance-Season
0,37.5,5,53.5,46.0,50.8,3.0,16.877316,80.0,116.0,51.0,...,16.8792,17.9,16.5,-1,0,-1,0,0,33.0,-1
1,46.0,9,64.0,48.0,46.0,0.0,14.03559,70.0,122.0,63.0,...,14.0371,17.9,16.5,0,0,0,1,1,33.0,-1
2,38.0,10,54.0,56.5,75.6,2.0,16.648696,94.0,117.0,71.0,...,17.78405,14.7,10.2,0,0,1,1,1,33.0,0
3,31.0,9,45.0,56.0,81.6,0.0,18.292347,97.0,117.0,71.0,...,18.2943,17.9,16.5,1,1,2,2,2,37.0,1
4,37.5,18,53.5,55.0,81.6,2.0,18.292347,80.0,116.0,63.0,...,17.78405,17.9,16.5,-1,-1,-1,-1,3,33.0,-1
5,40.0,13,56.0,59.5,112.2,0.0,22.279952,73.0,102.0,50.0,...,30.1865,17.9,16.5,1,1,3,3,3,33.0,-1
6,27.0,10,40.0,55.0,84.6,3.0,19.66076,83.0,163.0,63.0,...,19.6629,17.9,16.5,2,0,2,0,0,33.0,-1
7,37.5,10,53.5,59.25,84.2,2.0,16.861286,90.0,116.0,63.0,...,16.8631,11.1,12.6,-1,0,0,0,0,33.0,-1
8,37.5,15,53.5,55.0,81.6,2.0,18.292347,80.0,116.0,63.0,...,17.78405,17.9,16.5,-1,2,-1,1,1,33.0,-1
9,37.5,19,53.5,55.0,81.6,2.0,18.292347,80.0,116.0,63.0,...,17.78405,17.9,16.5,-1,-1,-1,-1,1,33.0,-1


In [20]:
# y_pred_test = np.round(model_lgb.predict(X_test), 0)
# y_pred_test = np.where(y_pred_test == -0. , 0. , y_pred_test)
# y_pred_test
#votingclf
y_pred_test = voting_clf.predict(X_test)
y_pred_test

array([2., 0., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1.])

In [21]:
#lgbmClf
# y_pred_test = voting_clf.predict(X_test)
# y_pred_test

In [22]:
test['sii'] = y_pred_test
test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,Summer,1.04,,,,,,,,1.0
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0,1.0
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,...,,,Winter,3.67,Winter,27.0,40.0,Fall,3.0,0.0
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,,,Fall,1.27,,,,Fall,2.0,1.0
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,,Summer,2.0,0.0
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,1.0


In [23]:
submission = test[['id', 'sii']]
print(submission)
submission.to_csv('submission.csv', index=False)

          id  sii
0   00008ff9  2.0
1   000fd460  0.0
2   00105258  0.0
3   00115b9f  1.0
4   0016bb22  1.0
5   001f3379  1.0
6   0038ba98  0.0
7   0068a485  1.0
8   0069fbed  0.0
9   0083e397  1.0
10  0087dd65  0.0
11  00abe655  0.0
12  00ae59c9  1.0
13  00af6387  0.0
14  00bd4359  0.0
15  00c0cd71  0.0
16  00d56d4b  0.0
17  00d9913d  0.0
18  00e6167c  0.0
19  00ebc35d  1.0
