In [1]:
# Import thư viện cần thiết
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import optuna
from sklearn.ensemble import VotingClassifier

In [2]:
# Đọc dữ liệu
train_file = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
test_file = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'

df = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

In [3]:
# Xử lý cột ID
id_column = df_test['id']
df_test = df_test.drop(columns=['id'])
df = df.drop(columns=['id'])

In [4]:
# Xử lý cột mục tiêu
target = df.pop('sii')
df = df[target.notna()]
target = target[target.notna()]

In [5]:
# encode object type
for column in df.columns:
    if df[column].dtype == object:
        df[column], _ = pd.factorize(df[column])
for column in df_test.columns:
    if df_test[column].dtype == object:
        df_test[column], _ = pd.factorize(df_test[column])

In [6]:
# Điền giá trị thiếu
df.fillna(df.median(), inplace=True)
df_test.fillna(df_test.median(), inplace=True)

In [7]:
# Đồng bộ hóa cột giữa df và df_test
common_columns = df.columns.intersection(df_test.columns)
df = df[common_columns]
df_test = df_test[common_columns]

In [8]:
# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X = scaler.fit_transform(df)
X_test = scaler.transform(df_test)
y = target.values

In [9]:
# Cân bằng dữ liệu với SMOTE
smote = SMOTE(random_state=42, k_neighbors=5)
X, y = smote.fit_resample(X, y)
X, y

(array([[-1.35846744, -1.52848742, -0.75717765, ..., -0.19473238,
         -1.3138536 ,  1.86535646],
        [-0.47436494, -0.36140681, -0.75717765, ...,  0.52255859,
         -0.44325456, -0.95328231],
        [-0.47436494, -0.06963666,  1.32069402, ..., -0.27443138,
         -0.44325456,  0.9258102 ],
        ...,
        [ 1.29384006,  1.25097053,  0.336178  , ...,  1.08880089,
          1.29794353,  1.42019021],
        [-1.30732702,  1.4904778 ,  1.32069402, ...,  1.02973998,
         -1.26349428, -0.84458724],
        [-1.3211493 ,  1.08512827, -0.75717765, ...,  2.67106739,
         -1.27710544,  0.96546864]]),
 array([2., 0., 0., ..., 3., 3., 3.]))

In [10]:
# #optuna: params tunning
# def objective(
#     trial: optuna.Trial,
# ) -> float:
#     """Objective function for optuna optimisation."""
#     params = {
#         "boosting_type": "gbdt",
#         "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.3, step=0.01),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 150, step=10),
#         "max_depth": trial.suggest_int("max_depth", 10, 20),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1, step=0.1),
#         "verbose" : -1,
#     }
    
#     #kfold
#     kappas = []
#     kf = StratifiedKFold(n_splits=5, shuffle=True)
#     for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#         X_train, X_val = X[train_idx], X[val_idx]
#         y_train, y_val = y[train_idx], y[val_idx]

#         train_data = lgb.Dataset(X_train, y_train)
#         val_data = lgb.Dataset(X_val, y_val)
#         model_lgb = lgb.train(params, train_data)

    
#         y_pred = np.round(model_lgb.predict(X_val), 0)
#         kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
#         kappas.append(kappa)
#     return np.mean(kappas)

#     #single fit
#     # classifier = LGBMClassifier(**params)
#     # classifier.fit(X_train, y_train)
#     # y_pred = classifier.predict(X_val)
#     # return cohen_kappa_score(y_val, y_pred, weights='quadratic')

# objective_func = lambda trial: objective(
#     trial,
# )

# # Run the optimisation
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_func, n_trials=200)

# # Get the best hyperparameters
# print(study.best_params)

In [11]:
params = {
    "boosting_type" : "gbdt", 'verbose' : -1, 'max_bin': 255,
    # 'learning_rate': 0.21, 'max_depth': 13, 'feature_fraction': 0.7
    #'learning_rate': 0.28, 'min_data_in_leaf': 40, 'max_depth': 10, 'feature_fraction': 0.7
    # 'learning_rate': 0.17, 'min_data_in_leaf': 80, 'max_depth': 17, 'feature_fraction': 0.9
    # 'learning_rate': 0.25, 'min_data_in_leaf': 20, 'max_depth': 14, 'feature_fraction': 0.6
    # 'learning_rate': 0.3, 'min_data_in_leaf': 100, 'max_depth': 20, 'feature_fraction': 0.8
    'learning_rate': 0.25, 'min_data_in_leaf': 70, 'max_depth': 15, 'feature_fraction': 0.8
}
#LGBM default 0.8778305601985833 base
#model = LGBMClassifier(verbose=-1)

In [12]:
# #8:2 split training
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# #lgb train
# train_data = lgb.Dataset(X_train, y_train)
# val_data = lgb.Dataset(X_val, y_val)
# model_lgb = lgb.train(params, train_data)

# #Skit fit
# # model_lgb = LGBMClassifier(**params)
# # model_lgb.fit(X_train, y_train)

# y_pred = np.round(model_lgb.predict(X_val), 0)
# kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
# print(kappa)

In [13]:
#kfold train
model_lgb = LGBMClassifier(**params)

kappas = []
kf = StratifiedKFold(n_splits=5, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # train_data = lgb.Dataset(X_train, y_train)
    # val_data = lgb.Dataset(X_val, y_val)
    # model_lgb = lgb.train(params, train_data)

    # y_pred = np.round(model_lgb.predict(X_val), 0)
    # y_pred = np.where(y_pred == -0. , 0. , y_pred)

    model_lgb.fit(X_train, y_train)
    y_pred = model_lgb.predict(X_val)
    # print(y_pred)
    kappa = cohen_kappa_score(y_val, y_pred, weights='quadratic')
    kappas.append(kappa)

print (np.mean(kappas))

0.8975964481182892


In [14]:
# Huấn luyện mô hình Random Forest
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state= 40)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_val)
kappa = cohen_kappa_score(y_val, y_pred_rf, weights='quadratic')
print(kappa)

0.9060411609872963


In [15]:
# def objective_weights(
#     trial: optuna.Trial,
# ) -> float:
#     params = {
#         "weight_lgb" : trial.suggest_float("weight_lgb", 1.0, 2.0, step=0.1),
#         "weight_rf" : trial.suggest_float("weight_rf", 1.0, 2.0, step=0.1)
#     }
#     weight_lgb = params["weight_lgb"]
#     weight_rf = params["weight_rf"]

#     voting_clf = VotingClassifier(
#         estimators=[
#         ('lightgbm', model_lgb),
#         ('random_forest', rf_model)
#         ],weights=[weight_lgb, weight_rf]
#     )

#     voting_clf.fit(X_train, y_train)
#     y_pred_ens = voting_clf.predict(X_val)
#     kappa = cohen_kappa_score(y_val, y_pred_ens, weights='quadratic')
#     return kappa

# objective_func = lambda trial: objective_weights(
#     trial,
# )

# study = optuna.create_study(direction='maximize')
# study.optimize(objective_func, n_trials=50)

# print(study.best_params)

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
voting_clf = VotingClassifier(
    estimators=[
    ('lightgbm', model_lgb),
    ('random_forest', rf_model)
    ],weights=[1.0, 1.0]
)

voting_clf.fit(X_train, y_train)
y_pred_ens = voting_clf.predict(X_val)
kappa = cohen_kappa_score(y_val, y_pred_ens, weights='quadratic')
print(kappa)

0.8809330898404855


In [17]:
test = pd.read_csv(test_file)
test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,32.6909,,,,,,,,Fall,3.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,Summer,1.04,,,,,,,
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,63.1265,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,...,47.2211,,,Winter,3.67,Winter,27.0,40.0,Fall,3.0
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,50.4767,,,Fall,1.27,,,,Fall,2.0
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,,,Summer,2.0
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,


In [18]:
df_test

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,0,5,0,0,51.0,0,16.877316,46.0,50.8,24.0,...,32.6909,-1,1.04,-1,2.34,-1,37.5,53.5,0,3.0
1,1,9,0,-1,63.0,0,14.03559,48.0,46.0,22.0,...,27.0552,-1,1.04,0,2.34,0,46.0,64.0,1,0.0
2,1,10,1,1,71.0,0,16.648696,56.5,75.6,24.0,...,46.60885,-1,1.04,1,2.17,0,38.0,54.0,1,2.0
3,2,9,0,1,71.0,1,18.292347,56.0,81.6,24.0,...,45.9966,-1,1.04,2,2.451,1,31.0,45.0,2,0.0
4,3,18,1,2,63.0,-1,18.292347,55.0,81.6,24.0,...,46.60885,0,1.04,-1,2.34,-1,37.5,53.5,-1,2.0
5,3,13,1,0,50.0,1,22.279952,59.5,112.2,24.0,...,63.1265,-1,1.04,3,4.11,1,40.0,56.0,3,0.0
6,0,10,0,-1,63.0,0,19.66076,55.0,84.6,24.0,...,47.2211,-1,1.04,2,3.67,2,27.0,40.0,0,3.0
7,0,10,1,-1,63.0,0,16.861286,59.25,84.2,27.0,...,50.4767,-1,1.04,0,1.27,-1,37.5,53.5,0,2.0
8,1,15,0,-1,63.0,2,18.292347,55.0,81.6,24.0,...,46.60885,-1,1.04,-1,2.34,-1,37.5,53.5,1,2.0
9,1,19,1,2,63.0,-1,18.292347,55.0,81.6,24.0,...,46.60885,-1,1.04,-1,2.34,-1,37.5,53.5,-1,2.0


In [19]:
# y_pred_test = np.round(model_lgb.predict(X_test), 0)
# y_pred_test = np.where(y_pred_test == -0. , 0. , y_pred_test)
# y_pred_test
y_pred_test = voting_clf.predict(X_test)
y_pred_test

array([2., 0., 0., 1., 2., 1., 0., 0., 2., 2., 0., 0., 1., 0., 2., 2., 0.,
       0., 0., 0.])

In [20]:
test['sii'] = y_pred_test
test

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,Summer,1.04,,,,,,,,2.0
5,001f3379,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,...,,,Spring,4.11,Summer,40.0,56.0,Spring,0.0,1.0
6,0038ba98,Fall,10,0,,,Fall,19.66076,55.0,84.6,...,,,Winter,3.67,Winter,27.0,40.0,Fall,3.0,0.0
7,0068a485,Fall,10,1,,,Fall,16.861286,59.25,84.2,...,,,Fall,1.27,,,,Fall,2.0,0.0
8,0069fbed,Summer,15,0,,,Spring,,,,...,,,,,,,,Summer,2.0,2.0
9,0083e397,Summer,19,1,Summer,,,,,,...,,,,,,,,,,2.0


In [21]:
submission = test[['id', 'sii']]
print(submission)
submission.to_csv('submission.csv', index=False)

          id  sii
0   00008ff9  2.0
1   000fd460  0.0
2   00105258  0.0
3   00115b9f  1.0
4   0016bb22  2.0
5   001f3379  1.0
6   0038ba98  0.0
7   0068a485  0.0
8   0069fbed  2.0
9   0083e397  2.0
10  0087dd65  0.0
11  00abe655  0.0
12  00ae59c9  1.0
13  00af6387  0.0
14  00bd4359  2.0
15  00c0cd71  2.0
16  00d56d4b  0.0
17  00d9913d  0.0
18  00e6167c  0.0
19  00ebc35d  0.0
