In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Competition training data; the "id" and "Name" columns are not used as features.
train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
train = train.drop(columns=["id", "Name"])

# Extra rows from the original survey dataset. Its target is stored as
# "No"/"Yes" text, so it is mapped to 0/1 before being stacked under the
# competition rows (any other label would become NaN).
original = pd.read_csv("/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv")
original = original.drop(columns="Name")
label_to_int = {"No": 0, "Yes": 1}
original["Depression"] = original["Depression"].map(label_to_int)
train = pd.concat([train, original])

# Competition test data, with the same non-feature columns removed.
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv").drop(columns=["id", "Name"])

# Columns treated as categorical: every object-dtype column of the training
# frame, plus a fixed list of rating/hour columns that are cast to category
# as well instead of being left numeric.
object_columns = [name for name, dtype in train.dtypes.items() if dtype == 'object']
encode_columns = object_columns + [
    "Academic Pressure",
    "Work Pressure",
    "Study Satisfaction",
    "Work/Study Hours",
    "Financial Stress",
    "Job Satisfaction",
]

# Cast the selected columns to pandas "category" dtype in both frames.
for frame in (train, test):
    for col in encode_columns:
        frame[col] = frame[col].astype("category")

# Keep only the final list; the intermediate names are discarded.
cat_features = list(encode_columns)
del object_columns, encode_columns

# Split features and target by column NAME rather than by position: after the
# concat of two data sources, relying on "Depression" being the last column is
# fragile (an extra column in either source would shift it silently).
X = train.drop(columns="Depression")
y = train["Depression"].copy()

# 10-fold stratified CV with a fixed seed so folds are reproducible.
skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# Class counts, computed with the vectorised Series.sum() instead of iterating
# the Series element by element with the built-in sum().
positive_count = (y == 1).sum()
negative_count = (y == 0).sum()
# NOTE(review): this ratio is computed but not passed to the model anywhere in
# this script; wire it into the LightGBM params if class weighting is wanted.
scale_pos_weight = negative_count / positive_count

# Out-of-fold buffers (one slot per training row) and test-set accumulators
# (one slot per test row); the latter are summed over folds and averaged later.
lgb_oof_preds = np.zeros(len(y))
lgb_oof_probas = np.zeros(len(y))
lgb_test_preds = np.zeros(len(test))
lgb_test_probas = np.zeros(len(test))
# Per-fold validation accuracy.
lgb_train_scores = []


# Parameters for the native `lgb.train` API.
#
# Two keys from the previous version were removed because the native API does
# not know them and silently ignores them (warnings are hidden by
# verbosity=-1), so they only gave the false impression of being active:
#   * 'class_weight'       - an argument of the scikit-learn wrapper only;
#   * 'enable_categorical' - an XGBoost argument, not a LightGBM one.
# Dropping them leaves the fitted models unchanged. To actually re-weight the
# classes with the native API, set 'is_unbalance' or 'scale_pos_weight'.
#
# NOTE(review): per the LightGBM docs, row subsampling only takes effect when
# a bagging frequency ('subsample_freq' / 'bagging_freq') > 0 is also set, so
# 'subsample' below is presumably inert as configured - confirm intent.
lgb_params = {
    'learning_rate': 0.07164153618562107,
    'num_leaves': 42,
    'max_depth': 10,
    'min_child_samples': 95,
    'subsample': 0.6396409682368092,
    'colsample_bytree': 0.40808651926024586,
    'lambda_l1': 2.239039849097975e-06,
    'lambda_l2': 0.0016799972868280992,
    # Upper bound on boosting rounds; early stopping normally ends sooner.
    'n_estimators': 3000,
    'random_state': 42,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
}

# Stratified K-fold training loop: fit one booster per fold, record its
# out-of-fold predictions, and accumulate its predictions on the test set.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # The split yields positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=cat_features)
    lgb_valid = lgb.Dataset(
        X_valid,
        y_valid,
        categorical_feature=cat_features,
        reference=lgb_train,
    )

    # Stop once the validation metric has not improved for 200 rounds.
    model = lgb.train(
        lgb_params,
        lgb_train,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(stopping_rounds=200)],
    )

    # Held-out part of this fold: probabilities and 0.5-thresholded labels.
    y_pred_proba = model.predict(X_valid, num_iteration=model.best_iteration)
    y_pred = (y_pred_proba > 0.5).astype(int)
    lgb_oof_probas[valid_idx] = y_pred_proba
    lgb_oof_preds[valid_idx] = y_pred

    accuracy = accuracy_score(y_valid, y_pred)
    lgb_train_scores.append(accuracy)
    print("Fold:", fold, "Accuracy Score:", accuracy)

    # Test set: add this fold's probabilities and hard votes to the running sums.
    y_test_probas = model.predict(test)
    y_test_preds = (y_test_probas > 0.5).astype(int)
    lgb_test_probas += y_test_probas
    lgb_test_preds += y_test_preds

# Turn the per-fold sums into averages: a mean probability and, for the hard
# votes, the fraction of folds that predicted class 1.
n_folds = skf.get_n_splits()
lgb_test_probas /= n_folds
lgb_test_preds /= n_folds

# Average validation accuracy across the folds.
mean_score = np.mean(lgb_train_scores)
print("Mean Accuracy Score:", mean_score)

# Out-of-fold and test-set predictions collected into frames (e.g. for stacking).
lgb_frame = pd.DataFrame({"lgb_oof_preds": lgb_oof_preds,
                          "lgb_oof_probas": lgb_oof_probas})
lgb_test_frame = pd.DataFrame({"lgb_test_preds": lgb_test_preds,
                               "lgb_test_probas": lgb_test_probas})
# lgb_frame.to_csv("lgbt_oof_frame.csv",index = False)
# lgb_test_frame.to_csv("lgb_test_frame.csv",index = False)

# Take the ids from the test file itself instead of a hard-coded range, so the
# submission stays aligned with the test rows if the file ever changes.
submission_id = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv",
                            usecols=["id"])["id"].values

# Final label = fold-averaged probability thresholded at 0.5.
# The previous version cast `lgb_test_preds` to int, but that array holds the
# FRACTION of folds voting 1, so truncation produced 1 only when every fold
# agreed and biased the submission towards class 0.
submission_labels = (lgb_test_frame["lgb_test_probas"].values > 0.5).astype(int)

submission = pd.DataFrame({'id': submission_id,
                           'Depression': submission_labels}).set_index('id')
# 'id' is the index, so it must be written out; index=False would drop it and
# leave a file containing only the 'Depression' column.
submission.to_csv("submission.csv", index=True)



Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[176]	valid_0's binary_logloss: 0.150616
Fold: 0 Accuracy Score: 0.9395504676811391




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[207]	valid_0's binary_logloss: 0.148239
Fold: 1 Accuracy Score: 0.9416445623342176




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[183]	valid_0's binary_logloss: 0.152246
Fold: 2 Accuracy Score: 0.9398296803015497




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[175]	valid_0's binary_logloss: 0.153511
Fold: 3 Accuracy Score: 0.9359207036158035




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[210]	valid_0's binary_logloss: 0.140979
Fold: 4 Accuracy Score: 0.9441574759179114




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[167]	valid_0's binary_logloss: 0.152438
Fold: 5 Accuracy Score: 0.9389222392852157




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[210]	valid_0's binary_logloss: 0.149428
Fold: 6 Accuracy Score: 0.9412914485165794




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[199]	valid_0's binary_logloss: 0.149834
Fold: 7 Accuracy Score: 0.9383595113438046




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[214]	valid_0's binary_logloss: 0.145931
Fold: 8 Accuracy Score: 0.9390575916230367




Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[166]	valid_0's binary_logloss: 0.150677
Fold: 9 Accuracy Score: 0.9384293193717278
Mean Accuracy Score: 0.9397162999990986
