In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

In [2]:
# Read the competition train/test tables, using each file's "id" column as the row index.
train = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/train.csv",
    index_col="id",
)
test = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/test.csv",
    index_col="id",
)

In [3]:
# Split the training table into the feature matrix and the "class" target.
X = train.drop(columns="class")
y = train["class"]

# Every object-dtype feature is treated as categorical.
cat_cols = X.select_dtypes(include="object").columns.tolist()
X = X.astype({col: "category" for col in cat_cols})

# Cast test with the categorical dtypes learned from train rather than letting
# pandas infer a separate category set: this keeps the label -> integer-code
# mapping identical in both frames. Labels that occur only in test become NaN.
test = test.astype({col: X[col].dtype for col in cat_cols})

In [4]:
# Stratified K-fold cross-validation of a baseline LightGBM classifier.
# For every fold: fit on the training part, score the held-out part with the
# Matthews correlation coefficient, and store that fold's predictions on `test`.
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=0)

base_params = dict(random_state=42, verbose=0, early_stopping_round=50)

scores, test_predictions = [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # NOTE(review): the held-out fold is passed as eval_set (used by early
    # stopping) and is also the fold scored below, so the reported score may
    # be slightly optimistic -- confirm this is acceptable.
    base_model = lgb.LGBMClassifier(**base_params)
    base_model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        categorical_feature=cat_cols,
    )

    score = matthews_corrcoef(y_val, base_model.predict(X_val))
    scores.append(score)

    print(f"LGBM Base Score Fold {fold+1}:", score)

    # One array of predicted class labels per fold; combined in a later cell.
    test_predictions.append(base_model.predict(test))

print("LGBM Base Average Score:", np.mean(scores))
                

LGBM Base Score Fold 1: 0.9805006397368331
LGBM Base Score Fold 2: 0.9805379563539303
LGBM Base Score Fold 3: 0.9810287327353373
LGBM Base Score Fold 4: 0.9806241276593243
LGBM Base Score Fold 5: 0.980753927998497
LGBM Base Average Score: 0.9806890768967843


In [5]:
# One column of predicted labels per CV fold, one row per test sample.
# Column labels are derived from the number of collected prediction arrays so
# this cell keeps working when the number of folds is changed.
fold_labels = [f"Fold{i}" for i in range(1, len(test_predictions) + 1)]
test_results = pd.DataFrame(np.array(test_predictions).T, columns=fold_labels)
test_results.head()

Unnamed: 0,Fold1,Fold2,Fold3,Fold4,Fold5
0,e,e,e,e,e
1,p,p,p,p,p
2,p,p,p,p,p
3,p,p,p,p,p
4,e,e,e,e,e


In [6]:
# Build the submission: for each test row take the most frequent label across
# the per-fold predictions and write it into the sample submission's "class"
# column by position.
submission = pd.read_csv("/kaggle/input/playground-series-s4e8/sample_submission.csv")

# mode(axis=1) returns the row-wise most common value(s); column 0 is the first mode.
majority_vote = test_results.mode(axis=1)[0]
submission["class"] = majority_vote.to_numpy()

submission.to_csv("submission.csv", index=False)