In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv",index_col = "id")
test = pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv",index_col = "id")

In [3]:
X = train.drop("class",axis = 1)
y = train["class"]

#--------
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = pd.Series(le.fit_transform(y))
#--------

cat_cols = list(X.select_dtypes(include = "object").columns)
X[cat_cols] = X[cat_cols].astype("category")
test[cat_cols] = test[cat_cols].astype("category")

In [None]:
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS,random_state = 0, shuffle = True)

base_params = {"random_state":42,
               "objective": "binary:logistic",
              "verbosity": 0,
               "enable_categorical": True
              }
scores = []
test_predictions = []

for fold,(train_idx,val_idx) in enumerate(skf.split(X,y)):
    X_train,X_val = X.iloc[train_idx],X.iloc[val_idx]
    y_train,y_val = y.iloc[train_idx],y.iloc[val_idx]

    dtrain = xgb.DMatrix(data = X_train, label = y_train,enable_categorical = True)
    dval = xgb.DMatrix(data = X_val, label = y_val,enable_categorical = True)

    base_model = xgb.train(base_params,
                          dtrain,
                          num_boost_round = 1000,
                          evals = [(dval, 'validation')],
                           verbose_eval = False,
                       
                          early_stopping_rounds = 50,
                          # callbacks= [xgb.callback.EvaluationMonitor(rank = 0,period = 100)]
                          )
                           

    preds = np.round(base_model.predict(dval))
    score = matthews_corrcoef(y_val,preds)
    scores.append(score)
    
    print(f"XGB Base Score Fold {fold+1}:", score)

    test_dmatrix = xgb.DMatrix(data=test, enable_categorical=True)
    test_preds = base_model.predict(test_dmatrix) #should be np.round(base_model.predict(test_dmatrix))
    test_predictions.append(test_preds)

print("XGB Base Average Score:", np.mean(scores))

XGB Base Score Fold 1: 0.9842329418307897
XGB Base Score Fold 2: 0.9841183883719626
XGB Base Score Fold 3: 0.9844344847206048
XGB Base Score Fold 4: 0.9840393760018963
XGB Base Score Fold 5: 0.9842480252523854
XGB Base Average Score: 0.9842146432355279


In [5]:
test_results = pd.DataFrame(np.array(test_predictions).T,columns = ["Fold1","Fold2","Fold3","Fold4","Fold5"])
test_results.head()

Unnamed: 0,Fold1,Fold2,Fold3,Fold4,Fold5
0,0.901372,0.114438,0.328848,0.649806,0.151351
1,0.390922,0.587641,0.591598,0.783078,0.984415
2,0.515817,0.020015,0.251042,0.320157,0.334768
3,0.97414,0.983644,0.996471,0.9993,0.996139
4,0.203687,0.5077,0.775405,0.254841,0.227267


In [6]:
submission = pd.read_csv("/kaggle/input/playground-series-s4e8/sample_submission.csv")
submission["class"] = le.inverse_transform(test_results.mode(axis = 1)[0].values.astype(int))
submission.to_csv("submission.csv", index=False)