In [1]:
import numpy as np
import pandas as pd
import catboost as cat
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from scipy.stats import mode

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e8/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e8/test.csv")

In [3]:
train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [4]:
for col in train.select_dtypes(include = "object"):
    print(col, train[col].nunique())

class 2
cap-shape 74
cap-surface 83
cap-color 78
does-bruise-or-bleed 26
gill-attachment 78
gill-spacing 48
gill-color 63
stem-root 38
stem-surface 60
stem-color 59
veil-type 22
veil-color 24
has-ring 23
ring-type 40
spore-print-color 32
habitat 52
season 4


In [5]:
train.isnull().sum()

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [6]:
train["class"].value_counts()

class
p    1705396
e    1411549
Name: count, dtype: int64

In [7]:
trainna = train.fillna("None")
testna = test.fillna("None")

X = trainna.drop("class",axis = 1)
y = trainna["class"]

cat_cols = list(X.select_dtypes(include = "object").columns)
X[cat_cols] = X[cat_cols].astype("string")
testna[cat_cols] = testna[cat_cols].astype("string")

In [8]:
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS,random_state = 0, shuffle = True)

base_params = {"random_state":42,
              "task_type": "GPU",
              "verbose":0,
              "early_stopping_rounds":50}
scores = []
test_predictions = []

for fold,(train_idx,val_idx) in enumerate(skf.split(X,y)):
    X_train,X_val = X.iloc[train_idx],X.iloc[val_idx]
    y_train,y_val = y.iloc[train_idx],y.iloc[val_idx]

    trainDF = cat.Pool(X_train,y_train,cat_features = cat_cols)
    valDF = cat.Pool(X_val,y_val,cat_features = cat_cols)

    base_model = cat.CatBoostClassifier(**base_params)
    base_model.fit(trainDF,
                   eval_set= [valDF])

    preds = base_model.predict(valDF)
    score = matthews_corrcoef(y_val,preds)
    scores.append(score)
    
    print(f"CatBoost Base Score Fold {fold+1}:", score)

    test_preds = base_model.predict(testna)
    test_predictions.append(test_preds)

print("CatBoost Base Average Score:", np.mean(scores))
                


CatBoost Base Score Fold 1: 0.9827079115364145
CatBoost Base Score Fold 2: 0.9826580043180656
CatBoost Base Score Fold 3: 0.9830617399747371
CatBoost Base Score Fold 4: 0.9829314432631654
CatBoost Base Score Fold 5: 0.9829494128607906
CatBoost Base Average Score: 0.9828617023906346


In [9]:
test_results = pd.DataFrame(np.array(test_predictions).T,columns = ["Fold1","Fold2","Fold3","Fold4","Fold5"])
test_results.head()

Unnamed: 0,Fold1,Fold2,Fold3,Fold4,Fold5
0,e,e,e,e,e
1,p,p,p,p,p
2,p,p,p,p,p
3,p,p,p,p,p
4,e,e,e,e,e


In [10]:
submission = pd.read_csv("/kaggle/input/playground-series-s4e8/sample_submission.csv")
submission["class"] = test_results.mode(axis = 1)[0].values
submission.to_csv("submission.csv", index=False)