In [35]:
# import modules
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import (PowerTransformer, 
                                   LabelEncoder)
from sklearn.model_selection import train_test_split

In [36]:
# init global variables
train_path = '/kaggle/input/playground-series-s3e26/train.csv'
test_path = '/kaggle/input/playground-series-s3e26/test.csv'

In [37]:
# read datasets
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [38]:
true_false_cols = ["Sex", "Ascites", "Spiders", "Edema", "Hepatomegaly"]

def transform_numeric_cols_(df) -> pd.DataFrame:
    ...
    return df

def preprocess_y(df) -> pd.DataFrame: 
    ...
    return df 

def preprocess_x(df) -> pd.DataFrame: 
    df = df.copy()
    
    df = df.drop('id', axis=1)
    df = transform_numeric_cols_(df)    
    return df

In [39]:
X = preprocess_x(train_df.drop(['Status'], axis=1))
y = preprocess_y(train_df['Status'])

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.7)

In [40]:
from catboost import Pool, CatBoostClassifier

train_pool = Pool(X_train, y_train, cat_features = [*true_false_cols, 'Drug'])
val_pool = Pool(X_val, y_val, cat_features = [*true_false_cols, 'Drug'])

In [41]:
# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=1000,
                           random_seed = 42,
                           learning_rate=0.1,
                           max_depth=7, 
                           early_stopping_rounds = 50, 
                           loss_function='MultiClass')

# Fit model
model.fit(train_pool, 
          eval_set = val_pool, 
          use_best_model=True, 
#           plot=True
         )

0:	learn: 0.9975312	test: 0.9995745	best: 0.9995745 (0)	total: 20.2ms	remaining: 20.2s
1:	learn: 0.9222518	test: 0.9260100	best: 0.9260100 (1)	total: 41.7ms	remaining: 20.8s
2:	learn: 0.8598307	test: 0.8653612	best: 0.8653612 (2)	total: 59.8ms	remaining: 19.9s
3:	learn: 0.8081978	test: 0.8163878	best: 0.8163878 (3)	total: 77.1ms	remaining: 19.2s
4:	learn: 0.7642824	test: 0.7747363	best: 0.7747363 (4)	total: 92.4ms	remaining: 18.4s
5:	learn: 0.7270139	test: 0.7388073	best: 0.7388073 (5)	total: 108ms	remaining: 18s
6:	learn: 0.6957244	test: 0.7087766	best: 0.7087766 (6)	total: 124ms	remaining: 17.6s
7:	learn: 0.6701924	test: 0.6847397	best: 0.6847397 (7)	total: 140ms	remaining: 17.3s
8:	learn: 0.6462525	test: 0.6616435	best: 0.6616435 (8)	total: 154ms	remaining: 17s
9:	learn: 0.6247271	test: 0.6417182	best: 0.6417182 (9)	total: 170ms	remaining: 16.9s
10:	learn: 0.6058788	test: 0.6249216	best: 0.6249216 (10)	total: 186ms	remaining: 16.7s
11:	learn: 0.5901858	test: 0.6103825	best: 0.610382

<catboost.core.CatBoostClassifier at 0x7a5bc5e72410>

In [48]:
test_df = pd.read_csv(test_path)
test_X = preprocess_x(test_df)
test_pool =  Pool(test_X, cat_features = [*true_false_cols, 'Drug'])
prediction = model.predict_proba(test_pool)
prediction

array([[0.70055911, 0.01173776, 0.28770313],
       [0.72215981, 0.14539623, 0.13244396],
       [0.04847625, 0.07674251, 0.87478124],
       ...,
       [0.88703861, 0.01187512, 0.10108627],
       [0.97915342, 0.01423346, 0.00661312],
       [0.39756156, 0.02115952, 0.58127892]])

In [46]:
prediction

array([['C'],
       ['C'],
       ['D'],
       ...,
       ['C'],
       ['C'],
       ['D']], dtype=object)

In [49]:
name = 'Status'
columns = ['C', 'CL', 'D']

df_data = {f'{name}_{col}': [] for col in columns}

for tr in prediction:
    for col, val in zip(columns, tr):
        df_data[f'{name}_{col}'].append(val)

submission = pd.DataFrame({'id': test_df['id'], **df_data})

In [50]:
submission.to_csv('submission.csv', index=False)