In [28]:
# import modules
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import (PowerTransformer, 
                                   LabelEncoder)
from sklearn.model_selection import train_test_split

In [29]:
# Input locations of the Kaggle "playground-series-s3e26" CSV files.
# train_path is read into train_df and test_path into test_df further down.
train_path = '/kaggle/input/playground-series-s3e26/train.csv'
test_path = '/kaggle/input/playground-series-s3e26/test.csv'

In [30]:
# Load the training CSV; train_df is the source for both the features and
# the 'Status' target used by the later cells.
train_df = pd.read_csv(train_path)
# Bare last expression so the notebook renders the first rows as a table.
train_df.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [31]:
class GetDummies:
    """One-hot encoder that remembers the categories it was fitted on.

    Unlike ``pd.get_dummies``, the set and order of output columns is fixed
    at fit time, so the same encoder can be applied to another dataset and
    still produce identical columns. Values that were not seen during
    fitting produce a row filled with ``fillna_value`` (``None`` by default).

    Attributes:
        columns: Array of categories learned by ``fit`` (``None`` until fitted).
        fillna_value: Filler used for every cell of a row whose value is unknown.
        name: Prefix of the generated column names (``f'{name}_{category}'``).
        first_fit: ``True`` once the encoder has been fitted.
    """

    def __init__(self, name='category', fit=None):
        """Create the encoder, optionally fitting it straight away.

        Args:
            name: Prefix for the generated column names.
            fit: Optional pandas Series to fit on immediately.
        """
        self.columns = None
        self.fillna_value = None
        self.name = name
        # fit() flips this flag, so it only needs a default here.
        self.first_fit = False

        if fit is not None:
            self.fit(fit)

    def fit(self, series):
        """Learn the categories as the unique values of ``series``, in order of appearance."""
        self.columns = series.unique()
        self.first_fit = True

    def set_fillna(self, v):
        """Set the value written into every cell of a row whose category is unknown."""
        self.fillna_value = v

    def lst_transform(self, data):
        """Encode an iterable of values as a list of one-hot rows.

        Args:
            data: Iterable of raw category values.

        Returns:
            A list with one list per input value, each as long as
            ``self.columns``. A known value yields zeros with a single 1 at
            its category position; an unknown value yields a row made
            entirely of ``self.fillna_value``.
        """
        width = len(self.columns)
        result = []

        for val in data:
            row = [0] * width
            for i, col in enumerate(self.columns):
                if val == col:
                    row[i] = 1
                    break
            else:
                # No category matched: mark the whole row with the filler.
                row = [self.fillna_value] * width
            result.append(row)

        return result

    def transform(self, data, index=None):
        """Encode ``data`` into a DataFrame of one-hot columns.

        Args:
            data: Iterable of raw category values.
            index: Optional row index for the result. When omitted the
                result gets a default RangeIndex, as before.

        Returns:
            DataFrame with one ``f'{name}_{category}'`` column per fitted
            category and one row per input value.
        """
        rows = self.lst_transform(data)

        df_data = {
            f'{self.name}_{col}': [row[i] for row in rows]
            for i, col in enumerate(self.columns)
        }

        return pd.DataFrame(df_data, index=index)

    def __call__(self, data):
        """Encode a pandas Series, fitting on it first if not fitted yet.

        The result carries the index of ``data`` so that it can be joined
        back to the originating frame with ``pd.concat(..., axis=1)``
        without misaligning rows when that index is not a plain RangeIndex.
        """
        if not self.first_fit:
            self.fit(data)

        return self.transform(data.to_list(), index=data.index)

In [44]:
# Categorical columns that false_true_cols_ collapses to 0/1.
false_true_cols = ['Sex', 'Ascites', 'Spiders', 'Edema', 'Hepatomegaly']

# One-hot encoders fitted once on the training frame, so later calls
# (including on the test set) emit the same columns in the same order.
drug_enc = GetDummies(name='drug', fit=train_df['Drug'])
status_enc = GetDummies(name='status', fit=train_df['Status'])

def false_true_cols_(df) -> pd.DataFrame:
    """Replace each column listed in ``false_true_cols`` with a 0/1 flag.

    'Sex' becomes 1 where the value is 'F'; every other listed column
    becomes 1 where the value is 'N'. Anything else maps to 0. The frame is
    modified in place and also returned.
    """
    for column in false_true_cols:
        # Only 'Sex' is keyed on 'F'; the remaining flags are keyed on 'N'.
        one_label = 'F' if column == 'Sex' else 'N'
        df[column] = df[column].apply(
            lambda value, one_label=one_label: 1 if value == one_label else 0
        )
    return df

def preprocess_y(df):
    """One-hot encode the target values with the module-level ``status_enc``."""
    encoded_status = status_enc(df)
    return encoded_status

def categorical_cols_(df) -> pd.DataFrame:
    """Swap the 'Drug' column for its one-hot columns from ``drug_enc``.

    The encoded columns are placed in front of the remaining columns.
    """
    encoded_drug = drug_enc(df['Drug'])
    remaining = df.drop(columns='Drug')
    return pd.concat([encoded_drug, remaining], axis=1)

def transform_numeric_cols_(df) -> pd.DataFrame:
    """Placeholder for numeric feature transforms; currently returns ``df`` untouched."""
    return df

def preprocess_x(df) -> pd.DataFrame:
    """Turn a raw feature frame into model-ready features.

    Works on a copy of ``df``: drops the 'id' column, then applies the
    numeric, flag and categorical steps in that order.
    """
    features = df.copy().drop('id', axis=1)

    # Order matters: each step receives the output of the previous one.
    for step in (transform_numeric_cols_, false_true_cols_, categorical_cols_):
        features = step(features)

    return features

In [45]:
# Build the feature matrix and the one-hot encoded target from the training frame.
X = preprocess_x(train_df.drop(['Status'], axis=1))
y = preprocess_y(train_df['Status'])

# Hold out 20% of the rows for validation. The seed is fixed so the split,
# and every metric computed on it, is the same on each Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [46]:
X_train.head()

Unnamed: 0,drug_D-penicillamine,drug_Placebo,N_Days,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
7384,1,0,2456,18499,1,1,0,1,1,0.6,360.0,3.9,52.0,1257.0,150.35,188.0,178.0,9.9,4.0
388,0,1,1542,25569,1,1,1,1,1,3.4,212.0,3.76,444.0,1065.0,120.9,77.0,216.0,11.0,4.0
6362,0,1,930,12120,1,1,0,1,1,2.2,518.0,3.77,77.0,2520.0,92.0,114.0,309.0,9.5,4.0
3361,0,1,1444,19002,1,1,0,0,1,5.2,1128.0,3.68,53.0,3228.0,165.85,166.0,421.0,9.9,3.0
2984,0,1,943,19098,1,1,0,0,0,28.0,382.0,3.26,111.0,4032.0,201.5,171.0,335.0,10.0,3.0


In [65]:
# NOTE(review): `xgb` is not imported in any cell visible in this notebook;
# presumably `import xgboost as xgb` was run in a cell that is no longer
# here. Confirm and add it to the imports cell so a fresh kernel can run this.

# Hyper-parameters handed to the classifier constructor below.
params = { 
    'max_depth': 7,
    'learning_rate': 0.1, 
    'early_stopping_rounds': 50
}

# Initialize the XGBoost classifier with the specified parameters
xgb_classifier = xgb.XGBClassifier(**params)
# The held-out split is passed as eval_set; the per-round log printed below
# reports the logloss on it as validation_0.
xgb_classifier.fit(X_train, y_train, eval_set = [(X_val, y_val)])

[0]	validation_0-logloss:0.58279
[1]	validation_0-logloss:0.53963
[2]	validation_0-logloss:0.50509
[3]	validation_0-logloss:0.47631
[4]	validation_0-logloss:0.45169
[5]	validation_0-logloss:0.43051
[6]	validation_0-logloss:0.41307
[7]	validation_0-logloss:0.39835
[8]	validation_0-logloss:0.38510
[9]	validation_0-logloss:0.37406
[10]	validation_0-logloss:0.36469
[11]	validation_0-logloss:0.35554
[12]	validation_0-logloss:0.34795
[13]	validation_0-logloss:0.34077
[14]	validation_0-logloss:0.33489
[15]	validation_0-logloss:0.32968
[16]	validation_0-logloss:0.32523
[17]	validation_0-logloss:0.32117
[18]	validation_0-logloss:0.31793
[19]	validation_0-logloss:0.31480
[20]	validation_0-logloss:0.31164
[21]	validation_0-logloss:0.30936
[22]	validation_0-logloss:0.30713
[23]	validation_0-logloss:0.30557
[24]	validation_0-logloss:0.30389
[25]	validation_0-logloss:0.30213
[26]	validation_0-logloss:0.30100
[27]	validation_0-logloss:0.30019
[28]	validation_0-logloss:0.29899
[29]	validation_0-loglos

In [68]:
# Load the test set and run it through the same feature pipeline as the
# training data (preprocess_x reuses drug_enc, which was fitted on train_df).
test_df = pd.read_csv(test_path)
test_X = preprocess_x(test_df)
# Probabilities from the fitted classifier; consumed by the submission cell below.
prediction = xgb_classifier.predict_proba(test_X)

In [75]:
name = 'Status'
# Take the class order from the fitted target encoder instead of hard-coding
# it: the order of the probability columns follows the order in which the
# statuses first appear in the training data, so a literal list would
# silently mislabel the columns if that data changed.
columns = list(status_enc.columns)

# One submission column per class; transposing walks the prediction
# matrix column by column.
df_data = {
    f'{name}_{col}': probabilities
    for col, probabilities in zip(columns, np.asarray(prediction).T)
}

submission = pd.DataFrame({'id': test_df['id'], **df_data})

In [77]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv('submission.csv', index=False)