In [221]:
# import modules
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import (PowerTransformer, 
                                   LabelEncoder)
from sklearn.model_selection import train_test_split

In [222]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_path, test_path = (
    '/kaggle/input/playground-series-s3e26/train.csv',
    '/kaggle/input/playground-series-s3e26/test.csv',
)

In [223]:
# Load the training split from disk and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer=train_path)
train_df.head(n=5)

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [224]:
class GetDummies:
    """Small one-hot encoder that remembers the categories it was fitted on.

    Unlike ``pd.get_dummies``, the set of output columns is fixed at ``fit``
    time, so encoding a second dataset yields exactly the same columns in the
    same order.

    Attributes:
        columns: categories learned by ``fit`` (``series.unique()``), or
            ``None`` while the encoder is unfitted.
        fillna_value: value written into every output column for an input
            value that matches none of the learned categories (default
            ``None``; change it with ``set_fillna``).
        name: prefix of the generated column names (``f'{name}_{category}'``).
        first_fit: ``True`` once ``fit`` has been called.
    """

    def __init__(self, name='category', fit=None):
        """Create the encoder and optionally fit it right away.

        Args:
            name: prefix used for the generated column names.
            fit: optional Series to fit on immediately.
        """
        self.columns = None
        self.fillna_value = None
        self.name = name
        self.first_fit = False

        if fit is not None:
            self.fit(fit)  # also flips ``first_fit`` to True

    def fit(self, series):
        """Learn the categories as the unique values of ``series``."""
        self.columns = series.unique()
        self.first_fit = True

    def set_fillna(self, v):
        """Set the value emitted for inputs that match no learned category."""
        self.fillna_value = v

    def _require_fitted(self):
        """Raise a clear error when the encoder is used before ``fit``."""
        if self.columns is None:
            raise RuntimeError(
                'GetDummies must be fitted before it can transform data'
            )

    def lst_transform(self, data):
        """Encode an iterable of values as a list of one-hot rows.

        Args:
            data: iterable of raw category values.

        Returns:
            One list per input value, each ``len(self.columns)`` long: a single
            ``1`` at the position of the first category equal (``==``) to the
            value, or ``fillna_value`` repeated when nothing matches.

        Raises:
            RuntimeError: if the encoder has not been fitted.
        """
        self._require_fitted()
        width = len(self.columns)
        encoded = []

        for value in data:
            # Plain ``==`` comparison: a NaN input never matches (NaN != NaN)
            # and therefore falls through to the fill row.
            hit = next(
                (pos for pos, category in enumerate(self.columns)
                 if value == category),
                None,
            )
            if hit is None:
                encoded.append([self.fillna_value] * width)
            else:
                row = [0] * width
                row[hit] = 1
                encoded.append(row)
        return encoded

    def transform(self, data):
        """Encode ``data`` into a DataFrame with one column per category.

        Args:
            data: iterable of raw category values.

        Returns:
            DataFrame with columns ``f'{name}_{category}'`` (in fitted order)
            and a default ``RangeIndex``.

        Raises:
            RuntimeError: if the encoder has not been fitted.
        """
        rows = self.lst_transform(data)
        names = [f'{self.name}_{col}' for col in self.columns]

        # Built column-by-column so an empty input still yields every column.
        return pd.DataFrame(
            {label: [row[pos] for row in rows] for pos, label in enumerate(names)}
        )

    def __call__(self, data):
        """Encode a Series, fitting on it first if the encoder is still unfitted.

        The result carries ``data.index`` so it can be concatenated with the
        originating frame along ``axis=1`` without misaligning rows.
        """
        if not self.first_fit:
            self.fit(data)

        encoded = self.transform(data.to_list())
        encoded.index = data.index
        return encoded

In [225]:
# Columns that false_true_cols_ collapses into 0/1 flags.
false_true_cols = [
    "Sex",
    "Ascites",
    "Spiders",
    "Edema",
    "Hepatomegaly",
]

# One-hot encoder for the 'Drug' column, fitted on the training data.
drug_enc = GetDummies(name="drug")
drug_enc.fit(train_df['Drug'])

def false_true_cols_(df) -> pd.DataFrame:
    """Replace every column listed in ``false_true_cols`` with a 0/1 flag.

    'Sex' becomes 1 for 'F' and 0 otherwise; every other listed column becomes
    1 for 'N' and 0 otherwise. ``df`` is modified in place and returned.
    """
    for column in false_true_cols:
        flagged_value = 'F' if column == 'Sex' else 'N'
        df[column] = df[column].map(
            lambda entry: 1 if entry == flagged_value else 0
        )
    return df

def preprocess_y(df):
    """Encode the status labels as integer class ids: 'D'→0, 'C'→1, 'CL'→2.

    A label outside that mapping raises ``KeyError``.
    """
    class_ids = dict(D=0, C=1, CL=2)

    def encode(label):
        return class_ids[label]

    return df.map(encode)

def categorical_cols_(df) -> pd.DataFrame:
    """Replace the 'Drug' column with its one-hot encoding from ``drug_enc``.

    Args:
        df: frame containing a 'Drug' column; it is not modified.

    Returns:
        A new frame with the dummy columns first, followed by the remaining
        columns of ``df``, row-aligned with ``df``.
    """
    drug_dummies = drug_enc(df['Drug'])
    # The encoder emits one row per input row; give it df's index so the
    # column-wise concat pairs rows by position instead of introducing NaNs
    # when df does not have a default RangeIndex.
    drug_dummies.index = df.index

    remaining = df.drop('Drug', axis=1)
    return pd.concat([drug_dummies, remaining], axis=1)


# The function was originally defined under this misspelled name while
# preprocess_x calls ``categorical_cols_``; keep the old name as an alias.
caabstegorical_cols_ = categorical_cols_

def transform_numeric_cols_(df) -> pd.DataFrame:
    """Placeholder for numeric feature transforms; currently returns ``df`` untouched."""
    return df

def preprocess_x(df) -> pd.DataFrame:
    """Turn a raw feature frame into model inputs without touching ``df``.

    Works on a copy: drops 'id', then applies the numeric, 0/1-flag and
    categorical steps in that order.
    """
    return (
        df.copy()
        .drop(columns='id')
        .pipe(transform_numeric_cols_)
        .pipe(false_true_cols_)
        .pipe(categorical_cols_)
    )

In [229]:
# Feature matrix (everything except the target) and integer-encoded target.
X = preprocess_x(train_df.drop(['Status'], axis=1))
y = preprocess_y(train_df['Status'])

# 80/20 hold-out split. A fixed seed makes the split reproducible across
# kernel restarts; stratifying on y keeps the class proportions the same in
# the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size = 0.8,
    random_state = 42,
    stratify = y,
)

In [230]:
from tensorflow.data import Dataset
import tensorflow.keras.layers as l
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model
import tensorflow as tf

In [231]:
def build_dataset(X, y, batch_size = 16, shuffle = True, prefetch = True,
                  shuffle_buffer = None):
    """Wrap features and targets in a batched ``tf.data`` pipeline.

    Samples are shuffled *before* batching so that batch composition changes
    on every pass over the data. (Shuffling after ``batch`` only reorders whole
    batches, and with a small buffer only within a short window.)

    Args:
        X: features, sliced along the first axis.
        y: targets, same length as ``X``.
        batch_size: number of samples per batch.
        shuffle: whether to shuffle the samples.
        prefetch: whether to prefetch batches with ``tf.data.AUTOTUNE``.
        shuffle_buffer: shuffle buffer size; defaults to ``len(X)`` so the
            whole dataset is shuffled uniformly.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, targets)`` batches.
    """
    dataset = Dataset.from_tensor_slices((X, y))

    if shuffle:
        # buffer_size must be at least 1, even for an empty input
        buffer_size = shuffle_buffer if shuffle_buffer is not None else max(len(X), 1)
        dataset = dataset.shuffle(buffer_size)

    dataset = dataset.batch(batch_size)

    if prefetch:
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [232]:
def build_model(input_shape, output_shape, hidden_activation = 'relu'):
    """Build a small fully-connected softmax classifier.

    Architecture: input → Dropout(0.3) → Dense(128) → Dense(64) →
    Dense(output_shape, softmax).

    Args:
        input_shape: feature shape excluding the batch axis; a bare integer
            ``n`` is accepted and treated as ``(n,)``.
        output_shape: number of classes (units of the softmax layer).
        hidden_activation: activation of the two hidden layers. Defaults to
            'relu' because stacked 'linear' layers compose into a single
            affine map, which makes the hidden layers redundant; pass
            'linear' to get that behaviour explicitly.

    Returns:
        An uncompiled ``Model`` mapping inputs to class probabilities.
    """
    # Keras expects an iterable shape; wrap a scalar feature count.
    if not hasattr(input_shape, '__iter__'):
        input_shape = (int(input_shape),)

    inputs = l.Input(input_shape, name='input')

    x = l.Dropout(0.3)(inputs)

    x = l.Dense(128, activation = hidden_activation, name='hidden')(x)
    x = l.Dense(64, activation = hidden_activation, name='hidden1')(x)

    outputs = l.Dense(output_shape, activation = 'softmax', name='output')(x)

    return Model(inputs, outputs)

In [233]:
# Training batches are shuffled; validation only measures metrics, so its
# order is left untouched.
train_dataset = build_dataset(X_train, y_train, batch_size = 8)
val_dataset = build_dataset(X_val, y_val, batch_size = 16, shuffle = False)

In [234]:
# Instantiate the network (one input per feature column, three output classes)
# and attach optimizer, loss and metric.
model = build_model(input_shape = X_train.shape[-1], output_shape = 3)
model.compile(
    optimizer = Adam(learning_rate = 1e-4),
    loss = SparseCategoricalCrossentropy(),
    metrics = ['accuracy'],
)

# Stop once the validation loss has not improved by at least 1e-5 for 60
# epochs, then roll back to the best weights seen.
es = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    min_delta = 1e-5,
    patience = 60,
    restore_best_weights = True,
)

In [235]:
# Train for at most 200 epochs; the early-stopping callback may end the run sooner.
model.fit(
    x = train_dataset,
    validation_data = val_dataset,
    epochs = 200,
    callbacks = [es],
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x786404631d20>

In [240]:
# Load the test split and run it through the same feature pipeline as training.
test_df = pd.read_csv(filepath_or_buffer=test_path)
test_X = preprocess_x(df=test_df)

In [241]:
prediction = model.predict(test_X)

# Output columns follow the class ids assigned by preprocess_y:
# 0 → 'D', 1 → 'C', 2 → 'CL'.
d_status, c_status, cl_status = (prediction[:, class_id] for class_id in range(3))

df_data = dict(
    Status_C = c_status,
    Status_CL = cl_status,
    Status_D = d_status,
)



In [242]:
# One row per test id, followed by the predicted probability of each status.
submission = pd.DataFrame(dict(id = test_df['id'], **df_data))

In [243]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)