In [1]:
import pandas as pd
import numpy as np
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import math

In [None]:
class TabClassifier():
    """Load a tabular dataset from an Excel file and wrap a ``TabNetClassifier``.

    On construction the class reads the file, keeps the target plus the listed
    feature columns, randomly tags every row as ``train``/``valid``/``test``,
    label-encodes the categorical columns and instantiates the model.
    ``Fit`` then trains the model on the ``train`` rows.

    Attributes set by ``__init__``:
        file_path, target           -- the constructor arguments, stored as given.
        cat_columns, num_columns,
        bin_columns                 -- copies of the corresponding column lists.
        columns                     -- cat + num + bin columns, in that order.
        train_indices, valid_indices,
        test_indices                -- row indices of each random subset.
        features                    -- feature column names, in frame order.
        cat_idxs                    -- positions of categorical columns in ``features``.
        cat_dims                    -- number of classes per categorical feature.
        cat_emb_dim                 -- ``sqrt(cat_dims)`` per categorical feature.
        data                        -- the prepared DataFrame (includes a ``Set`` column).
        model                       -- ``TabNetClassifier(**MODEL_PARAMS)``.
    """

    def __init__(self, FILE_PATH: str = None, DATA_PARTS: list = [0.8, 0.1, 0.1],
                 DROP_COLUMNS: list = [], CAT_COLUMNS: list = [], NUM_COLUMNS: list = [],
                 BIN_COLUMNS: list = [], TARGET: str = None, FILL_NA: bool = False,
                 MODEL_PARAMS: dict = None):
        """Read the dataset, split it, encode categoricals and build the model.

        Args:
            FILE_PATH: path of the Excel file passed to ``pd.read_excel``.
            DATA_PARTS: probabilities of a row being tagged train, valid and
                test (in that order); passed as ``p`` to ``np.random.choice``.
            DROP_COLUMNS: columns removed right after loading.
            CAT_COLUMNS: categorical feature columns (label-encoded in place).
            NUM_COLUMNS: numerical feature columns.
            BIN_COLUMNS: binary feature columns.
            TARGET: name of the target column.
            FILL_NA: reserved flag; missing-value handling is not implemented.
            MODEL_PARAMS: keyword arguments for ``TabNetClassifier``.

        Raises:
            ValueError: if ``FILE_PATH``, ``TARGET`` or ``MODEL_PARAMS`` is
                ``None``, or if no feature column is listed at all.
        """
        # Validate the cheap arguments first so a bad call fails before any I/O.
        if FILE_PATH is None:
            raise ValueError("FILE_PATH must be not None")
        if TARGET is None:
            raise ValueError("TARGET must be not None")
        if MODEL_PARAMS is None:
            raise ValueError("You should provide NN parameters via MODEL_PARAMS")

        self.file_path = FILE_PATH
        self.target = TARGET

        # Copy the lists: the defaults are shared mutable objects, and callers'
        # lists should not be aliased by the instance either.
        self.cat_columns = list(CAT_COLUMNS)
        self.num_columns = list(NUM_COLUMNS)
        self.bin_columns = list(BIN_COLUMNS)
        # NOTE(review): the feature set is taken to be every listed column
        # (categorical + numerical + binary) -- confirm this is the intent.
        self.columns = self.cat_columns + self.num_columns + self.bin_columns
        if not self.columns:
            raise ValueError("At least one of CAT_COLUMNS, NUM_COLUMNS, BIN_COLUMNS must be non-empty")

        dataset = pd.read_excel(self.file_path)
        dataset = dataset.drop(columns=list(DROP_COLUMNS))

        # reset_index gives an independent frame (safe to mutate below) whose
        # index labels equal row positions, which Fit relies on when it indexes
        # the underlying numpy arrays.
        dataset = dataset[[self.target] + self.columns].reset_index(drop=True)

        # Random row-wise split; uses the global numpy RNG, so call
        # np.random.seed(...) beforehand for a reproducible split.
        dataset['Set'] = np.random.choice(["train", "valid", "test"], p=DATA_PARTS, size=(dataset.shape[0],))
        self.train_indices = dataset[dataset.Set == "train"].index
        self.valid_indices = dataset[dataset.Set == "valid"].index
        self.test_indices = dataset[dataset.Set == "test"].index

        if FILL_NA:
            # TODO: add code here to handle missing data
            pass

        # Label-encode every categorical column and remember its cardinality.
        dims_by_column = {}
        for col in self.cat_columns:
            print(col, dataset[col].nunique())
            l_enc = LabelEncoder()
            dataset[col] = l_enc.fit_transform(dataset[col].values)
            dims_by_column[col] = len(l_enc.classes_)

        unused_feat = ['Set', self.target]
        self.features = [col for col in dataset.columns if col not in unused_feat]
        self.cat_idxs = [i for i, f in enumerate(self.features) if f in self.cat_columns]
        self.cat_dims = [dims_by_column[f] for f in self.features if f in self.cat_columns]
        # NOTE(review): these are floats; TabNet's `cat_emb_dim` expects
        # integers, so round them before passing them on -- rounding rule TBD.
        self.cat_emb_dim = [math.sqrt(val) for val in self.cat_dims]

        self.data = dataset
        self.model = TabNetClassifier(**MODEL_PARAMS)

    def _split_arrays(self, indices):
        """Return ``(X, y)`` numpy arrays for the rows at positions ``indices``."""
        positions = np.asarray(indices)
        X = self.data[self.features].values[positions]
        y = self.data[self.target].values[positions]
        return X, y

    def Fit(self, FIT_PARAMS: dict = None, MAX_EPOCHS: int = 1000):
        """Train ``self.model`` on the train subset.

        The validation subset, when non-empty, is passed as ``eval_set`` unless
        ``FIT_PARAMS`` already provides one. ``MAX_EPOCHS`` always overrides a
        ``max_epochs`` entry in ``FIT_PARAMS``. The test subset is not touched.

        Args:
            FIT_PARAMS: extra keyword arguments forwarded to ``self.model.fit``;
                the dict itself is not modified.
            MAX_EPOCHS: value forwarded as ``max_epochs``.

        Raises:
            ValueError: if ``FIT_PARAMS`` is ``None``.
        """
        if FIT_PARAMS is None:
            raise ValueError("You should provide NN fit parameters via FIT_PARAMS")

        self.max_epochs = MAX_EPOCHS

        X_train, y_train = self._split_arrays(self.train_indices)
        X_valid, y_valid = self._split_arrays(self.valid_indices)

        # Work on a copy so the caller's dict is left untouched.
        fit_kwargs = dict(FIT_PARAMS)
        fit_kwargs["max_epochs"] = self.max_epochs
        if len(X_valid) > 0:
            fit_kwargs.setdefault("eval_set", [(X_valid, y_valid)])

        self.model.fit(X_train, y_train, **fit_kwargs)

In [14]:
# Load the dataset
data = pd.read_excel('Data/data_train.xls')

# Fix the outlier: every row holding the column maximum is overwritten with 70000.
# Series.max() skips NaN and is vectorized, unlike the builtin max().
loan_column = 'СУММА_ВЫДАННОГО_КРЕДИТА'
data.loc[data[loan_column] == data[loan_column].max(), loan_column] = 70000

# Drop some unnecessary columns
data = data.drop(columns=['ИД', 'ВОЗРАСТ.ЛЕТ', 'ЗАПРОШЕННАЯ_СУММА', 'ЗАГРАН'])

# Make the gender variable take values 0 and 1
# (presumably it is coded 1/2 in the raw file -- verify against the data)
data['ПОЛ'] -= 1

# One-hot encoding: each listed column is replaced by `<column>_<value>`
# indicator columns appended after the remaining columns.
ohe_columns = ['АВТО', 'БРАК', 'ГОРОД', 'ДОЛЖНОСТЬ', 'ИНОСТР.ЯЗЫК', 'ОБРАЗОВАНИЕ', 'ОТДЕЛЕНИЕ', 'ОТРАСЛЬ',
               'РАБОТА_В_ОРГ', 'РАБОТА_ПО_НАПР', 'СМЕНА_МЖ']
ohe_dims = {}  # not filled in this cell; kept because other cells may use it
data = pd.get_dummies(data, columns=ohe_columns)

# Split data into train (80%), test (10%) and val (10%), stratified on the
# target. A fixed seed keeps the split identical across kernel restarts.
RANDOM_STATE = 42
data_train, data_holdout = train_test_split(
    data, test_size=0.2, stratify=data['ДЕФОЛТ60'], random_state=RANDOM_STATE)
data_test, data_val = train_test_split(
    data_holdout, test_size=0.5, stratify=data_holdout['ДЕФОЛТ60'], random_state=RANDOM_STATE)

data_train.head()

Unnamed: 0,ВОЗРАСТ,ПОЛ,ДЕТИ,ИЖДИВЕНЦЫ,ДОХОД_СЕМЬИ_,ДОХОД_НА_ОДНОГО_ЧЛЕНА_СЕМЬИ,ПЕРСОНАЛЬНЫЙ_ДОХОД,СУММА_ВЫДАННОГО_КРЕДИТА,СОБСТВЕННИК_ФАКТ.,ДОХОД_СУПРУГИ(А),...,РАБОТА_В_ОРГ_2,РАБОТА_В_ОРГ_3,РАБОТА_ПО_НАПР_1,РАБОТА_ПО_НАПР_2,РАБОТА_ПО_НАПР_3,СМЕНА_МЖ_0,СМЕНА_МЖ_1,СМЕНА_МЖ_2,СМЕНА_МЖ_3,СМЕНА_МЖ_4
4319,32.416438,1,2,2,14000,4600.0,14000,35000.0,4,0,...,1,0,1,0,0,1,0,0,0,0
1832,24.252055,1,0,0,13000,13000.0,13000,30000.0,5,0,...,0,1,0,0,1,0,1,0,0,0
2511,26.013699,1,2,2,17500,4375.0,7500,10000.0,3,10000,...,0,0,1,0,0,1,0,0,0,0
3774,21.786301,0,0,0,13000,13000.0,13000,30000.0,4,0,...,1,0,1,0,0,1,0,0,0,0
620,26.120548,1,1,1,35000,11600.0,20000,15000.0,5,15000,...,0,0,1,0,0,0,1,0,0,0
