In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import log_loss
from sklearn.preprocessing import RobustScaler

from catboost import CatBoostClassifier

In [2]:
# Load the training data. Forward slashes are used because they resolve on
# Windows, Linux and macOS alike, whereas a backslash-separated path is only
# understood on Windows.
df = pd.read_csv('./data/train.csv', encoding='utf-8')
df.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [3]:
# Number of rows per value of the target column 'credit'.
credit_counts = df['credit'].value_counts()
credit_counts

2.0    16968
1.0     6267
0.0     3222
Name: credit, dtype: int64

In [4]:
# Replace every missing value in the frame with the literal string 'Unknown'.
df = df.fillna('Unknown')

# Target is 'credit'; 'index' and 'credit' are both removed from the features.
y = df['credit']
X = df.drop(columns=['index', 'credit'])

# First hold out 20% as the test split, then take 25% of the remainder as the
# validation split (0.25 * 0.8 = 20% of all rows), leaving 60% for training.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_rest, y_rest, test_size=0.25, random_state=42)

In [5]:
# One weight per training row, computed from y_train with the 'balanced' scheme.
sample_weight = class_weight.compute_sample_weight('balanced', y_train)

In [6]:
# Preview the first ten training sample weights.
sample_weight[:10]

array([1.4068067 , 2.74429461, 1.4068067 , 1.4068067 , 0.51954046,
       0.51954046, 1.4068067 , 2.74429461, 0.51954046, 0.51954046])

In [7]:
# Column positions (in the unscaled feature frames) that form the categorical block:
# 0-2, 5-8 and 11-15.
cat_features = [0, 1, 2, *range(5, 9), *range(11, 16)]

In [8]:
# (column position in the unscaled frame, column name) for each numeric feature
num_features = [(3, 'child_num'), (4, 'income_total'), (9, 'DAYS_BIRTH'), (10, 'DAYS_EMPLOYED'), (16, 'family_size'), (17, 'begin_month')]

# define scaler
scaler = RobustScaler()


def build_scaled_frame(frame, cat_idx, fit=False):
    """Return the categorical columns of `frame` followed by its scaled numeric columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Unscaled feature frame; columns are addressed by position.
    cat_idx : list of int
        Positions of the categorical columns, copied through unchanged.
    fit : bool, default False
        When True the module-level `scaler` is fitted on `frame` before
        transforming; otherwise the already-fitted scaler is applied.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 row index: categorical columns first, then
        the scaled numeric columns named as listed in `num_features`.

    Raises
    ------
    ValueError
        If the numeric positions do not hold the expected column names, or if
        `cat_idx` overlaps the numeric positions. The overlap check catches
        re-running this cell after `cat_features` has been rebound to the
        positions of the scaled frame.
    """
    num_idx = [pos for pos, _ in num_features]
    num_names = [name for _, name in num_features]

    # Positional selection silently picks the wrong columns if the layout
    # drifts, so verify it before scaling.
    found_names = list(frame.columns[num_idx])
    if found_names != num_names:
        raise ValueError(f'numeric columns at {num_idx} are {found_names}, expected {num_names}')
    overlap = sorted(set(cat_idx) & set(num_idx))
    if overlap:
        raise ValueError(f'categorical and numeric positions overlap: {overlap}')

    numeric_part = frame.iloc[:, num_idx]
    scaled_values = scaler.fit_transform(numeric_part) if fit else scaler.transform(numeric_part)

    # Both halves get a 0..n-1 index so that concat aligns them row by row.
    cat_part = frame.iloc[:, cat_idx].reset_index(drop=True)
    num_part = pd.DataFrame(scaled_values, columns=num_names)
    return pd.concat([cat_part, num_part], axis=1)


# fit scaler on train set and transform the rest of datasets
X_train_scaled = build_scaled_frame(X_train, cat_features, fit=True)
X_valid_scaled = build_scaled_frame(X_valid, cat_features)
X_test_scaled = build_scaled_frame(X_test, cat_features)

In [9]:
# In the *_scaled frames the twelve categorical columns come first, so their
# positions are simply 0..11.
cat_features = list(range(12))

In [10]:
# Show the first rows of the columns selected by cat_features in the scaled training frame.
X_train_scaled.iloc[:, cat_features].head()

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,FLAG_MOBIL,work_phone,phone,email,occyp_type
0,F,N,Y,State servant,Higher education,Married,House / apartment,1,1,1,0,Secretaries
1,M,N,N,Commercial associate,Lower secondary,Married,House / apartment,1,1,0,0,Laborers
2,M,N,N,Working,Higher education,Married,House / apartment,1,0,1,0,Unknown
3,M,Y,Y,Working,Higher education,Married,House / apartment,1,0,0,0,Laborers
4,F,N,Y,Pensioner,Higher education,Married,House / apartment,1,0,0,0,Unknown


In [11]:
# Show the first rows of the full scaled training frame (categorical + scaled numeric columns).
X_train_scaled.head()

Unnamed: 0,gender,car,reality,income_type,edu_type,family_type,house_type,FLAG_MOBIL,work_phone,phone,email,occyp_type,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,family_size,begin_month
0,F,N,Y,State servant,Higher education,Married,House / apartment,1,1,1,0,Secretaries,0.0,-0.217391,0.823462,0.328131,0.0,-0.703704
1,M,N,N,Commercial associate,Lower secondary,Married,House / apartment,1,1,0,0,Laborers,1.0,0.217391,0.653489,-0.34265,1.0,0.074074
2,M,N,N,Working,Higher education,Married,House / apartment,1,0,1,0,Unknown,0.0,1.086957,0.682889,-0.352813,0.0,0.814815
3,M,Y,Y,Working,Higher education,Married,House / apartment,1,0,0,0,Laborers,0.0,2.173913,0.72399,-0.319782,0.0,0.777778
4,F,N,Y,Pensioner,Higher education,Married,House / apartment,1,0,0,0,Unknown,0.0,-0.652174,-0.830741,133.133212,0.0,-1.0


In [12]:
# CatBoost classifier configured for the multi-class objective and metric,
# capped at 1000 iterations with a fixed seed.
catboost_params = {
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'iterations': 1000,
    'random_seed': 42,
}
clf = CatBoostClassifier(**catboost_params)

In [13]:
%%time

clf.fit(X=X_train_scaled, y=y_train,
        cat_features=cat_features,
        sample_weight=sample_weight,
        eval_set=(X_valid_scaled, y_valid),
        use_best_model=True,
        verbose=False,
        early_stopping_rounds=30)

Wall time: 1min 10s


<catboost.core.CatBoostClassifier at 0x1ebeab19b50>

In [14]:
# Predicted class probabilities for the held-out test split.
# NOTE(review): column order presumably follows clf.classes_ — confirm before relying on it.
y_pred = clf.predict_proba(X_test_scaled)

In [15]:
# Multi-class log loss on the held-out test split. The label set is taken from
# the fitted model instead of being inferred from y_test, so the columns of
# y_pred still line up if a class happens to be absent from the split.
log_loss(y_true=y_test, y_pred=y_pred, labels=clf.classes_)

0.8874576834106023

In [None]:
# Load the unlabeled data and apply the same preparation as the training
# frames: drop 'index', fill missing values with 'Unknown'. Forward slashes
# keep the path valid on Windows, Linux and macOS.
X_unk = pd.read_csv('./data/test.csv', encoding='utf-8')
X_unk = X_unk.drop(columns=['index'])
X_unk = X_unk.fillna('Unknown')

# Positions of the categorical columns in the unscaled layout. A local name is
# used so the global `cat_features` (positions in the scaled layout, consumed
# by the model) is never temporarily rebound by this cell.
raw_cat_idx = [0, 1, 2, 5, 6, 7, 8, 11, 12, 13, 14, 15]
num_idx = [t[0] for t in num_features]
num_names = [t[1] for t in num_features]

# Categorical columns first, then the numeric columns transformed with the
# scaler that was fitted on the training split.
X_unk_scaled = pd.concat(
    [
        X_unk.iloc[:, raw_cat_idx].reset_index(drop=True),
        pd.DataFrame(scaler.transform(X_unk.iloc[:, num_idx]), columns=num_names),
    ],
    axis=1,
)

In [None]:
# Timestamp in yymmddHHMMSS form, used to tag the submission file name.
curr_time = f'{datetime.now():%y%m%d%H%M%S}'

In [None]:
# Re-read the unlabeled file to recover the 'index' identifiers. Forward
# slashes keep the path valid on Windows, Linux and macOS.
X_unk_raw = pd.read_csv('./data/test.csv', encoding='utf-8')
l_idx = X_unk_raw['index']

# One probability column per class, labelled 0, 1, 2.
unk_proba = clf.predict_proba(X_unk_scaled)
if len(l_idx) != len(unk_proba):
    raise ValueError(f'{len(l_idx)} identifiers but {len(unk_proba)} prediction rows')

# Build the frame directly rather than inserting the ids into the float
# probability array, which would push the integer ids through float64.
submission = pd.DataFrame(unk_proba, columns=[0, 1, 2])
submission.insert(0, 'index', l_idx.to_numpy())
submission['index'] = submission['index'].astype('int')

In [None]:
# Write the predictions to a timestamped CSV, omitting the DataFrame row index.
submission_path = 'submission_' + curr_time + '.csv'
submission.to_csv(submission_path, index=False)