In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import time

import warnings
warnings.filterwarnings(action='ignore')


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
print(train.shape)
train.head()

(26457, 20)


Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [4]:
print(test.shape)
test.head()

(10000, 19)


Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,26457,M,Y,N,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,-21990,365243,1,0,1,0,,2.0,-60.0
1,26458,F,N,Y,0,135000.0,State servant,Higher education,Married,House / apartment,-18964,-8671,1,0,1,0,Core staff,2.0,-36.0
2,26459,F,N,Y,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,-15887,-217,1,1,1,0,Laborers,2.0,-40.0
3,26460,M,Y,N,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-19270,-2531,1,1,0,0,Drivers,2.0,-41.0
4,26461,F,Y,Y,0,225000.0,State servant,Higher education,Married,House / apartment,-17822,-9385,1,1,0,0,Managers,2.0,-8.0


# Preprocess

In [5]:
train['gender'] = train['gender'].replace(['F','M'], [0,  1])
test['gender'] = test['gender'].replace(['F','M'], [0,  1])

train['car'] = train['car'].replace(['N','Y'], [0,  1])
test['car'] = test['car'].replace(['N','Y'], [0,  1])

train['reality'] = train['reality'].replace(['N','Y'], [0,  1])
test['reality'] = test['reality'].replace(['N','Y'], [0,  1])

In [6]:
train[['gender', 'car','reality']].head()

Unnamed: 0,gender,car,reality
0,0,0,0
1,0,0,1
2,1,1,1
3,0,0,1
4,0,1,1


In [7]:
test[['gender', 'car','reality']].head()

Unnamed: 0,gender,car,reality
0,1,1,0
1,0,0,1
2,0,0,1
3,1,1,0
4,0,1,1


In [8]:
train.loc[train['child_num'] > 2, 'child_num'] = 2
test.loc[test['child_num'] > 2, 'child_num'] = 2

train.loc[train['family_size'] > 4, 'child_num'] = 4
test.loc[train['family_size'] > 4, 'child_num'] = 4

train['occyp_type'].fillna('Unemployed', inplace = True)
test['occyp_type'].fillna('Unemployed', inplace = True)

In [9]:
train[['child_num','family_size','occyp_type']].head()

Unnamed: 0,child_num,family_size,occyp_type
0,0,2.0,Unemployed
1,1,3.0,Laborers
2,0,2.0,Managers
3,0,2.0,Sales staff
4,0,2.0,Managers


In [10]:
train.isna().sum()

index            0
gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
DAYS_BIRTH       0
DAYS_EMPLOYED    0
FLAG_MOBIL       0
work_phone       0
phone            0
email            0
occyp_type       0
family_size      0
begin_month      0
credit           0
dtype: int64

In [11]:
test.isna().sum()

index            0
gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
DAYS_BIRTH       0
DAYS_EMPLOYED    0
FLAG_MOBIL       0
work_phone       0
phone            0
email            0
occyp_type       0
family_size      0
begin_month      0
dtype: int64

In [12]:
cate = ['income_type', 'edu_type', 'family_type', 'house_type','occyp_type']


In [13]:
# Categorical Variable Labeling

from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()

train['income_type'] = label_encoder.fit_transform(train['income_type'])
test['income_type'] = label_encoder.transform(test['income_type'])

train['edu_type'] = label_encoder.fit_transform(train['edu_type'])
test['edu_type'] = label_encoder.transform(test['edu_type'])

train['family_type'] = label_encoder.fit_transform(train['family_type'])
test['family_type'] = label_encoder.transform(test['family_type'])

train['house_type'] = label_encoder.fit_transform(train['house_type'])
test['house_type'] = label_encoder.transform(test['house_type'])

train['occyp_type'] = label_encoder.fit_transform(train['occyp_type'])
test['occyp_type'] = label_encoder.transform(test['occyp_type'])

In [14]:
test[['income_type', 'edu_type', 'family_type', 'house_type','occyp_type']].head()

Unnamed: 0,income_type,edu_type,family_type,house_type,occyp_type
0,1,4,0,1,17
1,2,1,1,1,3
2,4,4,1,1,8
3,0,4,1,1,4
4,2,1,1,1,10


In [15]:
train[train.DAYS_EMPLOYED > 0].shape

(4438, 20)

In [16]:
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].apply(lambda x: 0 if x > 0 else x)

In [17]:
train[train.DAYS_EMPLOYED > 0].shape

(0, 20)

In [18]:
# Numerical Variable Scaling
num = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'begin_month', 'income_total', 'income_type', 'edu_type', 'family_type', 'house_type','occyp_type']

scaler = preprocessing.StandardScaler()
train[num] = scaler.fit_transform(train[num])
test[num] = scaler.transform(test[num])

In [19]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,0,0.149136,-1.379572,-1.563528,-0.385583,0.762704,0.490075,-1.059227,1,0,0,0,1.145247,2.0,1.215231,1.0
1,1,0,0,1,1,0.590848,-1.379572,0.672647,-1.435432,-0.296331,1.089621,0.277849,1,0,0,1,-0.455987,3.0,1.27562,1.0
2,2,1,1,1,0,2.57855,0.924752,-1.563528,-0.385583,-0.296331,-0.744719,-0.943198,1,0,1,0,-0.100157,2.0,0.249003,2.0
3,3,0,0,1,0,0.149136,-1.379572,0.672647,-0.385583,-0.296331,0.207081,0.044947,1,0,1,0,0.611502,2.0,-0.656836,0.0
4,4,0,1,1,0,-0.292575,-0.22741,-1.563528,-0.385583,-0.296331,0.21922,0.039462,1,0,0,0,-0.100157,2.0,0.007446,2.0


In [20]:
test.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,26457,1,1,0,0,-0.734287,-0.803491,0.672647,-1.435432,-0.296331,-1.435662,155.032357,1,0,1,0,1.145247,2.0,-2.045789
1,26458,0,0,1,0,-0.513431,-0.22741,-1.563528,-0.385583,-0.296331,-0.715444,-2.73089,1,0,1,0,-1.345562,2.0,-0.596447
2,26459,0,0,1,0,-1.157623,0.924752,0.672647,-0.385583,-0.296331,0.016912,0.836055,1,1,1,0,-0.455987,2.0,-0.838004
3,26460,1,1,0,0,-0.734287,-1.379572,0.672647,-0.385583,-0.296331,-0.788275,-0.140277,1,1,0,0,-1.167647,2.0,-0.898393
4,26461,0,1,1,0,0.369992,-0.22741,-1.563528,-0.385583,-0.296331,-0.443637,-3.032144,1,1,0,0,-0.100157,2.0,1.094452


# Modeling

### Train_Test_Split

In [21]:
from sklearn.model_selection import train_test_split

train_x=train.drop('credit', axis=1)
train_y=train[['credit']]
test_x=test

In [22]:
print(train_x.shape, train_y.shape, test_x.shape)

(26457, 19) (26457, 1) (10000, 19)


In [23]:
train_x.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,0,0,0,0,0,0.149136,-1.379572,-1.563528,-0.385583,0.762704,0.490075,-1.059227,1,0,0,0,1.145247,2.0,1.215231
1,1,0,0,1,1,0.590848,-1.379572,0.672647,-1.435432,-0.296331,1.089621,0.277849,1,0,0,1,-0.455987,3.0,1.27562
2,2,1,1,1,0,2.57855,0.924752,-1.563528,-0.385583,-0.296331,-0.744719,-0.943198,1,0,1,0,-0.100157,2.0,0.249003
3,3,0,0,1,0,0.149136,-1.379572,0.672647,-0.385583,-0.296331,0.207081,0.044947,1,0,1,0,0.611502,2.0,-0.656836
4,4,0,1,1,0,-0.292575,-0.22741,-1.563528,-0.385583,-0.296331,0.21922,0.039462,1,0,0,0,-0.100157,2.0,0.007446


In [24]:
train_y.head()

Unnamed: 0,credit
0,1.0
1,1.0
2,2.0
3,0.0
4,2.0


In [25]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, 
                                                    stratify=train_y, test_size=0.25,
                                                    random_state = 10086)

print("Train set: ")
print(X_train.shape)
print(y_train.shape)
print("===========")
print("Validation set: ")
print(X_val.shape)
print(y_val.shape)




Train set: 
(19842, 19)
(19842, 1)
Validation set: 
(6615, 19)
(6615, 1)


### RF Model

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical

In [27]:
start_time = time.time()

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")
 
print("it takes ", (time.time()-start_time)/60, " mins")

log_loss: 2.4133065426869273
it takes  0.006389641761779785  mins
