In [36]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing dataset
* separating out x and y
* adding monotonic constraints

In [37]:
s0 = pd.read_csv('02 data split 0.csv')
s1 = pd.read_csv('02 data split 1.csv')
s2 = pd.read_csv('02 data split 2.csv')

train = pd.concat([s1,s2])
print(train.shape)
test = s0
print(test.shape)

(129, 12)
(66, 12)


In [38]:
y = train['IsBadBuy']
x = train.drop(['IsBadBuy','RefId'], axis=1)
c = []
for i in x.columns:
    c.append(1)
c = tuple(c)
c

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

### 2. RF
* identifying optimal hyper-parameters
* performance

In [39]:
parameters = {"max_depth": [2,5,10],
              "min_samples_leaf": [2,5,10],
              "n_estimators": [50,100,200,500,1000]}
model = RandomForestClassifier(max_features='log2', monotonic_cst=c, random_state=0)
grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='roc_auc', cv=3, n_jobs=6)
grid_search = grid_search.fit(x,y)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.7421582249168456
{'max_depth': 2, 'min_samples_leaf': 2, 'n_estimators': 200}


In [40]:
model = RandomForestClassifier(max_depth=2, min_samples_leaf=2, n_estimators=200, 
                               max_features='log2', monotonic_cst=c, random_state=0)

model.fit(x,y)
print(x.shape)
pred2 = []
pred1 = model.predict_proba(x)[:,1]
a1 = roc_auc_score(y,pred1)
print('auc roc  :',np.round(a1,3))

for j in pred1:
    if j > 0.30: pred2.append(1)
    else: pred2.append(0)
c1 = confusion_matrix(y,pred2)
p = c1[1][1] / (c1[0][1]+c1[1][1])
r = c1[1][1] / (c1[1][0]+c1[1][1])
f1 = (2*p*r) / (p+r)
print('f1 score :',np.round(f1,3))

(129, 10)
auc roc  : 0.825
f1 score : 0.705


### 3. test
* adding monotonic constraints
* performance

In [41]:
y = test['IsBadBuy']
x = test.drop(['IsBadBuy','RefId'], axis=1)
c = []
for i in x.columns:
    c.append(1)
c = tuple(c)
c

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

In [42]:
model.fit(x,y)
print(x.shape)
pred2 = []
pred1 = model.predict_proba(x)[:,1]
a1 = roc_auc_score(y,pred1)
print('auc roc  :',np.round(a1,3))

for j in pred1:
    if j > 0.30: pred2.append(1)
    else: pred2.append(0)
c1 = confusion_matrix(y,pred2)
p = c1[1][1] / (c1[0][1]+c1[1][1])
r = c1[1][1] / (c1[1][0]+c1[1][1])
f1 = (2*p*r) / (p+r)
print('f1 score :',np.round(f1,3))

(66, 10)
auc roc  : 0.871
f1 score : 0.735
