In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing dataset
* separating out x and y
* adding monotonic constraints

In [2]:
# Load the prepared dataset, then split it into the feature matrix and the target.
df = pd.read_csv('01 data prep.csv')
# Features: every column except the target and the row identifier.
x = df.drop(columns=['IsBadBuy', 'RefId'])
# Target: the IsBadBuy column.
y = df.loc[:, 'IsBadBuy']
df.shape

(195, 20)

In [3]:
# One constraint value of 1 per feature column, as a tuple
# (passed to the classifier as `monotonic_cst` further down).
c = (1,) * len(x.columns)
c

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

### 2. RF
* identifying optimal hyper-parameters
* feature importance

In [4]:
# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate combinations.
parameters = {
    "max_depth": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10],
    "n_estimators": [200, 500, 1000],
}

# Base estimator; the grid above overrides depth, leaf size and tree count.
model = RandomForestClassifier(
    max_features='log2',
    monotonic_cst=c,
    random_state=0,
)

# 3-fold cross-validated search scored by ROC AUC, run on 6 parallel workers.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=6,
).fit(x, y)

print(grid_search.best_score_)
print(grid_search.best_params_)

0.7453679653679653
{'max_depth': 2, 'min_samples_leaf': 2, 'n_estimators': 500}


In [5]:
# Fit a forest with fixed hyper-parameters on the full data set.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=2,
    min_samples_leaf=2,
    max_features='log2',
    monotonic_cst=c,
    random_state=0,
).fit(x, y)

# Rank the features by the fitted model's importances, largest first.
importances = model.feature_importances_
gb = (
    pd.DataFrame({'Feature': x.columns, 'Gini': importances})
    .sort_values(by='Gini', ascending=False)
)
print(gb)

                              Feature      Gini
2                           WheelType  0.198869
7    MMRAcquisitionRetailAveragePrice  0.110675
16                 VehicleAge_decades  0.102326
11       MMRCurrentRetailAveragePrice  0.087252
15                       WarrantyCost  0.075165
8       MMRAcquisitonRetailCleanPrice  0.069567
0                                Make  0.069541
14                               VNST  0.055519
12         MMRCurrentRetailCleanPrice  0.052389
17                       VehOdo_lakhs  0.043205
5   MMRAcquisitionAuctionAveragePrice  0.025137
9       MMRCurrentAuctionAveragePrice  0.022870
1                        Transmission  0.020297
10        MMRCurrentAuctionCleanPrice  0.019053
6     MMRAcquisitionAuctionCleanPrice  0.018307
4                                Size  0.015585
13                          PRIMEUNIT  0.011424
3                         Nationality  0.002820


### 3. optimal cut-off for f1-score
* cut-offs from 0.05 to 0.65 (in steps of 0.05) are tested
* the cut-off where the f1-score is highest is selected

In [6]:
# Scan probability cut-offs 0.05, 0.10, ..., 0.65 and print the F1-score at each one.
# NOTE(review): the scores are computed on the same rows the model was fitted on,
# so they are likely optimistic — confirm this is intended.

# Predicted probability of the second class; it does not depend on the cut-off,
# so it is computed once instead of once per iteration.
pred1 = model.predict_proba(x)[:,1]

for i in range(5,70,5):
    # 1 where the probability exceeds the cut-off, 0 otherwise.
    pred2 = (pred1 > i/100).astype(int)
    # labels=[0,1] pins the matrix to 2x2 even if one class is absent.
    c1 = confusion_matrix(y,pred2,labels=[0,1])
    tp, fp, fn = c1[1][1], c1[0][1], c1[1][0]
    # Guard each ratio: with no predicted (or no actual) positives the
    # denominator is zero and the unguarded division would yield NaN.
    p = tp / (tp+fp) if (tp+fp) else 0.0
    r = tp / (tp+fn) if (tp+fn) else 0.0
    f1 = (2*p*r) / (p+r) if (p+r) else 0.0
    print(i,'\t',np.round(f1,3))

5 	 0.477
10 	 0.477
15 	 0.477
20 	 0.494
25 	 0.58
30 	 0.696
35 	 0.615
40 	 0.64
45 	 0.444
50 	 0.347
55 	 0.229
60 	 0.152
65 	 0.094


### 4. data splits
* top 5 variables (by feature importance) are used to sort the data
* 0/1 are separated out and 3 splits are created

In [7]:
# Label every row with a quartile bin (Q1..Q4, or Q0 when the value is missing)
# for each sorting feature, sort the frame by those bins, then separate the
# two target classes.
for i in ['WheelType','MMRAcquisitionRetailAveragePrice','VehicleAge_decades',
          'MMRCurrentRetailAveragePrice','WarrantyCost']:
    # nanpercentile skips missing values. np.percentile returns NaN as soon as
    # one value is missing, which would make every comparison below False and
    # push all non-missing rows into Q4. The three cut points are computed once.
    q25, q50, q75 = np.nanpercentile(df[i], [25,50,75])
    df[i+'bin'] = np.where(df[i].isnull(), 'Q0',
                           np.where(df[i]<=q25, 'Q1',
                                    np.where(df[i]<=q50, 'Q2',
                                             np.where(df[i]<=q75, 'Q3', 'Q4'))))
df = df.sort_values(['WheelTypebin','MMRAcquisitionRetailAveragePricebin','VehicleAge_decadesbin',
                     'MMRCurrentRetailAveragePricebin','WarrantyCostbin']).reset_index(drop=True)
# The second reset_index() adds an 'index' column holding each row's position
# within its own class.
df0 = df[df['IsBadBuy']==0].reset_index(drop=True).reset_index()
df1 = df[df['IsBadBuy']==1].reset_index(drop=True).reset_index()
print('non event :',df0.shape)
print('event :',df1.shape)

non event : (134, 26)
event : (61, 26)


In [8]:
# Columns removed from every split: the positional 'index' helper, the five
# '...bin' sorting helpers, and eight feature columns.
split_drop_cols = ['index','WheelTypebin','MMRAcquisitionRetailAveragePricebin','VehicleAge_decadesbin',
                   'MMRCurrentRetailAveragePricebin','WarrantyCostbin','MMRAcquisitionAuctionAveragePrice',
                   'MMRCurrentAuctionAveragePrice','Transmission','MMRCurrentAuctionCleanPrice',
                   'MMRAcquisitionAuctionCleanPrice','Size','PRIMEUNIT','Nationality']


def make_split(non_events, events, k, drop_cols, n_splits=3):
    """Build split number `k` from the two class-specific frames.

    Takes every row whose 'index' value leaves remainder `k` when divided by
    `n_splits`, from `non_events` first and then from `events`, concatenates
    them and drops `drop_cols`.

    Parameters
    ----------
    non_events, events : pd.DataFrame
        Frames that each carry an 'index' column.
    k : int
        Remainder that selects the rows of this split.
    drop_cols : list of str
        Columns to remove from the result.
    n_splits : int, default 3
        Modulus used to assign rows to splits.

    Returns
    -------
    pd.DataFrame
        The selected rows without `drop_cols`.
    """
    part = pd.concat([non_events[non_events['index']%n_splits==k],
                      events[events['index']%n_splits==k]])
    return part.drop(drop_cols, axis=1)


split0 = make_split(df0, df1, 0, split_drop_cols)
print('1st split :',split0.shape)

split1 = make_split(df0, df1, 1, split_drop_cols)
print('2nd split :',split1.shape)

split2 = make_split(df0, df1, 2, split_drop_cols)
print('3rd split :',split2.shape)

1st split : (66, 12)
2nd split : (65, 12)
3rd split : (64, 12)


### 5. exporting dataset
* exporting all the three datasets
* training on s0+s1 and testing on s2
* training on s1+s2 and testing on s0
* training on s2+s0 and testing on s1

In [9]:
# Write each split to its own CSV file, without the row index.
exports = {
    '02 data split 0.csv': split0,
    '02 data split 1.csv': split1,
    '02 data split 2.csv': split2,
}
for path, frame in exports.items():
    frame.to_csv(path, index=False)

# Stack the three splits back together and export the combined frame.
# Note: this rebinds `df` to the concatenation of the splits.
df = pd.concat([split0, split1, split2])
df.to_csv('04 model test data.csv', index=False)
df.shape

(195, 12)