In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing dataset
* removed observations where IsBadBuy is missing
* keeping relevant columns
* each MMR price column is replaced by the ratio of the acquisition cost paid at purchase (VehBCost) to that price; WarrantyCost is expressed as a fraction of VehBCost
* converting categorical to numerical

In [2]:
# Columns carried forward from the raw holdout file: identifier, target,
# and the raw inputs used to build the model features.
keep_cols = [
    'RefId', 'IsBadBuy', 'VehBCost', 'Make', 'WheelType',
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
    'VNST', 'WarrantyCost', 'VehicleAge', 'VehOdo',
]

df = pd.read_csv('07 model holdout.csv')
# Keep only rows whose target is present, and only the columns listed above.
df = df.loc[df['IsBadBuy'].notnull(), keep_cols]
df.shape

(98, 13)

In [3]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with non-finite results mean-imputed.

    Division by zero produces +inf or -inf (or NaN for 0/0). Both infinities
    are turned into NaN first, so that the column mean used for imputation is
    computed from finite ratios only; every NaN is then filled with that mean.

    Parameters
    ----------
    numerator, denominator : pd.Series
        Index-aligned numeric columns.

    Returns
    -------
    pd.Series
        The ratio, with infinities and missing values replaced by the mean of
        the remaining values.
    """
    ratio = (numerator / denominator).replace([np.inf, -np.inf], np.nan)
    return ratio.fillna(ratio.mean())


# Each MMR price column is overwritten with VehBCost divided by that price.
mmr_price_cols = [
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
]
for price_col in mmr_price_cols:
    df[price_col] = _safe_ratio(df['VehBCost'], df[price_col])

# Warranty cost is expressed as a fraction of VehBCost.
df['WarrantyCost'] = _safe_ratio(df['WarrantyCost'], df['VehBCost'])

# Rescaled copies of age and odometer (divided by 10 and by 100000).
df['VehicleAge_decades'] = df['VehicleAge'] / 10
df['VehOdo_lakhs'] = df['VehOdo'] / 100000
df.shape

(98, 15)

In [4]:
# Make -> +1 for FORD / PONTIAC / SATURN, -1 for DODGE, 0 for everything else.
make_is_plus = df['Make'].isin(['FORD','PONTIAC','SATURN'])
make_is_minus = df['Make'].isin(['DODGE'])
df['Make'] = np.select([make_is_plus, make_is_minus], [1, -1], default=0)

# WheelType -> +1 when the value is missing, -1 when it is present.
df['WheelType'] = np.where(df['WheelType'].notnull(), -1, 1)

# VNST -> +1 for 'VA', -1 for any other value.
df['VNST'] = np.where(df['VNST'] != 'VA', -1, 1)
df.shape

(98, 15)

In [5]:
# The raw age, odometer and cost columns are no longer needed once the
# derived columns exist, so remove them.
raw_cols = ['VehicleAge','VehOdo','VehBCost']
df = df.drop(columns=raw_cols)
df.shape

(98, 12)

### 2. RF
* separating out x and y
* adding monotonic constraints
* identifying optimal hyper-parameters
* performance

In [6]:
# Target vector, and the feature matrix (everything except target and id).
y = df['IsBadBuy']
x = df.drop(columns=['IsBadBuy','RefId'])

# One monotonic-constraint entry per feature column, every entry set to 1.
c = (1,) * len(x.columns)
c

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

In [7]:
# Hyper-parameter grid explored exhaustively by the search below.
parameters = dict(
    max_depth=[2, 5, 10, 20, 50],
    min_samples_leaf=[2, 5, 10, 20, 50],
    n_estimators=[5, 10, 20, 50, 100, 200, 500, 1000],
)

# Base estimator: fixed seed, log2 feature sampling, monotonic constraints c.
model = RandomForestClassifier(max_features='log2', monotonic_cst=c, random_state=0)

# 3-fold cross-validated search scored by ROC AUC, run on 6 workers.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=6,
).fit(x, y)

for search_result in (grid_search.best_score_, grid_search.best_params_):
    print(search_result)

1.0
{'max_depth': 2, 'min_samples_leaf': 2, 'n_estimators': 5}


In [8]:
# Pull the winning hyper-parameters out of the grid search.
best_params = grid_search.best_params_
max_depth1 = best_params['max_depth']
min_samples_leaf1 = best_params['min_samples_leaf']
n_estimators1 = best_params['n_estimators']

# Refit a forest with those hyper-parameters on all of x / y.
model = RandomForestClassifier(max_depth=max_depth1, min_samples_leaf=min_samples_leaf1, n_estimators=n_estimators1,
                               max_features='log2', monotonic_cst=c, random_state=0)
model.fit(x, y)
print(x.shape)

# NOTE: the metrics below are computed on the same rows the model was just
# fitted on, so they are in-sample figures.
pred1 = model.predict_proba(x)[:, 1]
a1 = roc_auc_score(y, pred1)
print('auc roc  :', np.round(a1, 3))

# Probability above which a row is labelled as class 1.
threshold = 0.30
pred2 = (pred1 > threshold).astype(int)

c1 = confusion_matrix(y, pred2)
true_pos, false_pos, false_neg = c1[1][1], c1[0][1], c1[1][0]

# Guard every denominator: with no predicted (or no actual) positives the
# plain division gives nan, and warnings are suppressed in this notebook, so
# the nan would go unnoticed. Report 0.0 in those cases instead.
p = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0.0
r = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0.0
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('f1 score :', np.round(f1, 3))

(98, 10)
auc roc  : 1.0
f1 score : 0.854


### 3. test
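* appending the processed holdout observations to the test data file
* finalizing the hyper-parameters as an observation-weighted average of the previously saved values and this run's best values
* saving the final hyper-parameters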

In [9]:
# Append this notebook's processed rows to the accumulated test data and
# write the combined frame back over the same CSV file.
# NOTE(review): running this cell more than once appends the same rows again,
# because the file it reads is the file it overwrites.
test_data_path = '04 model test data.csv'
df0 = pd.concat([pd.read_csv(test_data_path), df])
df0.to_csv(test_data_path, index=False)
df0.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RefId,293.0,484.662116,350.670217,16.0,185.0,376.0,926.0,999.0
IsBadBuy,293.0,0.457338,0.499029,0.0,0.0,0.0,1.0,1.0
Make,293.0,0.098976,0.776649,-1.0,-1.0,0.0,1.0,1.0
WheelType,293.0,-0.535836,0.845766,-1.0,-1.0,-1.0,-1.0,1.0
MMRAcquisitionRetailAveragePrice,293.0,0.771152,0.143289,0.516369,0.664011,0.756496,0.856052,1.40427
MMRAcquisitonRetailCleanPrice,293.0,0.658503,0.107991,0.424541,0.58098,0.643627,0.726014,1.112867
MMRCurrentRetailAveragePrice,293.0,0.770498,0.127744,0.488752,0.680985,0.761968,0.842952,1.331971
MMRCurrentRetailCleanPrice,293.0,0.657406,0.095518,0.414563,0.593987,0.645927,0.712017,1.085917
VNST,293.0,-0.460751,0.889048,-1.0,-1.0,-1.0,1.0,1.0
WarrantyCost,293.0,0.207279,0.126297,0.063317,0.126,0.171963,0.263077,1.242898


In [10]:
# Previously saved hyper-parameters: read with no header and the first field
# as the row index, so the stored values land in column 1.
df0 = pd.read_csv('04 model test hyper param.csv', header=None, index_col=0)

# This run's row count and best hyper-parameters; after the transpose they
# form a single column labelled 0.
df1 = pd.DataFrame({'obs':[x.shape[0]], 'max_depth':[max_depth1], 'min_samples_leaf':[min_samples_leaf1],
                    'n_estimators':[n_estimators1]}).T
df0 = pd.concat([df0,df1], axis=1)

# Column 2 holds the pooled result. It starts as the rounded row mean and the
# four known rows are then overwritten below.
df0[2] = np.round(df0.mean(axis=1),0)

# Write with .loc rather than chained indexing (df0[2]['obs'] = ...): a
# chained assignment may act on a temporary copy and leave df0 unchanged,
# and the warning that would flag it is suppressed in this notebook.
df0.loc['obs', 2] = df0.loc['obs', 0] + df0.loc['obs', 1]

# Each hyper-parameter becomes the observation-weighted average of the saved
# value and this run's value, rounded to a whole number.
for param in ['max_depth', 'min_samples_leaf', 'n_estimators']:
    weighted_sum = df0.loc[param, 0]*df0.loc['obs', 0] + df0.loc[param, 1]*df0.loc['obs', 1]
    df0.loc[param, 2] = np.round(weighted_sum / df0.loc['obs', 2])
print(df0)

                    1   0      2
obs               195  98  293.0
max_depth           2   2    2.0
min_samples_leaf    2   2    2.0
n_estimators      150   5  102.0


In [11]:
# Keep only the pooled column (label 2) and write it, without a header row,
# over the hyper-parameter file.
hyper_param_path = '04 model test hyper param.csv'
df0 = df0.loc[:, [2]]
df0.to_csv(hyper_param_path, header=None)
df0.shape

(4, 1)