In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing train dataset

In [2]:
# Hyper-parameter file: no header row; its first column (parameter names)
# becomes the index, so the values live in the column labelled 1.
df0 = pd.read_csv(
    '04 model test hyper param.csv',
    index_col=0,
    header=None,
)
# Observations used to fit the model further down.
df = pd.read_csv('04 model test data.csv')
df0

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
obs,195
max_depth,2
min_samples_leaf,2
n_estimators,150


In [3]:
# Target column and feature matrix (the identifier and the target are excluded).
y = df['IsBadBuy']
x = df.drop(columns=['IsBadBuy', 'RefId'])
# One constraint value of 1 per feature column; later passed to the forest
# as `monotonic_cst`.
c = (1,) * len(x.columns)
c

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

### 2. training the model

In [4]:
# Pull the three hyper-parameters out of df0 (column 1), rounding and casting
# each to an integer before handing it to the estimator.
hyper_params = df0[1]
max_depth1, min_samples_leaf1, n_estimators1 = (
    np.round(hyper_params[key], 0).astype(int)
    for key in ('max_depth', 'min_samples_leaf', 'n_estimators')
)

model = RandomForestClassifier(
    n_estimators=n_estimators1,
    max_depth=max_depth1,
    min_samples_leaf=min_samples_leaf1,
    max_features='log2',
    monotonic_cst=c,
    random_state=0,
)
model.fit(x, y)
print(x.shape)

# Positive-class probabilities on the same rows the model was fitted on,
# so the scores below are in-sample figures.
pred1 = model.predict_proba(x)[:, 1]
a1 = roc_auc_score(y, pred1)
print('auc roc  :', np.round(a1, 3))

# Hard labels: anything scoring above 0.30 is flagged as the positive class.
pred2 = [1 if score > 0.30 else 0 for score in pred1]

# Confusion matrix layout: rows are actual classes, columns are predicted.
c1 = confusion_matrix(y, pred2)
true_pos = c1[1][1]
false_pos = c1[0][1]
false_neg = c1[1][0]
p = true_pos / (false_pos + true_pos)   # precision
r = true_pos / (false_neg + true_pos)   # recall
f1 = (2 * p * r) / (p + r)
print('f1 score :', np.round(f1, 3))

(195, 10)
auc roc  : 0.835
f1 score : 0.681


### 3. test dataset
* removed observations where IsBadBuy is missing
* keeping relevant columns
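* expressing each MMR price as a ratio of the acquisition cost paid for the vehicle at time of purchase (`VehBCost`), and the warranty cost as a ratio of that same acquisition cost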
* converting categorical to numerical

In [5]:
# Columns kept from the holdout file: the identifier plus the raw inputs
# that the feature-engineering cells below transform.
holdout_columns = [
    'RefId', 'VehBCost', 'Make', 'WheelType', 'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
    'VNST', 'WarrantyCost', 'VehicleAge', 'VehOdo',
]
# .copy() makes `df` an independent frame. Later cells assign new columns
# into it; without the copy pandas treats it as a slice of the loaded frame
# and raises SettingWithCopyWarning (silenced here by the global filter).
df = pd.read_csv('07 model holdout.csv')[holdout_columns].copy()
df.shape

(98, 12)

In [6]:
def _ratio_with_mean_fill(numerator, denominator):
    """Return ``numerator / denominator`` with undefined results mean-imputed.

    A zero denominator produces +inf, -inf or NaN. All of these are treated
    as missing and replaced by the mean of the remaining ratios.

    Parameters
    ----------
    numerator, denominator : pandas.Series
        Columns of the same frame (aligned on its index).

    Returns
    -------
    pandas.Series
        The imputed ratio, indexed like the inputs.
    """
    ratio = (numerator / denominator).replace([np.inf, -np.inf], np.nan)
    return ratio.fillna(ratio.mean())


# Each MMR price column is overwritten by acquisition cost / that price.
# NOTE(review): the fill value is the mean of the holdout file itself --
# confirm the training data was imputed the same way.
mmr_price_columns = [
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
]
for price_column in mmr_price_columns:
    df[price_column] = _ratio_with_mean_fill(df['VehBCost'], df[price_column])

# Warranty cost is expressed relative to the acquisition cost.
df['WarrantyCost'] = _ratio_with_mean_fill(df['WarrantyCost'], df['VehBCost'])

# Rescale age and odometer reading (age / 10, odometer / 100000).
df['VehicleAge_decades'] = df['VehicleAge'] / 10
df['VehOdo_lakhs'] = df['VehOdo'] / 100000
df.shape

(98, 14)

In [7]:
# Make -> 1 for FORD / PONTIAC / SATURN, -1 for DODGE, 0 for everything else.
make_values = df['Make']
df['Make'] = np.select(
    [
        make_values.isin(['FORD', 'PONTIAC', 'SATURN']).to_numpy(),
        make_values.isin(['DODGE']).to_numpy(),
    ],
    [1, -1],
    default=0,
)
# WheelType -> 1 when the value is missing, -1 when it is present.
df['WheelType'] = np.where(df['WheelType'].notna(), -1, 1)
# VNST -> 1 for 'VA', -1 for any other value.
df['VNST'] = np.where(df['VNST'].eq('VA'), 1, -1)
df.shape

(98, 14)

In [8]:
# The raw age, odometer and acquisition-cost columns are removed here; the
# rescaled and ratio columns created earlier stay in the frame.
raw_columns = ['VehicleAge', 'VehOdo', 'VehBCost']
df = df.drop(columns=raw_columns)
df.shape

(98, 11)

### 4. prediction
* separating out x and y
* adding monotonic constraints
* identifying optimal hyper-parameters
* performance

In [9]:
# Build the feature matrix from the columns the model was fitted on, in the
# fitted order. Dropping only 'RefId' would also pass along any column added
# to `df` afterwards (such as the exported 'pred' column), which breaks this
# cell when it is re-run after the export cell.
fitted_columns = getattr(model, 'feature_names_in_', None)
if fitted_columns is not None:
    x = df[list(fitted_columns)]
else:
    # Model was fitted without column names: fall back to everything but the id.
    x = df.drop(['RefId'], axis=1)
print(x.shape)

# Positive-class probability for every holdout row.
pred1 = model.predict_proba(x)[:, 1]

# Hard labels: 1 when the probability exceeds 0.30, otherwise 0.
pred2 = (pred1 > 0.30).astype(int).tolist()

(98, 10)


### 5. exporting dataset
* exporting the predicted values

In [10]:
# Attach the thresholded labels as a 'pred' column and write the scored
# frame to disk without the index.
df = df.assign(pred=pred2)
df.to_csv('04 prod pred.csv', index=False)
df.shape

(98, 12)