In [7]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


In [8]:
dataset = pd.read_csv("data/complete.csv")

In [9]:
# Excluding customer ID and index because they're not relevant features
new_dataset = dataset.iloc[:,2:18]
features = new_dataset.loc[:, new_dataset.columns != 'card_offer'].values
labels = new_dataset['card_offer'].values

In [10]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                  test_size=0.2,
                                                  random_state=23,
                                                   stratify=labels)

In [11]:
# Scale both the training features and the test features

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Fit a random forest classifier using grid search onto the training data

rf = RandomForestClassifier()
params = { 'n_estimators': [100,200,400,600,800,1000] ,  'max_depth': [10,20,30]} 

skf = StratifiedKFold(n_splits=10)
rf_classifier = GridSearchCV(rf, params, cv = skf)
rf_classifier.fit(X_train_scaled, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 20, 30],
                         'n_estimators': [100, 200, 400, 600, 800, 1000]})

In [13]:
rf_classifier.best_score_

0.9722499999999998

In [17]:
pd.DataFrame(rf_classifier.cv_results_).sort_values(by='rank_test_score').head(5)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
17,5.091328,0.0178,0.120903,0.000793,30,1000,"{'max_depth': 30, 'n_estimators': 1000}",0.96875,0.965,0.97625,0.9675,0.98125,0.96625,0.9775,0.97375,0.9675,0.97875,0.97225,0.005612,1
14,2.034471,0.014257,0.048623,0.000499,30,400,"{'max_depth': 30, 'n_estimators': 400}",0.96375,0.96375,0.97875,0.96625,0.97875,0.96625,0.97875,0.97375,0.96875,0.98,0.971875,0.006453,2
10,4.078825,0.018589,0.09706,0.000917,20,800,"{'max_depth': 20, 'n_estimators': 800}",0.96625,0.965,0.97625,0.965,0.97875,0.96625,0.97625,0.9725,0.9675,0.98125,0.9715,0.005911,3
13,1.020454,0.010116,0.024776,0.000866,30,200,"{'max_depth': 30, 'n_estimators': 200}",0.96375,0.96625,0.9775,0.97,0.98,0.96375,0.97375,0.97625,0.96625,0.97375,0.971125,0.005631,4
11,5.105967,0.045429,0.121025,0.001403,20,1000,"{'max_depth': 20, 'n_estimators': 1000}",0.96625,0.96375,0.97875,0.96375,0.98125,0.9625,0.97875,0.97375,0.965,0.97625,0.971,0.007045,5


In [14]:
rf_classifier.best_estimator_


RandomForestClassifier(max_depth=30, n_estimators=1000)

The best model for a random forest uses 1000 trees and a max depth of 30 in this case


In [15]:
# Make predictions of the y values for the test set now
predictions_test = rf_classifier.predict(X_test_scaled)

In [16]:
print(f'Test accuracy of best random forest model: {accuracy_score(y_test, predictions_test)}')
print(f'Test f1 score of best random forest model: {f1_score(y_test, predictions_test)}')

Test accuracy of best random forest model: 0.977
Test f1 score of best random forest model: 0.9212328767123288


Test accuracy of best random forest model: 0.977
Test f1 score of best random forest model: 0.9212


## Predict on unseen

In [None]:
unseen = pd.read_csv('data/unseen.csv')
unseen = unseen.drop('customer_id', axis=1)
unseen_X = scaler.transform(unseen)
unseen['pred_card_offer'] = rf_classifier.predict(unseen_X)
print('Predicting on unseen data... \nUnseen data predicted counts:')
print(unseen['pred_card_offer'].value_counts())