In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier

## Read data

In [28]:
data = pd.read_csv('data/complete.csv')

In [29]:
data = data.drop('customer_id', axis=1)

## Split into train and test

In [30]:
X = data.loc[:, data.columns != 'card_offer'].values
y = data['card_offer'].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=0,
                                                   stratify=y)

## Standardize

In [31]:
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

## Create model, cross validate

- For this part, the GridSearchCV object will try the different combos of parameters for us
- To use it, we just pass the model and the parameters we want it to try in a dictionary.
- Each key of the dictionary is associate with a parameter of the model, and each key is a list of values to try for that paramter
- For example, LogisticRegression has parameters called 'penalty' and 'C', so I use those as keys and associate them with a list of values to try for that parameter.

### Default Model

In [41]:
model = LogisticRegression(random_state=0)
model.fit(X_train_std, y_train)
print(f"The f1 score of the default LR model is: {f1_score(y_test, model.predict(X_test_std))}")

The f1 score of the default LR model is: 0.8664383561643836


### Hyperparameter Optimization

In [42]:
model = LogisticRegression(solver='liblinear', random_state=0)
parameters = {'penalty': ['l1', 'l2'], 
              'C': [.0001, .0005, .001, .005,  .01, .05,  1, 5, 10, 50, 100, 500, 1000]}

skf = StratifiedKFold(n_splits=10)
classifier = GridSearchCV(model, parameters, cv=skf, scoring="f1")
classifier.fit(X_train_std, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=0, solver='liblinear'),
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 1, 5,
                               10, 50, 100, 500, 1000],
                         'penalty': ['l1', 'l2']},
             scoring='f1')

## Print out top 5 models according to mean test fold score
- Look at mean_test_score column

In [43]:
pd.DataFrame(classifier.cv_results_).sort_values(by='rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
13,0.029521,0.003482,0.001496,0.000492,1,l2,"{'C': 1, 'penalty': 'l2'}",0.889796,0.888889,0.887967,0.86087,0.897119,0.871795,0.871369,0.878431,0.867769,0.887967,0.880197,0.0112,1
15,0.027135,0.00248,0.001292,0.000453,5,l2,"{'C': 5, 'penalty': 'l2'}",0.888889,0.881633,0.887967,0.862069,0.893443,0.876596,0.871369,0.875,0.864198,0.892562,0.879372,0.01078,2
17,0.027926,0.004324,0.001394,0.000487,10,l2,"{'C': 10, 'penalty': 'l2'}",0.888889,0.881633,0.887967,0.858369,0.893443,0.876596,0.871369,0.875,0.864198,0.892562,0.879002,0.011412,3
16,0.082879,0.004225,0.001296,0.000449,10,l1,"{'C': 10, 'penalty': 'l1'}",0.888889,0.881633,0.887967,0.858369,0.893443,0.876596,0.871369,0.875,0.864198,0.892562,0.879002,0.011412,3
14,0.088161,0.004223,0.001297,0.000457,5,l1,"{'C': 5, 'penalty': 'l1'}",0.888889,0.881633,0.887967,0.858369,0.893443,0.876596,0.871369,0.875,0.864198,0.892562,0.879002,0.011412,3


## Pick best model

In [34]:
np.set_printoptions(suppress=True)

In [44]:
best_model = LogisticRegression(penalty='l1', C=1, solver='liblinear', random_state=0)
best_model.fit(X_train_std, y_train)
print(f"The f1 score of the best LR model is: {f1_score(y_test, best_model.predict(X_test_std))}")
print(f'Test set accuracy of best LR model: {best_model.score(X_test_std, y_test)}')

The f1 score of the best LR model is: 0.8688245315161839
Test set accuracy of best LR model: 0.9615


## Baseline

In [48]:
dummy1 = DummyClassifier(strategy='stratified')
dummy2 = DummyClassifier(strategy='most_frequent')

dummy1.fit(X_train_std, y_train)
dummy2.fit(X_train_std, y_train)

print('===========DUMMY MODELS ON TEST SET==========')
print('Dummy 1: Stratified')
print('f1 score:', f1_score(y_test, dummy1.predict(X_test_std)))

print('Dummy 2: Most Frequent')
print('f1 score:', f1_score(y_test, dummy2.predict(X_test_std)))

Dummy 1: Stratified
f1 score: 0.15000000000000002
Dummy 2: Most Frequent
f1 score: 0.0
