In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

## Read data

In [10]:
data = pd.read_csv('data/complete.csv')

## Split into train and test

In [10]:
X = data.loc[:, data.columns != 'card_offer'].values
y = data['card_offer'].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=0,
                                                   stratify=y)

## Standardize

In [10]:
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

## Create model, cross validate

- For this part, the GridSearchCV object will try the different combos of parameters for us
- To use it, we just pass the model and the parameters we want it to try in a dictionary.
- Each key of the dictionary is associate with a parameter of the model, and each key is a list of values to try for that paramter
- For example, LogisticRegression has parameters called 'penalty' and 'C', so I use those as keys and associate them with a list of values to try for that parameter.

In [14]:
model = LogisticRegression(solver='liblinear', random_state=0)
parameters = {'penalty': ['l1', 'l2'], 
              'C': [.0001, .0005, .001, .005,  .01, .05,  1, 5, 10, 50, 100, 500, 1000]}

skf = StratifiedKFold(n_splits=10)
classifier = GridSearchCV(model, parameters, cv=skf)
classifier.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=0, solver='liblinear'),
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 1, 5,
                               10, 50, 100, 500, 1000],
                         'penalty': ['l1', 'l2']})

## Print out top 5 models according to mean test fold score
- Look at mean_test_score column

In [18]:
pd.DataFrame(classifier.cv_results_).sort_values(by='rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
12,0.188196,0.070659,0.000499,0.000499,1,l1,"{'C': 1, 'penalty': 'l1'}",0.96375,0.96625,0.9675,0.96,0.9725,0.96625,0.9575,0.96125,0.9575,0.96625,0.963875,0.004557,1
16,0.25033,0.044806,0.000599,0.000489,10,l1,"{'C': 10, 'penalty': 'l1'}",0.96375,0.96375,0.96625,0.96,0.96875,0.965,0.96,0.96,0.95875,0.9675,0.963375,0.003356,2
14,0.304977,0.04382,0.000606,0.000495,5,l1,"{'C': 5, 'penalty': 'l1'}",0.96375,0.96375,0.96625,0.96,0.96875,0.965,0.96,0.96,0.95875,0.9675,0.963375,0.003356,2
18,0.260307,0.043067,0.000599,0.000489,50,l1,"{'C': 50, 'penalty': 'l1'}",0.96375,0.96375,0.965,0.96,0.96875,0.96375,0.96,0.96,0.95875,0.9675,0.963125,0.003223,4
20,0.278653,0.05225,0.000598,0.000488,100,l1,"{'C': 100, 'penalty': 'l1'}",0.96375,0.96375,0.965,0.96,0.9675,0.96375,0.96,0.96,0.95875,0.9675,0.963,0.003021,5


## Pick best model

In [20]:
best_model = LogisticRegression(penalty='l1', C=1, solver='liblinear', random_state=0)
best_model.fit(X_train, y_train)
print(f'Test set accuracy of best model: {best_model.score(X_test, y_test)}')

Test set accuracy of best model: 0.959
