In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score

## Read data

In [28]:
data = pd.read_csv('data/complete.csv')

In [29]:
data = data.drop('customer_id', axis=1)

## Split into train and test

In [30]:
X = data.loc[:, data.columns != 'card_offer'].values
y = data['card_offer'].values

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=0,
                                                   stratify=y)

## Standardize

In [31]:
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

## Create model, cross validate

- For this part, the GridSearchCV object will try the different combos of parameters for us
- To use it, we just pass the model and the parameters we want it to try in a dictionary.
- Each key of the dictionary is associate with a parameter of the model, and each key is a list of values to try for that paramter
- For example, LogisticRegression has parameters called 'penalty' and 'C', so I use those as keys and associate them with a list of values to try for that parameter.

In [32]:
model = LogisticRegression(solver='liblinear', random_state=0)
parameters = {'penalty': ['l1', 'l2'], 
              'C': [.0001, .0005, .001, .005,  .01, .05,  1, 5, 10, 50, 100, 500, 1000]}

skf = StratifiedKFold(n_splits=10)
classifier = GridSearchCV(model, parameters, cv=skf, scoring="f1")
classifier.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=LogisticRegression(random_state=0, solver='liblinear'),
             param_grid={'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 1, 5,
                               10, 50, 100, 500, 1000],
                         'penalty': ['l1', 'l2']},
             scoring='f1')

## Print out top 5 models according to mean test fold score
- Look at mean_test_score column

In [33]:
pd.DataFrame(classifier.cv_results_).sort_values(by='rank_test_score').head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
20,0.115691,0.018275,0.001396,0.000488,100,l1,"{'C': 100, 'penalty': 'l1'}",0.885246,0.887097,0.896266,0.865801,0.893443,0.880342,0.876033,0.878431,0.867769,0.887967,0.881839,0.009604,1
18,0.115986,0.020649,0.001401,0.000487,50,l1,"{'C': 50, 'penalty': 'l1'}",0.885246,0.887097,0.896266,0.865801,0.893443,0.880342,0.876033,0.878431,0.867769,0.887967,0.881839,0.009604,1
22,0.119688,0.018006,0.001592,0.000485,500,l1,"{'C': 500, 'penalty': 'l1'}",0.885246,0.887097,0.896266,0.865801,0.893443,0.880342,0.876033,0.875,0.867769,0.887967,0.881496,0.009779,3
24,0.12716,0.021612,0.001493,0.000495,1000,l1,"{'C': 1000, 'penalty': 'l1'}",0.885246,0.887097,0.896266,0.865801,0.893443,0.880342,0.876033,0.875,0.867769,0.887967,0.881496,0.009779,3
16,0.107218,0.001568,0.001292,0.000451,10,l1,"{'C': 10, 'penalty': 'l1'}",0.881633,0.887097,0.896266,0.865801,0.893443,0.885106,0.871369,0.878431,0.867769,0.887967,0.881488,0.009969,5


## Pick best model

In [34]:
np.set_printoptions(suppress=True)

In [35]:
best_model = LogisticRegression(penalty='l1', C=1, solver='liblinear', random_state=0)
best_model.fit(X_train, y_train)
print(f"The f1 score of the best LR model is: {f1_score(y_test, classifier.predict(X_test))}")
print(f'Test set accuracy of best LR model: {best_model.score(X_test, y_test)}')

The f1 score of the best LR model is: 0.863481228668942
Test set accuracy of best LR model: 0.96
Coefficients of best model:
est_income 0.00013782401889480045
hold_bal -0.10881197474246855
pref_cust_prob 23.38367533781703
imp_cscore 0.010442577208516808
RiskScore -0.002070193626170034
imp_crediteval -0.09102586592520628
axio_score 0.0
card_offer -0.9472957040713645
demographic_slice_AX03efs -0.16607698682501829
demographic_slice_BWEsk45 -0.969617140627039
demographic_slice_CARDIF2 0.15620207830246638
demographic_slice_DERS3w5 -4.860501220124651
country_reg_E -11.153052249109228
country_reg_W -6.279707635136627
ad_exp_N -6.377802701469967
