In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Loading data

In [2]:
# Load the pre-split feature matrices and targets from ./oversampled_data.
X_train = pd.read_csv('./oversampled_data/X_train.csv')
X_test = pd.read_csv('./oversampled_data/X_test.csv')
# The target files are read with header = None, so their columns get integer labels.
y_train = pd.read_csv('./oversampled_data/y_train.csv',header = None)
y_test = pd.read_csv('./oversampled_data/y_test.csv',header = None)

In [3]:
# Sanity check: (rows, columns) of the training features as loaded.
X_train.shape

(49242, 15)

In [4]:
# Sanity check: (rows, columns) of the training target as loaded.
y_train.shape

(49242, 2)

In [5]:
# Remove the 'Unnamed: 0' column (presumably the index written out with the CSV — confirm).
# Rebinding instead of inplace = True, together with errors = 'ignore', keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
X_train = X_train.drop(columns = ['Unnamed: 0'], errors = 'ignore')
X_test = X_test.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [6]:
# Discard, in place, the first column of each target frame
# (presumably the saved index — confirm), leaving only the label column.
for target_frame in (y_train, y_test):
    first_col = target_frame.columns[0]
    target_frame.drop(columns = [first_col], inplace = True)

In [7]:
# Give the remaining label column a meaningful name in both target frames.
y_train.columns = y_test.columns = ['subscribed']

In [8]:
# Confirm the column count after dropping 'Unnamed: 0'.
X_train.shape

(49242, 14)

In [9]:
# Collapse the single-column target frames into 1-D Series.
# Both splits are converted so train and test targets have the same type, and
# squeeze() leaves an existing multi-row Series untouched, so re-running is safe.
y_train = y_train.squeeze()
y_test = y_test.squeeze()

In [10]:
# Display the training target; pandas abbreviates the middle of a long Series in the output.
y_train

0        1
1        1
2        0
3        0
4        1
5        0
6        1
7        0
8        0
9        1
10       1
11       1
12       1
13       0
14       1
15       0
16       1
17       0
18       0
19       1
20       1
21       0
22       0
23       1
24       1
25       0
26       0
27       0
28       1
29       0
        ..
49212    1
49213    1
49214    1
49215    0
49216    1
49217    0
49218    1
49219    0
49220    1
49221    1
49222    0
49223    0
49224    0
49225    1
49226    0
49227    1
49228    0
49229    1
49230    0
49231    1
49232    1
49233    1
49234    1
49235    1
49236    1
49237    0
49238    0
49239    0
49240    1
49241    1
Name: subscribed, Length: 49242, dtype: int64

### Feature Selection

In [11]:
from cLiML.model_prep import feature_select_logistic_reg

In [12]:
# Run the project's feature-selection helper on the training data with cv = 5.
# NOTE(review): judging by the columns of the returned frame, it reports an RFECV
# support/ranking alongside a separate per-variable score — confirm against cLiML.model_prep.
df_feature = feature_select_logistic_reg(X_train, y_train, cv = 5)

In [13]:
# Display the feature-selection results table.
df_feature

Unnamed: 0,variable_RFECV,support,ranking,variable_skb,score
0,time_in_product_mins,True,1,time_in_product_mins,1172.951983
1,country_AU,True,1,referrer_channels_Direct,484.66647
2,devices_Desktop,True,1,devices_Desktop,254.802811
3,referrer_channels_Direct,True,1,campaigns_au_nz_mobile_product_launch,182.590298
4,referrer_channels_None,True,1,referrer_channels_Organic,149.282323
5,referrer_channels_Organic,True,1,campaigns_au_nz_eofy,75.063839
6,campaigns_au_nz_eofy,True,1,industries_Healthcare,8.970992
7,campaigns_au_nz_mobile_product_launch,True,1,industries_Education,2.43255
8,industries_Construction,True,1,referrer_channels_None,2.113623
9,industries_Finance,True,1,industries_Hospitality,0.890609


### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Baseline: logistic regression with default parameters; the cell displays training accuracy.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_train, y_train)

0.5841761098249462

In [16]:
# Accuracy of the fitted logistic regression on the held-out test split.
lr.score(X_test, y_test)

0.5944844579226687

In [17]:
# One row per feature: the fitted weight plus its magnitude (used for ranking).
lr_coef_df = (
    pd.DataFrame({'Variables': X_train.columns.tolist(),
                  'Coefficients': lr.coef_[0].tolist()})
    .assign(Abs_Coefs = lambda frame: frame['Coefficients'].abs())
)

In [18]:
# Rank features by coefficient magnitude, largest first.
lr_coef_df.sort_values('Abs_Coefs', ascending = False)

Unnamed: 0,Variables,Coefficients,Abs_Coefs
4,referrer_channels_Direct,0.45097,0.45097
3,devices_Desktop,0.318998,0.318998
5,referrer_channels_None,0.178581,0.178581
12,industries_Healthcare,-0.08318,0.08318
6,referrer_channels_Organic,-0.057689,0.057689
11,industries_Finance,-0.040504,0.040504
8,campaigns_au_nz_mobile_product_launch,0.033251,0.033251
9,industries_Construction,-0.031827,0.031827
7,campaigns_au_nz_eofy,0.024958,0.024958
2,country_AU,-0.018894,0.018894


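As can be seen here, the "Direct" referral channel and the "Desktop" device type are the strongest indicators that a user will subscribe to the service. Note that coefficient magnitudes are only directly comparable between features measured on the same scale.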

In [19]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a fresh logistic regression on the training set;
# prints the per-fold scores followed by their mean.
LR = LogisticRegression()
cv_scores = cross_val_score(estimator = LR, X = X_train, y = y_train, cv = 10)
print(cv_scores, cv_scores.mean(), sep = '\n')

[0.57998376 0.58874898 0.58184403 0.58184403 0.59727864 0.56539399
 0.58874898 0.59687246 0.57656377 0.58143786]
0.583871649725441


Cross-validation is conducted simply to ensure that the result obtained from the train/test split is representative. For smaller data sets, the split may be biased, causing the model accuracy to fluctuate wildly depending on how the data set was cut. This problem doesn't usually occur for data sets of significant size.

### Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [21]:
# Fit a random forest with library-default hyper-parameters as an untuned reference.
# NOTE(review): no random_state is passed, so the fitted forest (and the scores
# reported below) can differ between runs.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
# Training accuracy of the default random forest.
rfc.score(X_train, y_train)

0.9919987002964948

In [23]:
# Test accuracy of the default random forest; compare with the training accuracy above to gauge overfitting.
rfc.score(X_test, y_test)

0.5274355572403336

### Random Forest Grid Search

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
# Search grid: maximum tree depth and number of features considered per split.
params = {'max_depth': [1,5,10,14],
         'max_features': [1,5,10,14]}

# Exhaustive search over the 4 x 4 grid with 10-fold CV, using all CPU cores (n_jobs = -1).
rfct = RandomForestClassifier()
rfct_gs = GridSearchCV(rfct, params, n_jobs = -1, cv = 10)
rfct_gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 10, 14], 'max_features': [1, 5, 10, 14]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [26]:
# Hyper-parameter combination with the best mean cross-validation score.
rfct_gs.best_params_

{'max_depth': 14, 'max_features': 1}

In [27]:
# Training accuracy of the best estimator found by the grid search.
rfct_gs.score(X_train, y_train)

0.8167011900410219

In [28]:
# Test accuracy of the best estimator found by the grid search.
rfct_gs.score(X_test, y_test)

0.5502274450341168

Usually, I don't bother to tune hyper-parameters if the test accuracy is low. Hyper-parameter tuning is extremely good at getting rid of overfitting, but it does very little to raise the overall accuracy of the model. Given that our model's test accuracy is around 55%, further hyper-parameter tuning will not bring it up much.

### CATBOOST

In [29]:
from catboost import CatBoostClassifier, Pool, cv

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
# CatBoost classifier that additionally tracks Accuracy; logging_level='Silent' suppresses training output.
catboost = CatBoostClassifier(custom_loss=['Accuracy'], logging_level='Silent')

# Fit on the full training split with otherwise default settings.
catboost.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1a1e14e320>

In [32]:
# Training accuracy of the CatBoost model.
catboost.score(X_train, y_train)

0.6051135209780268

In [33]:
# Test accuracy of the CatBoost model.
catboost.score(X_test, y_test)

0.5893669446550417

### Neural Network

In [52]:
from keras.models import Sequential
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras import optimizers


In [53]:
# Check the number of feature columns, which sets the network's input dimension.
X_train.shape

(49242, 14)

In [76]:
# Feed-forward binary classifier: three 14-unit ReLU hidden layers and one sigmoid output unit.
# The input width is taken from the data instead of being hard-coded.
n_features = X_train.shape[1]

model = Sequential()
# Only the first layer needs the input dimension; later layers infer it from
# the layer before them, so repeating input_dim there was redundant.
model.add(Dense(units = 14, activation = 'relu', kernel_initializer = 'uniform', input_dim = n_features))
model.add(Dense(units = 14, activation = 'relu', kernel_initializer = 'uniform'))
model.add(Dense(units = 14, activation = 'relu', kernel_initializer = 'uniform'))
# Single sigmoid unit outputs a value in (0, 1) for the positive class.
model.add(Dense(units = 1, activation = 'sigmoid'))

In [77]:
# Binary cross-entropy loss, Adam optimizer with lr = 0.001, reporting binary accuracy.
model.compile(loss = 'binary_crossentropy', optimizer = optimizers.Adam(lr = 0.001), metrics = ['binary_accuracy'])
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 14)                210       
_________________________________________________________________
dense_27 (Dense)             (None, 14)                210       
_________________________________________________________________
dense_28 (Dense)             (None, 14)                210       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 15        
Total params: 645
Trainable params: 645
Non-trainable params: 0
_________________________________________________________________


In [78]:
# Train for 20 epochs with mini-batches of 30 samples.
model.fit(X_train, y_train, batch_size = 30, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a3a290908>

In [79]:
# Raw network outputs (sigmoid activations) for the test set.
y_pred = model.predict(X_test)

In [80]:
# Threshold at 0.5 to obtain boolean class predictions.
y_pred = (y_pred > 0.5)

In [74]:
from sklearn.metrics import accuracy_score

In [81]:
# Test-set accuracy of the thresholded network predictions.
accuracy_score(y_test, y_pred)

0.5899355572403336

In [82]:
# Raw network outputs for the training set, for comparison with the test results.
y_train_pred = model.predict(X_train)

In [83]:
# Threshold at 0.5 to obtain boolean class predictions.
y_train_pred = (y_train_pred > 0.5)

In [84]:
# Training-set accuracy of the thresholded network predictions.
accuracy_score(y_train, y_train_pred)

0.5833231793996995