In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Loading data

In [2]:
X_train = pd.read_csv('./original_data/X_train.csv')
X_test = pd.read_csv('./original_data/X_test.csv')
y_train = pd.read_csv('./original_data/y_train.csv',header = None)
y_test = pd.read_csv('./original_data/y_test.csv',header = None)

In [3]:
X_train.shape

(35000, 15)

In [4]:
y_train.shape

(35000, 2)

In [5]:
X_train.drop(columns = ['Unnamed: 0'], inplace = True)
X_test.drop(columns = ['Unnamed: 0'], inplace = True)

In [6]:
y_train.drop(columns = [y_train.columns.values.tolist()[0]], inplace = True)
y_test.drop(columns = [y_test.columns.values.tolist()[0]], inplace = True)

In [7]:
y_train.columns = ['subscribed']
y_test.columns = ['subscribed']

In [8]:
X_train.shape

(35000, 14)

In [9]:
y_train = y_train['subscribed']

In [48]:
baseline = 1 - y_train.mean()

In [49]:
baseline

0.7034571428571428

### Feature Selection

In [10]:
from cLiML.model_prep import feature_select_logistic_reg

In [11]:
df_feature = feature_select_logistic_reg(X_train, y_train, cv = 5)

In [12]:
df_feature

Unnamed: 0,variable_RFECV,support,ranking,variable_skb,score
0,time_in_product_mins,True,1,time_in_product_mins,717.519739
1,country_AU,True,1,referrer_channels_Direct,318.436061
2,devices_Desktop,True,1,devices_Desktop,201.48601
3,referrer_channels_Direct,True,1,campaigns_au_nz_mobile_product_launch,157.452346
4,referrer_channels_None,True,1,referrer_channels_Organic,86.253533
5,referrer_channels_Organic,True,1,campaigns_au_nz_eofy,66.939779
6,campaigns_au_nz_eofy,True,1,referrer_channels_None,2.475957
7,campaigns_au_nz_mobile_product_launch,True,1,industries_Healthcare,2.188615
8,industries_Construction,True,1,industries_Hospitality,1.021016
9,industries_Education,True,1,industries_Education,0.623498


### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_train, y_train)

0.7049714285714286

In [15]:
lr.score(X_test, y_test)

0.7054666666666667

In [16]:
lr_coef_df = pd.DataFrame({'Variables': X_train.columns.tolist(),
                          'Coefficients': lr.coef_[0].tolist(),
                          'Abs_Coefs': abs(lr.coef_[0])})

In [17]:
lr_coef_df.sort_values('Abs_Coefs', ascending = False)

Unnamed: 0,Variables,Coefficients,Abs_Coefs
4,referrer_channels_Direct,0.473253,0.473253
3,devices_Desktop,0.315306,0.315306
5,referrer_channels_None,0.196253,0.196253
12,industries_Healthcare,-0.056591,0.056591
7,campaigns_au_nz_eofy,0.046328,0.046328
6,referrer_channels_Organic,-0.04353,0.04353
9,industries_Construction,-0.038201,0.038201
11,industries_Finance,-0.031434,0.031434
2,country_AU,-0.019307,0.019307
8,campaigns_au_nz_mobile_product_launch,-0.01452,0.01452


As can be seen here, the referral type of "Direct" and device type of "Desktop" are the highest indicator of the user will subscribe to the service.

In [18]:
from sklearn.model_selection import cross_val_score
LR = LogisticRegression()
cv_scores = cross_val_score(LR, X_train, y_train, cv = 10)
print(cv_scores)
print(cv_scores.mean())

[0.70694087 0.70542857 0.70571429 0.706      0.70428571 0.70657143
 0.70314286 0.70342857 0.70628571 0.70248642]
0.7050284441585902


Cross validation is just conducted to ensure that result obtained from the train test split is represetative. For smaller data sets, the splitting of data set may be biased, causing the model accuracy to fluctuate wildly depending on how the data set were cut. This problem doesn't usually occur for data set of signifcant size.

### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [20]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
rfc.score(X_train, y_train)

0.9671428571428572

In [22]:
rfc.score(X_test, y_test)

0.6528666666666667

### Random Forest Grid Search

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
params = {'max_depth': [1,5,10,14],
         'max_features': [1,5,10,14]}

rfct = RandomForestClassifier()
rfct_gs = GridSearchCV(rfct, params, n_jobs = -1, cv = 10)
rfct_gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 10, 14], 'max_features': [1, 5, 10, 14]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
rfct_gs.best_params_

{'max_depth': 5, 'max_features': 14}

In [26]:
rfct_gs.score(X_train, y_train)

0.7084285714285714

In [27]:
rfct_gs.score(X_test, y_test)

0.7044666666666667

Usually, I don't bother to tune hyper-parameter if the test accuracy is low. Hyper-parameter tuning is extremely good at getting rid of overfitting, but it does very little to raise the overall accuracy of the model. Given our model's accuracy is around 55%, doing further hyper-parameter tuning will not bring it up much.

### CATBOOST

In [28]:
from catboost import CatBoostClassifier, Pool, cv

In [29]:
from sklearn.metrics import accuracy_score

In [30]:
catboost = CatBoostClassifier(custom_loss=['Accuracy'], logging_level='Silent')

catboost.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x1a1f6b6e48>

In [31]:
catboost.score(X_train, y_train)

0.7082285714285714

In [32]:
catboost.score(X_test, y_test)

0.7046

### Neural Network

In [33]:
from keras.models import Sequential
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras import optimizers


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [34]:
X_train.shape

(35000, 14)

In [35]:
model = Sequential()
model.add(Dense(units = 14, activation = 'relu', kernel_initializer = 'uniform', input_dim = 14))
model.add(Dense(units = 14, activation = 'relu', kernel_initializer = 'uniform', input_dim = 14))
model.add(Dense(units = 14, activation = 'relu', kernel_initializer = 'uniform', input_dim = 14))
model.add(Dense(units = 1, activation = 'sigmoid'))

In [36]:
model.compile(loss = 'binary_crossentropy', optimizer = optimizers.Adam(lr = 0.001), metrics = ['binary_accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_2 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 15        
Total params: 645
Trainable params: 645
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.fit(X_train, y_train, batch_size = 30, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x111a19a90>

In [38]:
y_pred = model.predict(X_test)

In [39]:
y_pred = (y_pred > 0.5)

In [40]:
from sklearn.metrics import accuracy_score

In [41]:
accuracy_score(y_test, y_pred)

0.7034666666666667

In [42]:
y_train_pred = model.predict(X_train)

In [43]:
y_train_pred = (y_train_pred > 0.5)

In [44]:
accuracy_score(y_train, y_train_pred)

0.7034571428571429