# Your Mission, should you choose to accept it...

To hyperparameter tune and extract every ounce of accuracy out of this telecom customer churn dataset: <https://drive.google.com/file/d/1dfbAsM9DwA7tYhInyflIpZnYs7VT-0AQ/view> 

## Requirements

- Load the data
- Clean the data if necessary (it will be)
- Create and fit a baseline Keras MLP model to the data.
- Hyperparameter tune (at least) the following parameters:
 - batch_size
 - training epochs
 - optimizer
 - learning rate (if applicable to optimizer)
 - momentum (if applicable to optimizer)
 - activation functions
 - network weight initialization
 - dropout regularization
 - number of neurons in the hidden layer
 
 You must use Grid Search and Cross Validation for your initial pass of the above hyperparameters
 
 Try and get the maximum accuracy possible out of this data! You'll save big telecoms millions! Doesn't that sound great?


In [1]:
!wget https://raw.githubusercontent.com/cocoisland/DS-Unit-4-Sprint-3-Neural-Networks/master/module4-Hyperparameter-Tuning/WA_Fn-UseC_-Telco-Customer-Churn.csv

--2019-04-04 20:54:05--  https://raw.githubusercontent.com/cocoisland/DS-Unit-4-Sprint-3-Neural-Networks/master/module4-Hyperparameter-Tuning/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’


2019-04-04 20:54:10 (21.5 MB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’ saved [977501/977501]



In [3]:
!ls

sample_data  WA_Fn-UseC_-Telco-Customer-Churn.csv


In [0]:
import pandas as pd

# Read the Telco churn CSV fetched by the wget cell into a DataFrame.
csv_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
print(df.shape)
# Transposed preview: each column becomes a row, which is easier to scan.
df.head().T

In [0]:
!pip install category_encoders

In [51]:
# Display the raw column names (customerID key, feature columns, Churn target).
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [64]:
import category_encoders as ce
import numpy as np
from sklearn.preprocessing import StandardScaler

# Binary target: 'Yes' (churned) -> 1, 'No' -> 0.
churn_codes = {'Yes': 1, 'No': 0}
y = df['Churn'].map(churn_codes)

# Features are everything except the target and customerID, which is a
# unique identifier and therefore pure noise for the model.
df1 = df.drop(columns=['customerID', 'Churn'])

# Single-space entries in TotalCharges are replaced by 0 so the column can be
# cast to float.
total_charges = df1['TotalCharges'].replace(' ', 0)
df1['TotalCharges'] = total_charges.astype(float)

# Columns handed to the ordinal encoder (each category becomes an integer code).
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
    'DeviceProtection',
]
encoder = ce.OrdinalEncoder(handle_unknown='ignore', cols=categorical_cols)
X_enc = encoder.fit_transform(df1)

# Standardise every feature column before it is fed to the network.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# cross-validation split - confirm this leakage is acceptable.
scaler = StandardScaler()
X_enc_std = scaler.fit_transform(X_enc)




  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [0]:
# Show the dtype of each encoded feature column, then the number of missing
# values in each column of the raw frame.
print(X_enc.dtypes)
missing_per_column = df.isna().sum()
print(missing_per_column)

In [66]:
# Sanity check: the feature matrix and the target should have matching row counts.
X_enc_std.shape, y.shape

((7043, 19), (7043,))

In [74]:
import numpy
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# Seed NumPy's global random generator so NumPy-driven randomness repeats.
# NOTE(review): this does not explicitly seed the Keras backend - confirm
# whether backend-level seeding is needed for repeatable scores.
seed = 7
numpy.random.seed(seed)


# Inputs (X) and labels (Y) consumed by the grid searches.
X, Y = X_enc_std, y

# Model factory passed to KerasClassifier through build_fn.
def create_model(optimizer='adam'):
    """Build and compile the baseline MLP (19 inputs, 12 ReLU units, 1 sigmoid).

    Args:
        optimizer: Optimizer name (or instance) forwarded to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model, using binary cross-entropy loss
        and tracking accuracy.
    """
    layers = [
        Dense(12, input_dim=19, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Wrap the model factory so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Search space for this pass.
# An earlier pass over batch_size in [10, 20, 40, 60, 80, 100] and
# epochs in [20, 40] picked batch_size=10 and epochs=20, so both are pinned
# here and only the optimizer varies.
# Assignment hyperparameters not covered by this pass: learning rate and
# momentum (where the optimizer supports them), activation functions,
# network weight initialization, dropout regularization, and the number of
# neurons in the hidden layer.
optimizers = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(batch_size=[10], epochs=[20], optimizer=optimizers)

# Cross-validated grid search over every combination in param_grid.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X, Y)

# Print the winning configuration, then the mean/std test score per candidate.
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")



Best: 0.8043447364617288 using {'batch_size': 10, 'epochs': 20, 'optimizer': 'Adam'}
Means: 0.8039187818160538, Stdev: 0.0031694817776395153 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'SGD'}
Means: 0.8012210670290016, Stdev: 0.006121639617271853 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'RMSprop'}
Means: 0.8006531282546387, Stdev: 0.004257068249722669 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'Adagrad'}
Means: 0.80349282623099, Stdev: 0.0014368167655811402 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'Adadelta'}
Means: 0.8043447364617288, Stdev: 0.0025899104512118196 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'Adam'}
Means: 0.7989493085802168, Stdev: 0.004561942137594733 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'Adamax'}
Means: 0.8009370980945887, Stdev: 0.002997477342900473 with: {'batch_size': 10, 'epochs': 20, 'optimizer': 'Nadam'}


### Kernel Initializer / Network weight Initialization

In [86]:
def create_model(init_mode='uniform'):
    """Build and compile the 19-12-1 MLP with a configurable weight initializer.

    Args:
        init_mode: Keras initializer name used as ``kernel_initializer`` for
            both Dense layers.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer with ``lr=0.01``, accuracy metric).
    """
    # Imported locally so this cell is self-contained: the notebook's only
    # keras.optimizers import lives in a later cell, so running the cells
    # top-to-bottom raised NameError on `adam`. `Adam` is the documented class
    # name; the lowercase `adam` is only an alias for it.
    from keras.optimizers import Adam

    model = Sequential()
    model.add(Dense(12, input_dim=19, kernel_initializer=init_mode, activation='relu'))
    model.add(Dense(1, kernel_initializer=init_mode, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    return model

# Expose the model factory to scikit-learn as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Batch size and epochs stay pinned; only the weight initializer varies.
initializers = [
    'uniform', 'lecun_uniform', 'normal', 'zero',
    'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform',
]
param_grid = dict(batch_size=[10], epochs=[20], init_mode=initializers)

# Cross-validated grid search over the initializers.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X, Y)

# Print the winning configuration, then the mean/std test score per candidate.
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")



Best: 0.8015050369620441 using {'batch_size': 10, 'epochs': 20, 'init_mode': 'glorot_normal'}
Means: 0.796251594148609, Stdev: 0.0034340707284342417 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'uniform'}
Means: 0.7949737301861953, Stdev: 0.006954058603008513 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'lecun_uniform'}
Means: 0.7925599853406486, Stdev: 0.0027742655879583033 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'normal'}
Means: 0.7346301292105356, Stdev: 0.0046012089443491205 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'zero'}
Means: 0.8015050369620441, Stdev: 0.0016016848503678756 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'glorot_normal'}
Means: 0.7990912939444973, Stdev: 0.0042196159009435085 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'glorot_uniform'}
Means: 0.7986653386894891, Stdev: 0.004068461780134848 with: {'batch_size': 10, 'epochs': 20, 'init_mode': 'he_normal'}
Means: 0.793695865919115, Stdev: 0.0038658844922868658 with

### Learn rate and momentum for SGD only.

In [76]:

from keras.optimizers import SGD, adam

# Model factory passed to KerasClassifier through build_fn.
def create_model(learn_rate=0.01, momentum=0):
    """Build the 19-12-1 MLP and compile it with a configurable SGD optimizer.

    Args:
        learn_rate: Value passed to ``SGD`` as ``lr``.
        momentum: Value passed to ``SGD`` as ``momentum``.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    sgd_optimizer = SGD(lr=learn_rate, momentum=momentum)

    layers = [
        Dense(12, input_dim=19, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer=sgd_optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Seed NumPy's global random generator before the search.
seed = 7
numpy.random.seed(seed)

# Epochs and batch size are fixed on the wrapper, so the grid below only
# varies the SGD settings.
model = KerasClassifier(build_fn=create_model, epochs=20, batch_size=10, verbose=0)

# Candidate SGD learning rates and momentum values (5 x 6 combinations).
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
param_grid = {'learn_rate': learn_rate, 'momentum': momentum}

# n_jobs=-1 asks scikit-learn to use all available cores for the fits.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X, Y)

# Print the winning configuration, then "mean (std) with: params" per candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.804487 using {'learn_rate': 0.01, 'momentum': 0.0}
0.776374 (0.003394) with: {'learn_rate': 0.001, 'momentum': 0.0}
0.784609 (0.005110) with: {'learn_rate': 0.001, 'momentum': 0.2}
0.789720 (0.003539) with: {'learn_rate': 0.001, 'momentum': 0.4}
0.795542 (0.001164) with: {'learn_rate': 0.001, 'momentum': 0.6}
0.798807 (0.003370) with: {'learn_rate': 0.001, 'momentum': 0.8}
0.795542 (0.007016) with: {'learn_rate': 0.001, 'momentum': 0.9}
0.804487 (0.003560) with: {'learn_rate': 0.01, 'momentum': 0.0}
0.800511 (0.004409) with: {'learn_rate': 0.01, 'momentum': 0.2}
0.802357 (0.003225) with: {'learn_rate': 0.01, 'momentum': 0.4}
0.801789 (0.003528) with: {'learn_rate': 0.01, 'momentum': 0.6}
0.798807 (0.004882) with: {'learn_rate': 0.01, 'momentum': 0.8}
0.798097 (0.003443) with: {'learn_rate': 0.01, 'momentum': 0.9}
0.799801 (0.001602) with: {'learn_rate': 0.1, 'momentum': 0.0}
0.799375 (0.004316) with: {'learn_rate': 0.1, 'momentum': 0.2}
0.788158 (0.008392) with: {'learn_rate': 

### Best Parameter
1. kernel initialization = glorot normal
2. batch size = 10
3. epoch = 20
4. activation = linear
5. learn rate = 0.01 , momentum=0 for SGD


But the cross-validated accuracy hovers around 0.80 for every configuration tried.

In [88]:

def create_model(activation='relu'):
    """Build and compile the 19-12-1 MLP with a configurable hidden activation.

    Both Dense layers use the ``glorot_normal`` initializer; the output layer
    always uses a sigmoid.

    Args:
        activation: Keras activation name for the 12-unit hidden layer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer with ``lr=0.01``, accuracy metric).
    """
    # Imported locally so this cell does not depend on another cell having
    # imported the lowercase `adam` alias; `Adam` is the documented class name.
    from keras.optimizers import Adam

    model = Sequential()
    model.add(Dense(12, input_dim=19, kernel_initializer='glorot_normal', activation=activation))
    model.add(Dense(1, kernel_initializer='glorot_normal', activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.01), metrics=['accuracy'])
    return model

# Expose the model factory to scikit-learn as an estimator.
model = KerasClassifier(build_fn=create_model, verbose=0)

# Batch size and epochs stay pinned; only the hidden-layer activation varies.
# init_mode is not part of this grid: the model factory hard-codes
# glorot_normal for both layers.
activations = [
    'softmax', 'softplus', 'softsign', 'relu',
    'tanh', 'sigmoid', 'hard_sigmoid', 'linear',
]
param_grid = dict(batch_size=[10], epochs=[20], activation=activations)

# Cross-validated grid search over the activations.
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X, Y)

# Print the winning configuration, then the mean/std test score per candidate.
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")




Best: 0.8043447358439325 using {'activation': 'linear', 'batch_size': 10, 'epochs': 20}
Means: 0.7932699110957179, Stdev: 0.002130368254130403 with: {'activation': 'softmax', 'batch_size': 10, 'epochs': 20}
Means: 0.7958256397822119, Stdev: 0.0006843002359515638 with: {'activation': 'softplus', 'batch_size': 10, 'epochs': 20}
Means: 0.7870225742444487, Stdev: 0.004911288676461447 with: {'activation': 'softsign', 'batch_size': 10, 'epochs': 20}
Means: 0.7927019706541514, Stdev: 0.00466220651678623 with: {'activation': 'relu', 'batch_size': 10, 'epochs': 20}
Means: 0.7939798363599353, Stdev: 0.0036487194615499817 with: {'activation': 'tanh', 'batch_size': 10, 'epochs': 20}
Means: 0.7969615191927895, Stdev: 0.00215604614704625 with: {'activation': 'sigmoid', 'batch_size': 10, 'epochs': 20}
Means: 0.7932699110110882, Stdev: 0.008190555304269431 with: {'activation': 'hard_sigmoid', 'batch_size': 10, 'epochs': 20}
Means: 0.8043447358439325, Stdev: 0.0043974256928411976 with: {'activation': '

## Stretch Goals:

- Try to implement Random Search Hyperparameter Tuning on this dataset
- Try to implement Bayesian Optimization tuning on this dataset
- Practice hyperparameter tuning other datasets that we have looked at. How high can you get MNIST? Above 99%?
- Study for the Sprint Challenge
 - Can you implement both perceptron and MLP models from scratch with forward and backpropagation?
 - Can you implement both perceptron and MLP models in keras and tune their hyperparameters with cross validation?