# Training notebook

This notebook contains the training code for the Starbucks Capstone Challenge.

It has the following structure:
* Checking the correlation of the features in the training data
* Selecting suitable features based on this
* Shuffling the data
* Splitting the data into train, validation and test sets
* Creating data loaders
* Defining the models
* Finding suitable hyperparameters
* Training the models
* Evaluating the trained models
* Comparing the results
---

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from importlib import reload
from sklearn.model_selection import train_test_split
from source import training_helpers

In [5]:
# Re-import the project helper module so that edits made to
# source/training_helpers.py take effect without restarting the kernel.
reload(training_helpers)

<module 'source.training_helpers' from '/home/ferenc/Documents/Udacity/Machine_Learning_Engineer/Starbucks_Capstone_Project/source/training_helpers.py'>

### Loading the dataset and checking feature correlation

In [7]:
# Location of the standardized training set (path is relative to the
# notebook's working directory).
data_dir, data_file = 'data', 'training_data_standardized.csv'

# Read the training data through the project helper, then preview the
# first rows as the cell output.
training_data_df = training_helpers.load_training_data(
    data_dir=data_dir,
    data_file=data_file,
)
training_data_df.head()

Read training data from data/training_data_standardized.csv


Unnamed: 0,F,M,O,U,age,income,membership_length,av_money_spent,num_received,viewed/received,...,offer_1,offer_2,offer_3,offer_4,offer_5,offer_6,offer_7,offer_8,offer_9,label
0,1.0,0.0,0.0,0.0,1.193501,1.654475,0.132255,-0.763102,-1.206595,-1.292364,...,0,0,1,0,0,0,0,0,0,3
1,0.0,0.0,0.0,1.0,0.0,0.0,-0.011285,-0.763102,-1.206595,-1.292364,...,0,0,0,1,0,0,0,0,0,3
2,0.0,1.0,0.0,0.0,0.792728,0.253302,-1.245015,-0.763102,-1.206595,-1.292364,...,0,0,0,0,0,0,0,0,1,3
3,0.0,0.0,0.0,1.0,0.0,0.0,-0.253576,-0.763102,-1.206595,-1.292364,...,1,0,0,0,0,0,0,0,0,3
4,0.0,1.0,0.0,0.0,0.620969,-0.540695,-0.865163,-0.763102,-1.206595,-1.292364,...,0,0,0,0,0,0,0,1,0,3


In [8]:
# Pairwise correlation between all columns (features and label); the
# matrix is shown as the cell output.
correlation_matrix = training_data_df.corr()
correlation_matrix

Unnamed: 0,F,M,O,U,age,income,membership_length,av_money_spent,num_received,viewed/received,...,offer_1,offer_2,offer_3,offer_4,offer_5,offer_6,offer_7,offer_8,offer_9,label
F,1.0,-0.742123,-0.080854,-0.288567,0.144789,0.222408,0.011123,0.19804,-0.004785,0.001956,...,0.002295,0.00112,-0.000737,0.006749,-0.004738,-0.004614,0.00437,-0.002159,-0.002574,-0.004166
M,-0.742123,1.0,-0.110272,-0.39356,-0.137893,-0.210736,-0.003784,-0.035368,0.002142,-0.027004,...,-0.001056,0.000916,-0.000871,-0.002849,0.003803,0.00532,-0.004631,-0.000889,0.002432,0.00282
O,-0.080854,-0.110272,1.0,-0.042878,-0.003326,-0.013558,-0.009273,0.018863,-0.004134,0.012073,...,-0.008012,0.000654,0.001314,0.002979,0.002733,-0.000603,0.002963,-0.001584,-0.002334,-0.002745
U,-0.288567,-0.39356,-0.042878,1.0,0.000406,0.00173,-0.007125,-0.232751,0.004901,0.033187,...,0.000882,-0.003135,0.001902,-0.006255,0.000195,-0.001151,-0.000275,0.004855,0.000786,0.002587
age,0.144789,-0.137893,-0.003326,0.000406,1.0,0.300572,0.020466,0.080883,-0.001565,0.014495,...,-0.002082,0.00421,-0.0023,0.002606,-0.000767,-6.5e-05,0.005979,7.5e-05,-0.005924,-0.007731
income,0.222408,-0.210736,-0.013558,0.00173,0.300572,1.0,0.047885,0.244121,-0.006509,0.028156,...,0.003162,0.000764,-0.003768,-0.001267,-0.000318,-0.005584,0.003583,0.002338,-0.002103,-0.003298
membership_length,0.011123,-0.003784,-0.009273,-0.007125,0.020466,0.047885,1.0,0.176122,0.000882,0.006887,...,0.005271,0.000403,0.004939,0.000589,-0.010036,0.001721,0.001926,-0.005336,-0.000274,-0.001767
av_money_spent,0.19804,-0.035368,0.018863,-0.232751,0.080883,0.244121,0.176122,1.0,0.313841,0.396038,...,0.013244,0.018239,0.002616,-0.019673,-0.011763,-0.036415,0.021134,0.016454,-0.005191,-0.029876
num_received,-0.004785,0.002142,-0.004134,0.004901,-0.001565,-0.006509,0.000882,0.313841,1.0,0.516739,...,0.073504,0.073351,-0.030938,-0.108061,-0.024229,-0.10595,0.06789,0.069505,-0.026006,-0.107171
viewed/received,0.001956,-0.027004,0.012073,0.033187,0.014495,0.028156,0.006887,0.396038,0.516739,1.0,...,0.021825,0.022699,-0.009611,-0.035243,-0.002591,-0.042932,0.019774,0.027679,-0.005509,-0.032227


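The features do not seem to be strongly correlated with each other (the largest correlations visible above are F vs. M at -0.74 and num_received vs. viewed/received at 0.52), so we keep all of them.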

### Extracting the training features and the labels

In [None]:
# Convert the frame to a NumPy array once and slice it twice:
# every column except the last is a feature, the last column is the label.
# NOTE(review): the preview shows both float and integer columns, so the
# combined array (and therefore y) is presumably float - cast y if integer
# class ids are needed downstream.
feature_label_matrix = training_data_df.values
X = feature_label_matrix[:, :-1]
y = feature_label_matrix[:, -1]

# Drop the notebook's reference to the DataFrame now that the arrays exist.
# Re-running this cell afterwards requires re-running the loading cell first.
training_data_df = None

### Shuffling and splitting the data

In [None]:
# Two-stage shuffled split with a fixed seed so the partition is reproducible:
#   1) hold out 20% of all samples as the test set,
#   2) take 25% of the remaining 80% (i.e. 20% of all samples) as validation,
# which leaves roughly 60% / 20% / 20% for train / validation / test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)