In [1]:
## Provide a wider display for easier viewing
## `display` and `HTML` come from the public `IPython.display` module;
## importing them from `IPython.core.display` is deprecated
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
## Import the necessary libraries

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import *
import seaborn as sns
from sklearn.model_selection import *
from sklearn.metrics import *
import tensorflow as tf
import keras

Using TensorFlow backend.


In [3]:
## Print the installed Keras, TensorFlow and pandas versions (in that order),
## separated by blank lines - this notebook expects 2.3, 2.0 and 0.24.2
print(keras.__version__, tf.__version__, pd.__version__, sep = "\n\n")

2.3.0

2.0.0

0.24.2


In [4]:
## Silence DeprecationWarnings (sklearn raises many of them) - I will upgrade after this project!

import warnings
warnings.simplefilter("ignore", DeprecationWarning)

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
## List the different files we have available in the Group Project folder
## The main files that we need for this analysis are:

import os

#print(os.listdir("D:\\Group Project - Google\\datasets"))

In [7]:
## File locations used by this analysis

## Dataset folder on the MSBA 6420 Virtual Machine
## (on the MSBA 6410 Virtual Machine the folder is "D:\\Group Project - Google\\datasets\\")
working_dir = "D:\\Group 1\\datasets\\"

## Training data - with flattened JSON fields - and its full path
training_file = "train-flattened.csv"
training_path = f"{working_dir}{training_file}"

## The first version of the testing data ("test-flattened.csv") is no longer loaded

## Submission file - with subset of VisitorIds - and its full path
submission_file = "sample_submission_v2.csv"
submission_path = f"{working_dir}{submission_file}"

## New (second version) testing data file and its full path
testing_file_v2 = "test_2-flattened.csv"
testing_v2_path = f"{working_dir}{testing_file_v2}"

In [8]:
## Show the training and testing paths, separated by a blank line, to verify them
print(training_path, testing_v2_path, sep = "\n\n")

D:\Group 1\datasets\train-flattened.csv

D:\Group 1\datasets\test_2-flattened.csv


In [9]:
## Read the flattened training CSV into a DataFrame
## (low_memory = False asks pandas not to parse the file in chunks)
training_data = pd.read_csv(
    filepath_or_buffer = training_path,
    low_memory = False,
)

In [10]:
## Preview the first five rows to confirm the original training data is loaded

training_data.head(n = 5)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,...,,,,(not set),,,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,...,,,,(not set),,True,(not provided),organic,,google


In [28]:
## See the shape of the training data as (rows, columns) - to know how many dimension/features we have available
training_data.shape

(903653, 55)

In [29]:
## Transaction revenue is our target: fill its missing values with zeros and
## convert the column to floating point
target = training_data["totals.transactionRevenue"].fillna(0).astype(float)

## One of the most vital steps - transform the target variable to log(1 + revenue).
## np.log1p is applied to the whole Series at once instead of row by row through
## `apply`, which yields the same values without a Python-level call per row
target = np.log1p(target)

In [30]:
## Columns identified in our feature analysis, in the order they are fed to the models
final_feature_cols = [
    "visitNumber",
    "date",
    "totals.hits",
    "totals.pageviews",
    "geoNetwork.city",
    "geoNetwork.country",
    "trafficSource.source",
]

In [31]:
## Subset the training data to the selected feature columns.
## .copy() makes new_train_data an independent DataFrame: later cells assign to its
## columns, and doing that on a plain slice of training_data triggers pandas'
## SettingWithCopyWarning and is not guaranteed to behave as intended
new_train_data = training_data[final_feature_cols].copy()

## Confirm the row count matches our original set of training data
print(new_train_data.shape)

(903653, 7)


In [32]:
## Walk over every column of the new training data set.
## Columns whose dtype is not object are left untouched; object columns are treated as
## categorical features and replaced by the integer codes returned by pd.factorize.
## The codes overwrite the original column, so after this cell no object column is left.

for col in new_train_data.columns:
    if new_train_data[col].dtype != object:
        continue
    new_train_data[col], indexer = pd.factorize(new_train_data[col])

In [33]:
## One additional step - replace the null/empty "pageviews" values with zeros ...
pageviews_filled = new_train_data["totals.pageviews"].fillna(0)

## ... then cast the column to integer so no floating point values are left in the training data
new_train_data["totals.pageviews"] = pageviews_filled.astype(int)

In [34]:
## Confirm that every column now has an integer dtype - this is needed for the LGB model

new_train_data.dtypes

visitNumber             int64
date                    int64
totals.hits             int64
totals.pageviews        int32
geoNetwork.city         int64
geoNetwork.country      int64
trafficSource.source    int64
dtype: object

In [35]:
## Convert the feature table and the target into NumPy arrays:
## "X" holds all the features we would like to use for our model,
## "y_regression" holds the new target variable for our regression problem
X, y_regression = np.array(new_train_data), np.array(target)

In [36]:
## Hold out 15% of the rows for validation and train on the remaining 85%
## (random_state = 42 keeps the split reproducible)
split_arrays = train_test_split(X, y_regression, test_size = 0.15, random_state = 42)
X_train_reg, X_val_reg, y_train_reg, y_val_reg = split_arrays

In [37]:
## Report how much data we have to train and to validate against
print(f"Train shape: {X_train_reg.shape}", f"Validation shape: {X_val_reg.shape}", sep = "\n\n")

Train shape: (768105, 7)

Validation shape: (135548, 7)


In [38]:
## Import mean_squared_error from sklearn
from sklearn.metrics import mean_squared_error

## Our own helper for calculating the RMSE
def rmse(y_true, y_pred):
    """Return the root mean squared error of y_pred against y_true, rounded to 5 decimals."""
    mse = mean_squared_error(y_true, y_pred)
    return round(np.sqrt(mse), 5)

# <span style="color:#ffcc33">Appendix: Model Performance Evaluation</span>

In [41]:
## Scale the features and the target with RobustScaler objects.
## Each scaler is fitted on the TRAINING split only and then re-used to transform the
## validation split. Fitting a second scaler on the validation data would put the two
## splits on different scales and let validation statistics leak into the evaluation.
feature_scaler = preprocessing.RobustScaler()

X_train_reg_robust = feature_scaler.fit_transform(X_train_reg)

X_val_reg_robust = feature_scaler.transform(X_val_reg)

## The targets are flat arrays and the RobustScaler object is expecting a 2-D array,
## so they are reshaped into a single column before scaling
target_scaler = preprocessing.RobustScaler()

## We then flatten the transformed target back to its original 1-D layout
y_train_reg_robust = target_scaler.fit_transform(y_train_reg.reshape(-1, 1)).flatten()

## Same for the validation target, using the scaler fitted on the training target
y_val_reg_robust = target_scaler.transform(y_val_reg.reshape(-1, 1)).flatten()

In [42]:
## Shapes of the scaled training features and the scaled training target
print(X_train_reg_robust.shape, y_train_reg_robust.shape, sep = "\n\n")

(768105, 7)

(768105,)


## <span style="color:#7a0019">Gradient Boost Regression - Training and MSE/RMSE Scoring Output</span>

In [24]:
## Hyper-parameter grid for the Gradient Boost grid search
### NOTE(review): 'lad' is the least-absolute-deviation loss, not least squares, while the
### grid search in the next cell is scored with mean squared error - confirm the intended loss
gbr_p_grid = dict(loss = ['lad'],
                  n_estimators = [100, 200])

## Number of nested cross-validation trials to run for the models
num_trials = 20

## One slot per trial for the mean nested score of the regressor
nested_scores_gbr = np.zeros(shape = num_trials)

## New Gradient Boost instance - handed to the grid search in the nested cross validation step
gbr = ensemble.GradientBoostingRegressor()

In [None]:
%%time
## Loop for each trial - driven by num_trials so the loop always matches the
## length of nested_scores_gbr
for i in range(num_trials):

    ## Choose cross-validation techniques for the inner and outer loops,
    ## independently of the dataset. The trial number is the seed, so every
    ## trial shuffles differently but reproducibly.
    inner_cv = KFold(n_splits = 4, shuffle = True, random_state = i)
    outer_cv = KFold(n_splits = 4, shuffle = True, random_state = i)

    ## Inner loop: grid search over gbr_p_grid for the Gradient Boost regressor
    gbreg = GridSearchCV(estimator=gbr, n_jobs = 8, param_grid=gbr_p_grid, cv=inner_cv, scoring = "neg_mean_squared_error", verbose = 2)
    ## NOTE(review): this fit is not needed by cross_val_score below (which re-fits the
    ## search on each outer fold); it only leaves gbreg fitted on the full training data
    gbreg.fit(X_train_reg_robust, y_train_reg_robust)

    ## Outer loop: score the whole grid search on the outer folds and keep the mean
    nested_score = cross_val_score(gbreg, X = X_train_reg_robust, y = y_train_reg_robust, cv = outer_cv, scoring = "neg_mean_squared_error")
    nested_scores_gbr[i] = nested_score.mean()

Fitting 4 folds for each of 2 candidates, totalling 8 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:  1.8min remaining:  3.1min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  3.6min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  3.6min finished


Fitting 4 folds for each of 2 candidates, totalling 8 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:  1.3min remaining:  2.1min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  2.4min finished


Fitting 4 folds for each of 2 candidates, totalling 8 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   3 out of   8 | elapsed:  1.4min remaining:  2.3min
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  2.5min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:  2.5min finished


## Picture here

## <span style="color:#7a0019">DecisionTree Regression - Training and MSE/RMSE Scoring Output</span>

In [None]:
## New instance of the Decision Tree Regressor - handed to the grid search below
dtr = tree.DecisionTreeRegressor()

## Hyper-parameter grid for the DecisionTree Regressor
dtr_p_grid = dict(
    criterion = ["mse"],
    splitter = ["best", "random"],
    max_features = [4, 5, 6],
    max_depth = [5, 10, 15],
)

## Number of nested cross-validation trials to run for the models
num_trials = 20

## One slot per trial for the mean nested score of the regressor
nested_scores_dtr = np.zeros(shape = num_trials)

In [None]:
%%time

## Loop for each trial - driven by num_trials so the loop always matches the
## length of nested_scores_dtr
for i in range(num_trials):

    ## Choose cross-validation techniques for the inner and outer loops,
    ## independently of the dataset. The trial number is the seed, so every
    ## trial shuffles differently but reproducibly.
    inner_cv = KFold(n_splits = 4, shuffle = True, random_state = i)
    outer_cv = KFold(n_splits = 4, shuffle = True, random_state = i)

    ## Inner loop: grid search over dtr_p_grid for the DecisionTree Regressor
    dtreg = GridSearchCV(estimator=dtr, param_grid=dtr_p_grid, cv=inner_cv, scoring = "neg_mean_squared_error", verbose = 2)
    ## NOTE(review): this fit is not needed by cross_val_score below (which re-fits the
    ## search on each outer fold); it only leaves dtreg fitted on the full training data
    dtreg.fit(X_train_reg_robust, y_train_reg_robust)

    ## Outer loop: score the whole grid search on the outer folds and keep the mean
    nested_score = cross_val_score(dtreg, X = X_train_reg_robust, y = y_train_reg_robust,
                                   cv = outer_cv, scoring = "neg_mean_squared_error")

    nested_scores_dtr[i] = nested_score.mean()

## <span style="color:#7a0019">Output of the tests as an image - we had to run concurrent tests on each VM.</span>

![image.png](attachment:image.png)

## <span style="color:#7a0019">Lasso Regression - Time to Train and MSE/RMSE Scoring</span>

In [None]:
## Lasso Regression - alpha (regularisation strength) values for the grid search
parameters = {"alpha": [1e-3, 1e-2, 1, 5]}

## Presumably further alpha candidates: 0.005, 0.02, 0.03, 0.05, 0.1
## (kept as a comment - as a bare expression this line only built a tuple and discarded it)

## Empty array to contain one mean nested score per trial
nested_scores_lasso = np.zeros(num_trials)

## Lasso estimator handed to the grid search
lasso = linear_model.Lasso()

In [None]:
## Loop for each trial - driven by num_trials so the loop always matches the
## length of nested_scores_lasso
for i in range(num_trials):

    ## Choose cross-validation techniques for the inner and outer loops,
    ## independently of the dataset. The trial number is the seed, so every
    ## trial shuffles differently but reproducibly.
    inner_cv = KFold(n_splits = 4, shuffle = True, random_state = i)
    outer_cv = KFold(n_splits = 4, shuffle = True, random_state = i)

    ## Inner loop: grid search over the Lasso alpha grid (`parameters`).
    ## The names used here (parameters, lasso_reg, y_train_reg_robust, nested_scores_lasso)
    ## are the ones this notebook actually defines; the previous lr_p_grid, lreg,
    ## y_train_reg_robust_int and nested_scores_lr do not exist and raised NameError
    lasso_reg = GridSearchCV(estimator=lasso, param_grid=parameters, cv=inner_cv, scoring = "neg_mean_squared_error", verbose = 2) ## Adding verbosity to see output
    lasso_reg.fit(X_train_reg_robust, y_train_reg_robust)

    ## Outer loop: score the whole grid search on the outer folds and keep the mean
    nested_score = cross_val_score(lasso_reg, X = X_train_reg_robust, y = y_train_reg_robust, cv = outer_cv,
                                  scoring = "neg_mean_squared_error")

    nested_scores_lasso[i] = nested_score.mean()

## <span style="color:#7a0019">Output of the tests as an image - we had to run concurrent tests on each VM.</span>

![image.png](attachment:image.png)

## <span style="color:#7a0019">Two Layer Neural Network - Time to Train and MSE/RMSE Scoring</span>

In [69]:
## Import our libraries needed to build the Neural Network model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, BatchNormalization
from keras.utils import np_utils
from tensorflow.keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor

In [75]:
## Cycle through these different hidden units for our two layer NN model
nb_hiddens = np.array([128])

## Create a function to build the two layer model
def test_model_two(activation='relu', nb_hidden = 120, input_dim = 7):
    """Build and compile a fully connected network with two hidden layers.

    Parameters
    ----------
    activation : activation used by BOTH hidden layers (default 'relu').
        Previously this argument was accepted but ignored.
    nb_hidden : number of units in each of the two hidden layers.
    input_dim : number of input features per row (default 7, the number of
        feature columns used in this notebook).

    Returns
    -------
    The compiled Sequential model (mse loss, adam optimizer, mse metric).
    """

    two_layer_model = Sequential()

    ## First hidden layer: nb_hidden units, fed rows with input_dim features
    two_layer_model.add(Dense(nb_hidden, input_dim = input_dim, activation = activation))

    ## Second hidden layer: same number of units and same activation as the first
    two_layer_model.add(Dense(nb_hidden, activation = activation))

    ## The output layer has one node and uses the linear activation function (regression output)
    two_layer_model.add(Dense(1, activation = "linear"))

    ## Mean squared error is both the loss and the reported metric; "adam" is the optimizer
    two_layer_model.compile(loss = "mse", optimizer = "adam", metrics = ["mse"])

    ## Return the completed model
    return two_layer_model

In [79]:
## Collect one training history per hidden-unit setting so we can plot it later
two_layer_model_results = []

## Train a fresh two layer model for every entry of nb_hiddens
for i in nb_hiddens:

    ## Build a new, untrained model with i units in each hidden layer
    two_layer_model = test_model_two(nb_hidden = i)

    ## Fit for 20 epochs, holding out 15% of the training data to validate the
    ## model, and keep the history object returned by fit
    history = two_layer_model.fit(
        X_train_reg_robust,
        y_train_reg_robust,
        epochs = 20,
        verbose = 0,
        validation_split = 0.15,
    )

    ## Store the history in the list above
    two_layer_model_results += [history]

In [80]:
## Keep the per-epoch validation MSE of the first (index 0) run so we can bring it back later
first_run_history = two_layer_model_results[0].history
first_run_mse_b = first_run_history["val_mse"]

## <span style="color:#7a0019">Output of the tests as an image - we had to run concurrent tests on each VM.</span>

![image.png](attachment:image.png)

## <span style="color:#ffcc33">LightGBM - Time to Train and MSE/RMSE Scoring</span>

In [None]:
## Import the library that will make our regression problem a cinch!
import lightgbm as lgb

## Import mean_squared_error from sklearn
from sklearn.metrics import mean_squared_error

## Our own helper for calculating the RMSE (same definition as earlier in this notebook)
def rmse(y_true, y_pred):
    """Return the root mean squared error of y_pred against y_true, rounded to 5 decimals."""
    mse = mean_squared_error(y_true, y_pred)
    return round(np.sqrt(mse), 5)

In [None]:
## Create a function to build our LGB model
## We'll go into detail on each hyper as we build it

def lgb_model(X_train, y_train, X_val, y_val, X_test):
    """Train a LightGBM regressor and report its train / validation RMSE.

    Parameters
    ----------
    X_train, y_train : features and target the model is trained on.
    X_val, y_val : features and target used for early stopping and for the
        validation predictions that are returned.
    X_test : not used by this function; kept so existing calls keep working.

    Returns
    -------
    (y_pred_val, model) : predictions for X_val at the best iteration, and the
        trained LightGBM model.
    """

    ## Build a dictionary with our parameters
    ## We mimic-ed another sample notebook for some of these parameter settings
    params = {

        ## Learning objectives and rates
        "objective": "regression", ## Performing a regression problem
        "metric": "rmse", ## LightGBM's built-in RMSE metric, evaluated on the valid_sets below
        "num_leaves": 40, ## max number of leaves in one tree
        "learning_rate": 0.005, ## also known as shrinkage rate, we go with a low value to give the algorithm better chance to learn

        ## Bagging to help with overfitting and to speed up our training
        "bagging_fraction": 0.6, ## Randomly select part of data without replacement to help with overfitting
        ## Perform bagging every 6th iteration. The parameter is named "bagging_freq";
        ## the previous key "bagging_frequency" is not a LightGBM parameter, so bagging
        ## stayed disabled and bagging_fraction had no effect
        "bagging_freq": 6,
        "bagging_seed": 42, ## Random seed for bagging, 42 because it is the answer to everything

        ## Randomly select part of features on each iteration (tree)
        "feature_fraction": 0.6, ## Select 60% of features before training each tree

        ## Only report fatal errors
        "verbosity" : -1,
        ## Seed used to generate all other seeds, one seed to rule them all
        "seed": 42
    }

    ## First thing that is different about LGB, it takes a different object than our earlier algorithms
    ## Here the numpy arrays are wrapped in lgb.Dataset objects
    lgb_train_data = lgb.Dataset(X_train, label = y_train)
    lgb_val_data = lgb.Dataset(X_val, label = y_val)

    ## Build the actual LGB Model, now that we have set up our parameters, training, and validation data set up
    ## NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of the LightGBM
    ## version this notebook was written for - confirm they are still accepted before upgrading
    model = lgb.train(params, ## Parameters for training
                      lgb_train_data, ## Data to be trained on
                     num_boost_round = 5000, ## Number of boosting iterations
                     valid_sets = [lgb_train_data, lgb_val_data], ## List of data to be evaluated on during training
                     early_stopping_rounds = 200, ## Activates early stopping, where the model will training only until validation score stops improving
                     verbose_eval = 500) ## Evaluation metric will be printed on every 500th boosting stage, and the last boosting stage will print

    ## Use the model to predict Revenue on our training data
    y_pred_train = model.predict(X_train, num_iteration = model.best_iteration)

    ## Use the same model to predict Revenue on our validation data
    y_pred_val = model.predict(X_val, num_iteration = model.best_iteration)

    ## Print  out the results of our RMSE function using the training and validation
    print(f"LGBM: RMSE val: {rmse(y_val, y_pred_val)} - RMSE train: {rmse(y_train, y_pred_train)}")

    ## Return our predicted values for the validation data, and the final model
    return y_pred_val, model

In [None]:
%%time

## Train LGBM and generate predictions - get the output
## lgb_model never reads its X_test argument and this notebook does not build any
## test-set features, so None is passed instead of the undefined name X_test_reg
lgb_preds, lgb_model_output = lgb_model(X_train_reg, y_train_reg, 
                               X_val_reg, y_val_reg, None)

In [None]:
## Train LGBM and generate predictions - get the output (same call as the timed cell, without %%time)
## lgb_model never reads its X_test argument and this notebook does not build any
## test-set features, so None is passed instead of the undefined name X_test_reg
lgb_preds, lgb_model_output = lgb_model(X_train_reg, y_train_reg, 
                               X_val_reg, y_val_reg, None)

## <span style="color:#ffcc33">Output of the tests as an image - we had to run concurrent tests on each VM.</span>

![image.png](attachment:image.png)

In [None]:
## Summary Table

In [None]:
## Time to train, best MSE and best RMSE for each algorithm:
## DT = 105 mins, 2.932, 1.712
##
## Lasso = 28 mins, 3.354, 1.831
##
## NN = 13 mins, 2.731, 1.652
##
## LGBoost = 1 min, 2.650, 1.628

In [65]:
## One row per algorithm: name, time to train (mins), best MSE, best RMSE
lasso_summary = ["Lasso", 28.7, 3.354, 1.831]
dt_summary = ["Decision Tree", 105.1, 2.932, 1.712]
nn_summary = ["Neural Network", 13.3, 2.731, 1.652]
lgb_summary = ["LightGB", 1.3, 2.651, 1.628]

data = [lasso_summary, dt_summary, nn_summary, lgb_summary]

## NOTE(review): "Valdiation" in the MSE column name is a typo; it is left unchanged
## here in case other code refers to the column by this exact name
summary_table = pd.DataFrame(data, columns = ["Algorithm", "Time to Train (mins)", "Best Nested/Valdiation MSE", "Best Nested/Validation RMSE"])

## Express every RMSE relative to the LightGB RMSE. The baseline is read from the
## LightGB row instead of repeating the number, so the column stays correct if
## the figures above are updated
baseline_rmse = lgb_summary[3]
rmse_column = summary_table["Best Nested/Validation RMSE"]
summary_table["% increase/decrease in RMSE"] = ((rmse_column - baseline_rmse) / abs(baseline_rmse) * 100).round(3)

# print dataframe. 
summary_table

Unnamed: 0,Algorithm,Time to Train (mins),Best Nested/Valdiation MSE,Best Nested/Validation RMSE,% increase/decrease in RMSE
0,Lasso,28.7,3.354,1.831,12.469
1,Decision Tree,105.1,2.932,1.712,5.16
2,Neural Network,13.3,2.731,1.652,1.474
3,LightGB,1.3,2.651,1.628,0.0
