# TPOT - Automated Machine Learning in Python

<b> Import modules

In [97]:
import pandas as pd
import numpy as np

# import module (to test execution time of a codeblock to run)
import time

In [98]:
##################### MODULES TPOT ###################################
#### source: http://epistasislab.github.io/tpot/installing/
### already installed in our environment by Prof. Tao in JupyterLab
# !pip install deap update_checker tqdm stopit --user
### for binary classification problem use TPOTClassifier
#### already installed in our environment by Prof. Tao in JupyterLab
#!pip install tpot --user

<b> Import Dataset

In [99]:
# import file with data cleaned (one-hot encoded categorical features and feature-engineerd features)
df = pd.read_csv("df_calculation.csv")

<b> Optional shuffle the data

In [100]:
# It's generally a good idea to randomly shuffle the data before starting to avoid any type of ordering in the data.
# shuffle the dataset
df_shuffle = df.iloc[np.random.permutation(len(df))]
# reset the index numbers of the dataframe
df_shuffled = df_shuffle.reset_index(drop=True)
# show first 5 rows of the dataframe
df_shuffled.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,Weekend,...,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,avg_exit_bounce_rates,add_exit_bounce_rates,div_exit_bounce_rates,perc_exit_bounce_rates,Administrative_Duration_pp,Informational_Duration_pp,ProductRelated_Duration_pp
0,1,7.0,0,0.0,20,887.583333,0.0,0.01614,34.253368,0,...,0,0,1,0.00807,0.01614,0.0,-1.0,7.0,0.0,44.379167
1,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0,...,0,1,0,0.2,0.4,1.0,0.0,0.0,0.0,0.0
2,0,0.0,0,0.0,12,312.033333,0.0,0.009091,17.808,0,...,0,0,1,0.004545,0.009091,0.0,-1.0,0.0,0.0,26.002778
3,0,0.0,0,0.0,3,0.0,0.2,0.2,0.0,0,...,0,0,1,0.2,0.4,1.0,0.0,0.0,0.0,0.0
4,12,432.583333,3,112.0,42,2132.683333,0.021289,0.041499,0.0,0,...,0,0,1,0.031394,0.062787,0.51299,-0.48701,36.048611,37.333333,50.778175


In [101]:
# check missing values
(pd.isnull(df_shuffled).any()).value_counts()
# note: no missing values

False    87
dtype: int64

<b> Train Test Split Indices

In [102]:
# cross_validation train_test_split
from sklearn.model_selection import train_test_split

# define your target variable: Y
# The first and most important step in using TPOT on any data set is to rename the target class/response variable to class.
# source: https://github.com/EpistasisLab/tpot/blob/master/tutorials/Titanic_Kaggle.ipynb
df_shuffled.rename(columns={'Revenue': 'class'}, inplace=True)
df_class = df_shuffled['class'].values

# split the data into training and testing (75% training, 25% testing, random state = fixed)
training_indices, testing_indices = train_test_split(df_shuffled.index, stratify = df_class, train_size=0.75, test_size=0.25, random_state = 2019)

<b> Check Size of Train Test Split

In [103]:
# check the size of the training set and validation set
training_indices.size, testing_indices.size

(9247, 3083)

<b> Run TPOT - Automated Machine Learning Model

In [105]:
from tpot import TPOTClassifier

# set the start time
start = time.time()

# define the model

tpot = TPOTClassifier(generations=300, # Number of iterations to run the pipeline optimization process. It must be a positive number or None. If None, the parameter max_time_mins must be defined as the runtime limit. Generally, TPOT will work better when you give it more generations (and therefore time) to optimize the pipeline.
                      population_size=100,
                      offspring_size=100,
                      verbosity=2, # verbosity - How much information TPOT communicates while it is running. 
                      n_jobs=-1, # Number of CPUs for evaluating pipelines in parallel during the TPOT optimization process. Assigning this to -1 will use as many cores as available on the computer. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
                      scoring='f1_weighted', # Function used to evaluate the quality of a given pipeline for the problem. By default, accuracy is used for classification and mean squared error (MSE) is used for regression. 
                      max_time_mins = 60, 
                      max_eval_time_mins= 10,
                      cv=10)

# train the model
tpot.fit(df_shuffled.drop('class',axis=1).loc[training_indices].values, # X_train
         df_shuffled.loc[training_indices, 'class'].values) # y_train

# get the time it takes for the gridsearch to run
# source: https://blog.softhints.com/python-test-performance-and-measure-time-elapsed-in-seconds/
# set the end time
end = time.time()

# get the elapsed execution time
execution_time = end - start
print("--- Execution time: ---")
print ('--- %0.3fms. --- ' % ( execution_time*1000.))
print("--- %s seconds ---" % (execution_time))
print("--- %s minutes ---" % (execution_time/60))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Generation 1 - Current best internal CV score: 0.8996889364724592

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.01, max_depth=8, min_child_weight=18, n_estimators=100, nthread=1, subsample=0.9500000000000001)
--- Execution time: ---
--- 1666844.584ms. --- 
--- 1666.8445839881897 seconds ---
--- 27.780743066469828 minutes ---


In [85]:
# note: for different scoring functions, see:
# source: https://epistasislab.github.io/tpot/using/#scoring-functions

In [106]:
# weighted f1 score which was the scoring method for tpot
tpot.score(df_shuffled.drop('class',axis=1).loc[testing_indices].values, #X_test
           df_shuffled.loc[testing_indices, 'class'].values) # y_test

0.8987252354361767

In [107]:
# source: https://iq.opengenus.org/tpot-python/
from sklearn.metrics import roc_auc_score
# calculate AUC score
tpot_auc_score = roc_auc_score(df_shuffled.loc[testing_indices, 'class'].values, tpot.predict_proba(df_shuffled.drop('class',axis=1).loc[testing_indices].values)[:, 1])
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.9285


### Testrun 1
___
HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.7500000000000001, min_samples_leaf=15, min_samples_split=18, n_estimators=100)
<br> --- Execution time: ---
<br> --- 707084.034ms. --- 
<br> --- 707.0840344429016 seconds ---
<br> --- 11.784733907381694 minutes ---
<br> tpot.score = 0.9072333857032616
___
### Testrun 2
est pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.9500000000000001, min_samples_leaf=6, min_samples_split=15, n_estimators=100)
--- Execution time: ---
--- 424634.354ms. --- 
--- 424.63435411453247 seconds ---
--- 7.0772392352422075 minutes ---
<br> tpot.score = 0.9000766387617157
___
### Testrun 3
HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.2, min_samples_leaf=8, min_samples_split=14, n_estimators=100)
<br> --- Execution time: ---
<br> --- 798850.751ms. --- 
<br> --- 798.8507509231567 seconds ---
<br> --- 13.314179182052612 minutes ---
<br> tpot.score = 0.9016232777560529
___
### Testrun 4
HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=50.0, style=ProgressStyle(des…

Generation 1 - Current best internal CV score: 0.9028496916056253
Generation 2 - Current best internal CV score: 0.9028496916056253

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=gini, max_features=0.6500000000000001, min_samples_leaf=9, min_samples_split=13, n_estimators=100)
--- Execution time: ---
<br> --- 1343828.360ms. --- 
<br> --- 1343.8283603191376 seconds ---
<br> --- 22.39713933865229 minutes ---
<br> tpot.score = 0.8989145659134411
<br> AUC score: 0.9283
___
### Testrun 5
HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_w…

Best pipeline: XGBClassifier(MinMaxScaler(input_matrix), learning_rate=0.01, max_depth=10, min_child_weight=16, n_estimators=100, nthread=1, subsample=0.6500000000000001)
<br>--- Execution time: ---
<br> --- 1568095.310ms. --- 
<br> --- 1568.0953097343445 seconds ---
<br> --- 26.134921828905743 minutes --
<br> tpot.score = 0.9062844719301901
<br> AUC score: 0.9337

In [None]:
# Export the optimized pipeline as Python code.
tpot.export('tpot_project2_pipeline.py')