# Decision Tree #

<b> Table of Contents: </b>
<br> [Pipeline 1](#1001)
<br> [Pipeline 2](#2002)
<br> [Pipeline 3](#3003)

In [29]:
# import modules
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

<a id = "1001"> <h2> Pipeline 1 </h2> </a>
___

Loading the Dataset

In [30]:
# Read the pipeline 1 CSV (path is relative to the working directory) into a DataFrame
df_pipeline1 = pd.read_csv(filepath_or_buffer="pipeline_1.csv")

In [31]:
# Preview the first five rows of the pipeline 1 frame
df_pipeline1.head(n=5)

Unnamed: 0,Q4,VisitorType_New_Visitor,Q3,TrafficType_2,TrafficType_8,TrafficType_3,PageValues_iqr_yj_zscore,Q1,TrafficType_13,ExitRates_iqr_yj_zscore,...,SpecialDay_0.8,Month_Feb,Browser_6,SpecialDay_0.4,TrafficType_20,Informational_Duration_pp_iqr_yj_zscore,Browser_12,OperatingSystems_7,TrafficType_16,Revenue
0,0,0,0,0,0,0,-0.529467,1,0,1.71049,...,0,1,0,0,0,-0.491728,0,0,0,0
1,0,0,0,1,0,0,-0.529467,1,0,1.676643,...,0,1,0,0,0,-0.491728,0,0,0,0
2,0,0,0,0,0,1,-0.529467,1,0,1.71049,...,0,1,0,0,0,-0.491728,0,0,0,0
3,0,0,0,0,0,0,-0.529467,1,0,1.71049,...,0,1,0,0,0,-0.491728,0,0,0,0
4,0,0,0,0,0,0,-0.529467,1,0,0.796311,...,0,1,0,0,0,-0.491728,0,0,0,0


In [32]:
# Summary statistics per column, shown with two decimals
stats_pipeline1 = df_pipeline1.describe()
stats_pipeline1.round(2)

Unnamed: 0,Q4,VisitorType_New_Visitor,Q3,TrafficType_2,TrafficType_8,TrafficType_3,PageValues_iqr_yj_zscore,Q1,TrafficType_13,ExitRates_iqr_yj_zscore,...,SpecialDay_0.8,Month_Feb,Browser_6,SpecialDay_0.4,TrafficType_20,Informational_Duration_pp_iqr_yj_zscore,Browser_12,OperatingSystems_7,TrafficType_16,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,...,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,0.43,0.14,0.11,0.32,0.03,0.17,0.0,0.17,0.06,0.0,...,0.03,0.01,0.01,0.02,0.02,-0.0,0.0,0.0,0.0,0.15
std,0.49,0.34,0.31,0.47,0.16,0.37,1.0,0.38,0.24,1.0,...,0.16,0.12,0.12,0.14,0.13,1.0,0.03,0.02,0.02,0.36
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.53,0.0,0.0,-1.87,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,-0.53,0.0,0.0,-0.78,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,-0.53,0.0,0.0,-0.16,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,0.0,0.0,-0.53,0.0,0.0,0.8,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.98,1.0,1.0,1.71,...,1.0,1.0,1.0,1.0,1.0,2.08,1.0,1.0,1.0,1.0


In [33]:
# ---------------------------------------------------------------------------------------------------
# Build the list of feature columns for pipeline 1:
# every column of df_pipeline1 except the target 'Revenue' and the raw object
# columns 'Month' and 'VisitorType' (these are dropped only if present)
# ---------------------------------------------------------------------------------------------------
## all column names of the frame, in file order
list_all_columns = df_pipeline1.columns.tolist()

## columns that must not be used as features
list_remove_features = ['Month', 'VisitorType', 'Revenue']

## keep everything that is not excluded, sorted alphabetically so the column
## order is identical on every run (iterating a set of strings directly gives
## an order that depends on string hashing and may change between sessions)
list_X_columns = sorted(set(list_all_columns).difference(list_remove_features))

# Feature matrix (independent variables) as a NumPy array, in sorted column order
# NOTE: the next cell rebinds X to a DataFrame built from the same column list
X = df_pipeline1[list_X_columns].to_numpy()

## show the selected feature names
list_X_columns

['Administrative_Duration_iqr_yj_zscore',
 'Browser_12',
 'Browser_6',
 'ExitRates_iqr_yj_zscore',
 'Informational_Duration_pp_iqr_yj_zscore',
 'Month_Feb',
 'OperatingSystems_3',
 'OperatingSystems_7',
 'PageValues_iqr_yj_zscore',
 'Q1',
 'Q3',
 'Q4',
 'SpecialDay_0.4',
 'SpecialDay_0.8',
 'TrafficType_1',
 'TrafficType_13',
 'TrafficType_16',
 'TrafficType_2',
 'TrafficType_20',
 'TrafficType_3',
 'TrafficType_8',
 'VisitorType_New_Visitor']

In [34]:
# Feature matrix: the selected feature columns of the pipeline 1 frame
X = df_pipeline1.loc[:, list_X_columns]
# Target: the 'Revenue' column
Y = df_pipeline1.loc[:, 'Revenue']

In [35]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

In [36]:
# Create the Decision Tree classifier with a fixed seed: scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree (and every
# metric computed from it) can differ from one run to the next
clf = DecisionTreeClassifier(random_state=2019)

# Train the Decision Tree classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the response for the test split
y_pred = clf.predict(X_test)

In [37]:
# Accuracy: share of test rows whose predicted label matches the true label
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8564476885644768


In [38]:
# Per-class precision, recall, F1 and support on the test split
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      3131
           1       0.53      0.53      0.53       568

    accuracy                           0.86      3699
   macro avg       0.72      0.72      0.72      3699
weighted avg       0.86      0.86      0.86      3699



In [39]:
# Confusion matrix: rows are true classes, columns are predicted classes
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)

[[2869  262]
 [ 269  299]]


<a id = "2002"> <h2> Pipeline 2 </h2> </a>
___

Loading the Dataset

In [40]:
# Read the pipeline 2 CSV (path is relative to the working directory) into a DataFrame
df_pipeline2 = pd.read_csv(filepath_or_buffer="pipeline_2.csv")

In [41]:
# Preview the first five rows of the pipeline 2 frame
df_pipeline2.head(n=5)

Unnamed: 0,TrafficType_15,Month_Nov,Administrative_Duration_mm_yj_stdev,VisitorType_New_Visitor,Informational_mm_yj_stdev,TrafficType_2,TrafficType_3,ProductRelated_mm_yj_stdev,PageValues_mm_yj_stdev,Month_May,...,TrafficType_1,add_exit_bounce_rates_mm_yj_stdev,Month_Mar,TrafficType_18,TrafficType_8,SpecialDay_0.8,Month_Feb,TrafficType_12,Browser_12,Revenue
0,0,0,-0.0,0,-0.0,0,0,0.001399,-0.0,0,...,1,0.150999,0,0,0,0,1,0,0,0
1,0,0,-0.0,0,-0.0,1,0,0.002761,-0.0,0,...,0,0.117252,0,0,0,0,1,0,0,0
2,0,0,-0.0,0,-0.0,0,1,0.001399,-0.0,0,...,0,0.150999,0,0,0,0,1,0,0,0
3,0,0,-0.0,0,-0.0,0,0,0.002761,-0.0,0,...,0,0.140666,0,0,0,0,1,0,0,0
4,0,0,-0.0,0,-0.0,0,0,0.012429,-0.0,0,...,0,0.099569,0,0,0,0,1,0,0,0


In [42]:
# Summary statistics per column, shown with two decimals
stats_pipeline2 = df_pipeline2.describe()
stats_pipeline2.round(2)

Unnamed: 0,TrafficType_15,Month_Nov,Administrative_Duration_mm_yj_stdev,VisitorType_New_Visitor,Informational_mm_yj_stdev,TrafficType_2,TrafficType_3,ProductRelated_mm_yj_stdev,PageValues_mm_yj_stdev,Month_May,...,TrafficType_1,add_exit_bounce_rates_mm_yj_stdev,Month_Mar,TrafficType_18,TrafficType_8,SpecialDay_0.8,Month_Feb,TrafficType_12,Browser_12,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,...,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,0.0,0.24,0.01,0.14,0.0,0.32,0.17,0.02,0.0,0.27,...,0.2,0.07,0.15,0.0,0.03,0.03,0.01,0.0,0.0,0.15
std,0.06,0.43,0.01,0.34,0.01,0.47,0.37,0.02,0.01,0.45,...,0.4,0.04,0.36,0.03,0.16,0.16,0.12,0.01,0.03,0.36
min,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.01,-0.0,0.0,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.02,-0.0,0.0,...,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.02,0.0,-0.0,1.0,0.0,0.03,-0.0,1.0,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.03,1.0,0.02,1.0,1.0,0.06,0.02,1.0,...,1.0,0.15,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [43]:
# ---------------------------------------------------------------------------------------------------
# Build the list of feature columns for pipeline 2:
# every column of df_pipeline2 except the target 'Revenue' and the raw object
# columns 'Month' and 'VisitorType' (these are dropped only if present)
# ---------------------------------------------------------------------------------------------------
## all column names of the frame, in file order
list_all_columns = df_pipeline2.columns.tolist()

## columns that must not be used as features
list_remove_features = ['Month', 'VisitorType', 'Revenue']

## keep everything that is not excluded, sorted alphabetically so the column
## order is identical on every run (iterating a set of strings directly gives
## an order that depends on string hashing and may change between sessions)
list_X_columns = sorted(set(list_all_columns).difference(list_remove_features))

# Feature matrix (independent variables) as a NumPy array, in sorted column order
# NOTE: the next cell rebinds X to a DataFrame built from the same column list
X = df_pipeline2[list_X_columns].to_numpy()

## show the selected feature names
list_X_columns

['Administrative_Duration_mm_yj_stdev',
 'Browser_12',
 'Informational_mm_yj_stdev',
 'Month_Feb',
 'Month_Mar',
 'Month_May',
 'Month_Nov',
 'OperatingSystems_3',
 'PageValues_mm_yj_stdev',
 'ProductRelated_mm_yj_stdev',
 'SpecialDay_0.8',
 'TrafficType_1',
 'TrafficType_12',
 'TrafficType_13',
 'TrafficType_15',
 'TrafficType_18',
 'TrafficType_2',
 'TrafficType_3',
 'TrafficType_8',
 'VisitorType_New_Visitor',
 'add_exit_bounce_rates_mm_yj_stdev']

In [44]:
# Feature matrix: the selected feature columns of the pipeline 2 frame
X = df_pipeline2.loc[:, list_X_columns]
# Target: the 'Revenue' column
Y = df_pipeline2.loc[:, 'Revenue']

In [45]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

In [46]:
# Create the Decision Tree classifier with a fixed seed: scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree (and every
# metric computed from it) can differ from one run to the next
clf = DecisionTreeClassifier(random_state=2019)

# Train the Decision Tree classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the response for the test split
y_pred = clf.predict(X_test)

In [47]:
# Accuracy: share of test rows whose predicted label matches the true label
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8594214652608814


In [48]:
# Per-class precision, recall, F1 and support on the test split
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      3131
           1       0.54      0.57      0.55       568

    accuracy                           0.86      3699
   macro avg       0.73      0.74      0.73      3699
weighted avg       0.86      0.86      0.86      3699



In [49]:
# Confusion matrix: rows are true classes, columns are predicted classes
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)

[[2858  273]
 [ 247  321]]


<a id = "3003"> <h2> Pipeline 3 </h2> </a>
___

Loading the Dataset

In [50]:
# Read the pipeline 3 CSV (path is relative to the working directory) into a DataFrame
df_pipeline3 = pd.read_csv(filepath_or_buffer="pipeline_3.csv")

In [51]:
# Preview the first five rows of the pipeline 3 frame
df_pipeline3.head(n=5)

Unnamed: 0,PageValues_yj_stdev_zscore,Month_Nov,VisitorType_New_Visitor,TrafficType_15,TrafficType_2,TrafficType_3,Browser_12,Month_May,TrafficType_16,TrafficType_13,...,TrafficType_1,Month_Mar,SpecialDay_0.8,TrafficType_8,Month_Feb,Administrative_Duration_yj_stdev_zscore,Browser_6,OperatingSystems_7,ProductRelated_Duration_yj_stdev_zscore,Revenue
0,-0.529409,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,-0.996659,0,0,-2.096783,0
1,-0.529409,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,-0.996659,0,0,-1.074189,0
2,-0.529409,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,-0.996659,0,0,-2.096783,0
3,-0.529409,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,-0.996659,0,0,-1.875436,0
4,-0.529409,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,-0.996659,0,0,0.057515,0


In [21]:
# Summary statistics per column, shown with two decimals
stats_pipeline3 = df_pipeline3.describe()
stats_pipeline3.round(2)

Unnamed: 0,PageValues_yj_stdev_zscore,Month_Nov,VisitorType_New_Visitor,TrafficType_15,TrafficType_2,TrafficType_3,Browser_12,Month_May,TrafficType_16,TrafficType_13,...,TrafficType_1,Month_Mar,SpecialDay_0.8,TrafficType_8,Month_Feb,Administrative_Duration_yj_stdev_zscore,Browser_6,OperatingSystems_7,ProductRelated_Duration_yj_stdev_zscore,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,...,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,-0.0,0.24,0.14,0.0,0.32,0.17,0.0,0.27,0.0,0.06,...,0.2,0.15,0.03,0.03,0.01,0.0,0.01,0.0,-0.0,0.15
std,1.0,0.43,0.34,0.06,0.47,0.37,0.03,0.45,0.02,0.24,...,0.4,0.36,0.16,0.16,0.12,1.0,0.12,0.02,1.0,0.36
min,-0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-2.1,0.0
25%,-0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,-0.62,0.0
50%,-0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.03,0.0
75%,-0.53,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.65,0.0
max,1.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,5.31,1.0


In [22]:
# ---------------------------------------------------------------------------------------------------
# Build the list of feature columns for pipeline 3:
# every column of df_pipeline3 except the target 'Revenue' and the raw object
# columns 'Month' and 'VisitorType' (these are dropped only if present)
# ---------------------------------------------------------------------------------------------------
## all column names of the frame, in file order
list_all_columns = df_pipeline3.columns.tolist()

## columns that must not be used as features
list_remove_features = ['Month', 'VisitorType', 'Revenue']

## keep everything that is not excluded, sorted alphabetically so the column
## order is identical on every run (iterating a set of strings directly gives
## an order that depends on string hashing and may change between sessions)
list_X_columns = sorted(set(list_all_columns).difference(list_remove_features))

# Feature matrix (independent variables) as a NumPy array, in sorted column order
# NOTE: the next cell rebinds X to a DataFrame built from the same column list
X = df_pipeline3[list_X_columns].to_numpy()

## show the selected feature names
list_X_columns

['Administrative_Duration_yj_stdev_zscore',
 'Browser_12',
 'Browser_13',
 'Browser_6',
 'Month_Feb',
 'Month_Mar',
 'Month_May',
 'Month_Nov',
 'OperatingSystems_3',
 'OperatingSystems_6',
 'OperatingSystems_7',
 'PageValues_yj_stdev_zscore',
 'ProductRelated_Duration_yj_stdev_zscore',
 'SpecialDay_0.8',
 'TrafficType_1',
 'TrafficType_13',
 'TrafficType_15',
 'TrafficType_16',
 'TrafficType_2',
 'TrafficType_3',
 'TrafficType_8',
 'VisitorType_New_Visitor']

In [23]:
# Feature matrix: the selected feature columns of the pipeline 3 frame
X = df_pipeline3.loc[:, list_X_columns]
# Target: the 'Revenue' column
Y = df_pipeline3.loc[:, 'Revenue']

In [24]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

In [25]:
# Create the Decision Tree classifier with a fixed seed: scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree (and every
# metric computed from it) can differ from one run to the next
clf = DecisionTreeClassifier(random_state=2019)

# Train the Decision Tree classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the response for the test split
y_pred = clf.predict(X_test)

In [26]:
# Accuracy: share of test rows whose predicted label matches the true label
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8550959718842931


In [27]:
# Per-class precision, recall, F1 and support on the test split
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91      3131
           1       0.53      0.53      0.53       568

    accuracy                           0.86      3699
   macro avg       0.72      0.72      0.72      3699
weighted avg       0.85      0.86      0.85      3699



In [28]:
# Confusion matrix: rows are true classes, columns are predicted classes
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confusion)

[[2864  267]
 [ 269  299]]
