# Decision Tree #

<b> Table of Contents: </b>
<br> [Pipeline 1](#500)
<br> [Pipeline 2](#501)
<br> [Pipeline 3](#502)

<a id = "500"> <h2> Pipeline 1 </h2> </a>
___

In [4]:
# import modules
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [5]:
# Read the Pipeline 1 feature table from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="pipeline_1.csv")

In [6]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,Q4,VisitorType_New_Visitor,Q3,TrafficType_2,TrafficType_8,TrafficType_3,PageValues_iqr_yj_zscore,Q1,TrafficType_13,ExitRates_iqr_yj_zscore,...,SpecialDay_0.8,Month_Feb,Browser_6,SpecialDay_0.4,TrafficType_20,Informational_Duration_pp_iqr_yj_zscore,Browser_12,OperatingSystems_7,TrafficType_16,Revenue
0,0,0,0,0,0,0,-0.529467,1,0,1.71049,...,0,1,0,0,0,-0.491728,0,0,0,0
1,0,0,0,1,0,0,-0.529467,1,0,1.676643,...,0,1,0,0,0,-0.491728,0,0,0,0
2,0,0,0,0,0,1,-0.529467,1,0,1.71049,...,0,1,0,0,0,-0.491728,0,0,0,0
3,0,0,0,0,0,0,-0.529467,1,0,1.71049,...,0,1,0,0,0,-0.491728,0,0,0,0
4,0,0,0,0,0,0,-0.529467,1,0,0.796311,...,0,1,0,0,0,-0.491728,0,0,0,0


In [7]:
# Summary statistics per column, rounded to two decimals for readability
df.describe().round(decimals=2)

Unnamed: 0,Q4,VisitorType_New_Visitor,Q3,TrafficType_2,TrafficType_8,TrafficType_3,PageValues_iqr_yj_zscore,Q1,TrafficType_13,ExitRates_iqr_yj_zscore,...,SpecialDay_0.8,Month_Feb,Browser_6,SpecialDay_0.4,TrafficType_20,Informational_Duration_pp_iqr_yj_zscore,Browser_12,OperatingSystems_7,TrafficType_16,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,...,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,0.43,0.14,0.11,0.32,0.03,0.17,0.0,0.17,0.06,0.0,...,0.03,0.01,0.01,0.02,0.02,-0.0,0.0,0.0,0.0,0.15
std,0.49,0.34,0.31,0.47,0.16,0.37,1.0,0.38,0.24,1.0,...,0.16,0.12,0.12,0.14,0.13,1.0,0.03,0.02,0.02,0.36
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.53,0.0,0.0,-1.87,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,-0.53,0.0,0.0,-0.78,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,-0.53,0.0,0.0,-0.16,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,0.0,0.0,-0.53,0.0,0.0,0.8,...,0.0,0.0,0.0,0.0,0.0,-0.49,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.98,1.0,1.0,1.71,...,1.0,1.0,1.0,1.0,1.0,2.08,1.0,1.0,1.0,1.0


In [8]:
# ---------------------------------------------------------------------------------------------------
# Build the list of feature columns:
# every column of df except the target variable 'Revenue' and the original object
# features that were label-encoded ('Month' and 'VisitorType').
# ---------------------------------------------------------------------------------------------------
## all columns present in the dataset
list_all_columns = df.columns.tolist()

## columns that must not be used as features
list_remove_features = ['Month', 'VisitorType', 'Revenue']

# Work with sets: names in list_remove_features that are not columns of df are simply ignored
setA = set(list_all_columns)
setB = set(list_remove_features)

# Columns that are in df but not in the removal list
setlist_X_columns = setA - setB

# Sets are unordered, so sort alphabetically BEFORE selecting from df; otherwise the
# column order of X could differ from one kernel session to the next.
list_X_columns = sorted(setlist_X_columns)

# Feature matrix (independent variables) as a NumPy array, columns ordered as in list_X_columns
X = df[list_X_columns].to_numpy()

# show the feature columns (already sorted alphabetically)
list_X_columns

['Administrative_Duration_iqr_yj_zscore',
 'Browser_12',
 'Browser_6',
 'ExitRates_iqr_yj_zscore',
 'Informational_Duration_pp_iqr_yj_zscore',
 'Month_Feb',
 'OperatingSystems_3',
 'OperatingSystems_7',
 'PageValues_iqr_yj_zscore',
 'Q1',
 'Q3',
 'Q4',
 'SpecialDay_0.4',
 'SpecialDay_0.8',
 'TrafficType_1',
 'TrafficType_13',
 'TrafficType_16',
 'TrafficType_2',
 'TrafficType_20',
 'TrafficType_3',
 'TrafficType_8',
 'VisitorType_New_Visitor']

In [9]:
# Features (X) are the selected columns; the target (Y) is the 'Revenue' column
X, Y = df[list_X_columns], df['Revenue']

In [10]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

In [11]:
# Create the Decision Tree classifier object.
# random_state is fixed so that the fitted tree, and therefore every metric computed from
# y_pred, is the same on each run; it reuses the seed already passed to train_test_split.
clf = DecisionTreeClassifier(random_state=2019)

# Train the Decision Tree classifier on the training split
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [12]:
# Fraction of test rows whose predicted label matches the true label
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.857529061908624


In [13]:
# Per-class precision, recall, f1-score and support on the test set
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      3131
           1       0.54      0.53      0.53       568

    accuracy                           0.86      3699
   macro avg       0.73      0.72      0.72      3699
weighted avg       0.86      0.86      0.86      3699



In [14]:
# Confusion matrix on the test set (rows: true class, columns: predicted class)
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

[[2870  261]
 [ 266  302]]


<a id = "501"> <h2> Pipeline 2 </h2> </a>
___

In [3]:
# Read the Pipeline 2 feature table from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="pipeline_2.csv")

<a id = "502"> <h2> Pipeline 3 </h2> </a>
___

In [3]:
# Read the Pipeline 3 feature table from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="pipeline_3.csv")