# Random Forest #

<b> Table of Contents: </b>
<br> [Pipeline 1](#11)
<br> [Pipeline 2](#22)
<br> [Pipeline 3](#33)

Loading Modules

In [15]:
# import modules
import pandas as pd

#Import scikit-learn dataset library
from sklearn import datasets

# Import train_test_split function
from sklearn.model_selection import train_test_split

<a id = "11"> <h2> Pipeline 1 </h2> </a>
___

Loading the Dataset

In [16]:
#Load dataset
# read csv file to a pandas dataframe
df_pipeline1 = pd.read_csv("pipeline_1.csv")

In [17]:
# Extract needed features as a list of columns 
# excluding target variable 'Revenue' and original object features (that were labelencoded), namely 'Month' and VisitorType'
# ---------------------------------------------------------------------------------------------------
## create a list of all the columns
list_all_columns = df_pipeline1.columns.tolist()

## remove features in list, which are not needed for testing for feature selection, namely:
list_remove_features = ['Revenue'] 

# Create sets of a,b
setA = set(list_all_columns)
setB = set(list_remove_features)

# Get new set with elements that are only in a but not in b
setlist_X_columns = setA.difference(list_remove_features)

# convert set object to a list
list_X_columns = list(setlist_X_columns)

# Define dependent variables
X = df_pipeline1[list_X_columns].to_numpy()

#show the columns that need to be included as features
## sort alphabetically -- look up how to do this 
list_X_columns

['Q1',
 'VisitorType_New_Visitor',
 'Informational_Duration_pp_iqr_yj_zscore',
 'TrafficType_2',
 'Browser_12',
 'Q4',
 'OperatingSystems_3',
 'Browser_6',
 'Month_Feb',
 'ExitRates_iqr_yj_zscore',
 'TrafficType_16',
 'TrafficType_8',
 'PageValues_iqr_yj_zscore',
 'SpecialDay_0.8',
 'OperatingSystems_7',
 'TrafficType_20',
 'TrafficType_13',
 'TrafficType_3',
 'Q3',
 'TrafficType_1',
 'SpecialDay_0.4',
 'Administrative_Duration_iqr_yj_zscore']

In [18]:
# Define Features and Target variables
X = df_pipeline1[list_X_columns] # features vars
y = df_pipeline1['Revenue'] # target vars

In [19]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2019) # 70% training and 30% test

In [20]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

In [21]:
#check the predictive performance using the actual and predicted values
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93      3131
           1       0.66      0.51      0.57       568

    accuracy                           0.88      3699
   macro avg       0.79      0.73      0.75      3699
weighted avg       0.87      0.88      0.88      3699



<a id = "22"> <h2> Pipeline 2 </h2> </a>
___

Loading the Dataset

In [22]:
#Load dataset
# read csv file to a pandas dataframe
df_pipeline2 = pd.read_csv("pipeline_2.csv")

In [23]:
# Extract needed features as a list of columns 
# excluding target variable 'Revenue' and original object features (that were labelencoded), namely 'Month' and VisitorType'
# ---------------------------------------------------------------------------------------------------
## create a list of all the columns
list_all_columns = df_pipeline2.columns.tolist()

## remove features in list, which are not needed for testing for feature selection, namely:
list_remove_features = ['Revenue'] 

# Create sets of a,b
setA = set(list_all_columns)
setB = set(list_remove_features)

# Get new set with elements that are only in a but not in b
setlist_X_columns = setA.difference(list_remove_features)

# convert set object to a list
list_X_columns = list(setlist_X_columns)

# Define dependent variables
X = df_pipeline2[list_X_columns].to_numpy()

#show the columns that need to be included as features
## sort alphabetically -- look up how to do this 
list_X_columns

['TrafficType_15',
 'VisitorType_New_Visitor',
 'TrafficType_2',
 'Browser_12',
 'Month_Mar',
 'TrafficType_18',
 'Month_Nov',
 'OperatingSystems_3',
 'Month_May',
 'Month_Feb',
 'ProductRelated_mm_yj_stdev',
 'TrafficType_8',
 'SpecialDay_0.8',
 'PageValues_mm_yj_stdev',
 'TrafficType_13',
 'Administrative_Duration_mm_yj_stdev',
 'TrafficType_3',
 'Informational_mm_yj_stdev',
 'TrafficType_1',
 'add_exit_bounce_rates_mm_yj_stdev',
 'TrafficType_12']

In [24]:
# Define Features and Target variables
X = df_pipeline2[list_X_columns] # features vars
y = df_pipeline2['Revenue'] # target vars

In [25]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2019) # 70% training and 30% test

In [26]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

In [27]:
#check the predictive performance using the actual and predicted values
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94      3131
           1       0.68      0.55      0.61       568

    accuracy                           0.89      3699
   macro avg       0.80      0.75      0.77      3699
weighted avg       0.88      0.89      0.89      3699



<a id = "33"> <h2> Pipeline 3 </h2> </a>
___

Loading the Dataset

In [28]:
#Load dataset
# read csv file to a pandas dataframe
df_pipeline3 = pd.read_csv("pipeline_3.csv")

> Declare Features and Target

In [29]:
# show all columns in dataset
list(df_pipeline3.columns)[:]

['PageValues_yj_stdev_zscore',
 'Month_Nov',
 'VisitorType_New_Visitor',
 'TrafficType_15',
 'TrafficType_2',
 'TrafficType_3',
 'Browser_12',
 'Month_May',
 'TrafficType_16',
 'TrafficType_13',
 'OperatingSystems_6',
 'OperatingSystems_3',
 'Browser_13',
 'TrafficType_1',
 'Month_Mar',
 'SpecialDay_0.8',
 'TrafficType_8',
 'Month_Feb',
 'Administrative_Duration_yj_stdev_zscore',
 'Browser_6',
 'OperatingSystems_7',
 'ProductRelated_Duration_yj_stdev_zscore',
 'Revenue']

In [30]:
# Extract needed features as a list of columns 
# excluding target variable 'Revenue' and original object features (that were labelencoded), namely 'Month' and VisitorType'
# ---------------------------------------------------------------------------------------------------
## create a list of all the columns
list_all_columns = df_pipeline3.columns.tolist()

## remove features in list, which are not needed for testing for feature selection, namely:
list_remove_features = ['Revenue'] 

# Create sets of a,b
setA = set(list_all_columns)
setB = set(list_remove_features)

# Get new set with elements that are only in a but not in b
setlist_X_columns = setA.difference(list_remove_features)

# convert set object to a list
list_X_columns = list(setlist_X_columns)

# Define dependent variables
X = df_pipeline3[list_X_columns].to_numpy()

#show the columns that need to be included as features
## sort alphabetically -- look up how to do this 
list_X_columns

['TrafficType_15',
 'PageValues_yj_stdev_zscore',
 'VisitorType_New_Visitor',
 'TrafficType_2',
 'Browser_12',
 'Month_Mar',
 'Month_Nov',
 'OperatingSystems_3',
 'Browser_13',
 'Month_May',
 'Browser_6',
 'Month_Feb',
 'OperatingSystems_6',
 'TrafficType_16',
 'Administrative_Duration_yj_stdev_zscore',
 'SpecialDay_0.8',
 'TrafficType_8',
 'OperatingSystems_7',
 'ProductRelated_Duration_yj_stdev_zscore',
 'TrafficType_13',
 'TrafficType_3',
 'TrafficType_1']

In [31]:
# Define Features and Target variables
X = df_pipeline3[list_X_columns] # features vars
y = df_pipeline3['Revenue'] # target vars

In [32]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2019) # 70% training and 30% test

In [33]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf = RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

In [34]:
#check the predictive performance using the actual and predicted values
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93      3131
           1       0.62      0.54      0.58       568

    accuracy                           0.88      3699
   macro avg       0.77      0.74      0.75      3699
weighted avg       0.87      0.88      0.88      3699

