## Naive Bayes ## 

In [1]:
# import modules
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [2]:
# load the online-shoppers dataset from its CSV file into a pandas dataframe
csv_path = "data/online_shoppers_intention.csv"
df = pd.read_csv(csv_path)

### Transform Object Type Features with Label Encoder ###

> Label-encode the object-type features (`Month` and `VisitorType`) into integer codes

In [3]:
# Create a single LabelEncoder instance.
# The encoding cell that follows calls fit_transform on it once per column,
# so the encoder is refit each time and only retains the classes of the last column.
le = preprocessing.LabelEncoder()

In [13]:
# Replace the string categories by integer codes, writing the codes to a new
# '<column>_lbl' column and leaving the original column untouched.
# Pattern used: df[new_col] = le.fit_transform(df[col].values)
# source: https://stackoverflow.com/questions/50258960/how-to-apply-labelencoder-for-a-specific-column-in-pandas-dataframe
encoded_column_for = {
    'Month': 'Month_lbl',
    'VisitorType': 'VisitorType_lbl',
}

# le is refit on every pass; after the loop it holds the VisitorType classes
for source_col, encoded_col in encoded_column_for.items():
    df[encoded_col] = le.fit_transform(df[source_col].values)

In [21]:
# sanity check: show each original column next to its encoded counterpart (last 5 rows)
columns_to_compare = ['Month', 'Month_lbl', 'VisitorType', 'VisitorType_lbl']
df[columns_to_compare].tail(5)

Unnamed: 0,Month,Month_lbl,VisitorType,VisitorType_lbl
12325,Dec,1,Returning_Visitor,2
12326,Nov,7,Returning_Visitor,2
12327,Nov,7,Returning_Visitor,2
12328,Nov,7,Returning_Visitor,2
12329,Nov,7,New_Visitor,0


In [6]:
# lookup table Month -> Month_lbl: take the first encoded value found in each Month group
# https://stackoverflow.com/questions/48131812/get-unique-values-of-multiple-columns-as-a-new-dataframe-in-pandas
rows_by_month = df.groupby(by=['Month'], as_index=False)
rows_by_month['Month_lbl'].first()

Unnamed: 0,Month,Month_lbl
0,Aug,0
1,Dec,1
2,Feb,2
3,Jul,3
4,June,4
5,Mar,5
6,May,6
7,Nov,7
8,Oct,8
9,Sep,9


In [7]:
# lookup table VisitorType -> VisitorType_lbl: first encoded value found in each VisitorType group
rows_by_visitor_type = df.groupby(by=['VisitorType'], as_index=False)
rows_by_visitor_type['VisitorType_lbl'].first()

Unnamed: 0,VisitorType,VisitorType_lbl
0,New_Visitor,0
1,Other,1
2,Returning_Visitor,2


> Declare Features and Target

In [8]:
# list every column name of the dataframe (including the new '_lbl' columns)
df.columns.tolist()

['Administrative',
 'Administrative_Duration',
 'Informational',
 'Informational_Duration',
 'ProductRelated',
 'ProductRelated_Duration',
 'BounceRates',
 'ExitRates',
 'PageValues',
 'SpecialDay',
 'Month',
 'OperatingSystems',
 'Browser',
 'Region',
 'TrafficType',
 'VisitorType',
 'Weekend',
 'Revenue',
 'Month_lbl',
 'VisitorType_lbl']

In [9]:
# ---------------------------------------------------------------------------------------------------
# Build the list of feature columns.
# Excluded: the target variable 'Revenue' and the original object features 'Month' and
# 'VisitorType', which are represented by their label-encoded '_lbl' columns instead.
# ---------------------------------------------------------------------------------------------------
## all columns currently in the dataframe
list_all_columns = df.columns.tolist()

## columns that must not be used as model features
list_remove_features = ['Month', 'VisitorType', 'Revenue']

# Set difference: columns that are in the dataframe (A) but not in the removal list (B)
setA = set(list_all_columns)
setB = set(list_remove_features)
setlist_X_columns = setA - setB

# A set has no defined order (string hashing can differ between interpreter runs),
# so sort alphabetically BEFORE the list is used to select columns. This keeps the
# column order of X reproducible and identical to the list displayed at the end of the cell.
list_X_columns = sorted(setlist_X_columns)

# Feature matrix (independent variables) as a NumPy array, columns in list_X_columns order
X = df[list_X_columns].to_numpy()

## show the columns that are included as features
list_X_columns

['Administrative',
 'Administrative_Duration',
 'BounceRates',
 'Browser',
 'ExitRates',
 'Informational',
 'Informational_Duration',
 'Month_lbl',
 'OperatingSystems',
 'PageValues',
 'ProductRelated',
 'ProductRelated_Duration',
 'Region',
 'SpecialDay',
 'TrafficType',
 'VisitorType_lbl',
 'Weekend']

In [10]:
# Features (X) are the selected columns, target (Y) is the 'Revenue' column
X = df.loc[:, list_X_columns]
Y = df.loc[:, 'Revenue']

# Hold out 30% of the rows for testing and train on the remaining 70%;
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=2019,
)

In [11]:
# Build a Gaussian Naive Bayes classifier and train it on the training split
# (fit() returns the estimator itself, so creation and training are chained)
gnb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test rows
y_pred = gnb.predict(X_test)

In [12]:
# Model accuracy: fraction of test rows whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Accuracy alone does not show how each class is handled (a model that favours the
# more frequent class can still score well), so also report per-class
# precision / recall / F1 and the confusion matrix (rows = true label, columns = predicted).
# NOTE(review): the class balance of 'Revenue' is not shown in this notebook — check it
# (e.g. Y.value_counts(normalize=True)) before interpreting the accuracy figure.
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.8418491484184915
