## Module 9 - Introduction to Bayesian Inference

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import random
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

<b>Naive Bayesian Classifier</b><br>
Here, we'll train a Naive Bayesian classifier to predict whether or not a user makes a purchase from the social network ads, using the data introduced in Module 8.

In [2]:
# Read the ad data set from the CSV file in the working directory
ad_data = pd.read_csv("Social_Network_Ads.csv")

# Preview the first five rows (5 is the default for head())
ad_data.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


The first thing we have to do is encode the categorical and binary variables.

In [3]:
# Create the Data Frame
# Take an explicit copy of the selected columns: assigning into a bare column
# selection of ad_data is what raises pandas' SettingWithCopyWarning, because
# pandas cannot tell whether the write is meant for ad_data or for the subset.
X = ad_data[["Gender", "Age", "EstimatedSalary"]].copy()

# Replace the Gender strings with the "Female" indicator column produced by
# one-hot encoding (set for Female rows, unset otherwise).
X["Gender"] = pd.get_dummies(X["Gender"])["Female"]

# Target variable
Y = ad_data["Purchased"]

# Splitting the dataset into the Training set and Test set
# (25% held out for testing; random_state fixed so the split is reproducible)
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.25, random_state = 0)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Then we generate the model

In [4]:
# Create a Gaussian Naive Bayes classifier and fit it on the training split
# (fit() hands back the fitted estimator, so construction and fitting chain)
model = GaussianNB().fit(X_Train, Y_Train)

# Score the test split: per-class probabilities and hard class predictions
probabilities_NB = model.predict_proba(X_Test)
predictions_NB = model.predict(X_Test)

We're going to compare the performance of the Naive Bayes model with a Logistic Regression model.

In [5]:
# Feature Scaling - This is not necessary, but it is highly recommended.
# The features are cast to float64 explicitly before scaling: StandardScaler
# converts the integer columns to float64 anyway, and doing the cast ourselves
# avoids the dtype-conversion warning it otherwise emits on each call.
sc_X = StandardScaler()
X_Train_LR = sc_X.fit_transform(X_Train.astype("float64"))
# The test split is only transformed, using the statistics fitted on the training split
X_Test_LR = sc_X.transform(X_Test.astype("float64"))

# Fitting the Logistic Regression into the Training set
classifier = LogisticRegression()
classifier.fit(X_Train_LR, Y_Train)

# Make Predictions (hard class labels and per-class probabilities)
predictions_LR = classifier.predict(X_Test_LR)
probabilities_LR = classifier.predict_proba(X_Test_LR)


Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.


Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.


Data with input dtype uint8, int64 were all converted to float64 by StandardScaler.





Put the data together into a single data frame

In [6]:
# Gather both models' test-set outputs and the true labels in one frame.
# For each model we keep the hard prediction and column 1 of its
# predict_proba output.
results_df = pd.DataFrame(
    {
        "NB Predictions": predictions_NB,
        "NB Probabilities": probabilities_NB[:, 1],
        "LR Predictions": predictions_LR,
        "LR Probabilities": probabilities_LR[:, 1],
        "True Values": Y_Test,
    }
)

# Preview the first five rows
results_df.head(n=5)

Unnamed: 0,NB Predictions,NB Probabilities,LR Predictions,LR Probabilities,True Values
132,0,0.098972,0,0.124345,0
309,0,0.157497,0,0.15721,0
341,0,0.138574,0,0.212761,0
196,0,0.077316,0,0.083687,0
246,0,0.099409,0,0.091364,0


Visualize the performance

In [7]:
# Use sklearn to get us the fpr and tpr at different cutoffs.
# roc_curve returns (fpr, tpr, thresholds) in that order, so the false
# positive rate must be unpacked first.
fpr_nb, tpr_nb, _ = metrics.roc_curve(results_df["True Values"], results_df["NB Probabilities"])
fpr_lr, tpr_lr, _ = metrics.roc_curve(results_df["True Values"], results_df["LR Probabilities"])

# ROC curve of the Naive Bayes model: FPR on x, TPR on y
trace0 = go.Scatter(
    x = fpr_nb,
    y = tpr_nb,
    mode = 'lines',
    name = 'NB Predictor'
)

# ROC curve of the Logistic Regression model
trace1 = go.Scatter(
    x = fpr_lr,
    y = tpr_lr,
    mode = 'lines',
    name = 'LR Predictor'
)

# Chance-level reference: the diagonal from (0, 0) to (1, 1)
trace2 = go.Scatter(
    x = [0, 1],
    y = [0, 1],
    mode='lines',
    name='Random',
    line=dict(
        dash="dash",
        color="grey")
)

data = [trace0, trace1, trace2]

layout = dict(title = 'ROC Curve for Ad Purchases',
              xaxis = dict(title = 'False Positive Rate'),
              yaxis = dict(title = 'True Positive Rate'),
              )

fig = dict(data=data, layout=layout)
iplot(fig)

# Summarise each curve with its area under the ROC curve
auc_nb = metrics.roc_auc_score(results_df["True Values"], results_df["NB Probabilities"])
auc_lr = metrics.roc_auc_score(results_df["True Values"], results_df["LR Probabilities"])
print("Area Under the ROC Curve (AUC) is %.3f for Naive Bayes, and %.3f for Logistic Regression" % (auc_nb, auc_lr))

Area Under the ROC Curve (AUC) is 0.964 for Naive Bayes, and 0.955 for Logistic Regression


In [8]:
# Use sklearn to get us the precision and recall at different cutoffs
# (precision_recall_curve returns precision, recall, thresholds in that order)
precision_NB, recall_NB, _ = metrics.precision_recall_curve(
    results_df["True Values"], results_df["NB Probabilities"]
)
precision_LR, recall_LR, _ = metrics.precision_recall_curve(
    results_df["True Values"], results_df["LR Probabilities"]
)

# One line per model: recall on x, precision on y
trace0 = go.Scatter(x=recall_NB, y=precision_NB, mode="lines", name="NB Predictor")
trace1 = go.Scatter(x=recall_LR, y=precision_LR, mode="lines", name="LR Predictor")

data = [trace0, trace1]

# Figure title and axis labels
layout = {
    "title": "Precision-Recall Curve for Ad Purchases",
    "xaxis": {"title": "Recall"},
    "yaxis": {"title": "Precision"},
}

# Render the figure inline in the notebook
fig = {"data": data, "layout": layout}
iplot(fig)