In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-conversion
# warnings from the libraries used in this notebook — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark: findspark locates the local Spark
# installation so that the pyspark package can be imported by later cells.
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Start (or attach to) the Spark session for this notebook and keep a handle
# on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Naive Bayes Example")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# Load the CSV dataset: the first row supplies the column names and the
# column types are inferred from the data.
sdf = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('datasets/Social_Network_Ads.csv')
)

In [None]:
# Preview the first rows of the loaded dataset to check the parsed columns.
sdf.show()

In [None]:
from pyspark.ml.feature import StringIndexer
# Encode the categorical 'Gender' column as a numeric index column.
# Dropping any 'Gender_numeric' left over from a previous run keeps this cell
# re-runnable: StringIndexer rejects an input that already contains its output
# column, and drop() is a no-op when the column is absent.
sdf = sdf.drop('Gender_numeric')
indexer = StringIndexer(inputCol='Gender', outputCol="Gender_numeric").fit(sdf)
sdf = indexer.transform(sdf)
# Show the original and encoded values side by side to verify the mapping.
sdf.select('Gender', 'Gender_numeric').show()

### Select Features

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack the model inputs into a single vector column named 'features',
# then keep only that vector and the 'Purchased' target.
vassemb = VectorAssembler(
    inputCols=['Gender_numeric', 'Age', 'EstimatedSalary'],
    outputCol='features',
)
ndf = vassemb.transform(sdf).select(['features', 'Purchased'])
ndf.show(3)

### Standardize Features

In [None]:
from pyspark.ml.feature import StandardScaler
# Scale each feature by its standard deviation without centering; the result
# is written to a new 'stdFeatures' column.
# NOTE(review): presumably centering is disabled so the values stay
# non-negative for the Naive Bayes model trained later — confirm.
scaler = StandardScaler(
    inputCol="features",
    outputCol="stdFeatures",
    withMean=False,
    withStd=True,
)

In [None]:
# Learn the per-feature scaling statistics.
# NOTE(review): the scaler is fit on all of ndf, before the train/test split
# made later in the notebook, so test rows influence the scaling — consider
# fitting on the training split only.
scalerModel = scaler.fit(ndf)

In [None]:
# Apply the fitted scaler to the assembled feature data.
scaledData = scalerModel.transform(ndf)

In [None]:
# Show the rows with full, untruncated column contents so the raw and scaled
# vectors can be compared.
scaledData.show(truncate = False)

### Naive Bayes expects the input columns to be named `features` and `label`

In [None]:
# Rename columns to the names the classifier reads: 'label' for the target and
# 'features' for the standardized vector; the unscaled vector is kept as
# 'original'.
# The guard makes the cell safe to re-run: once 'stdFeatures' has been renamed,
# repeating the renames would move the scaled vector to 'original' and leave
# no 'features' column.
if 'stdFeatures' in scaledData.columns:
    scaledData = (
        scaledData
        .withColumnRenamed('Purchased', 'label')
        .withColumnRenamed('features', 'original')
        .withColumnRenamed('stdFeatures', 'features')
    )

In [None]:
# Preview the data with its final column names.
scaledData.show()

### Create Training and Test data

In [None]:
# Randomly split the rows roughly 70/30 into training and test sets; the fixed
# seed makes the split repeatable.
trainingData, testData = scaledData.randomSplit(weights=[0.7, 0.3], seed=2345)

### Create Model

In [None]:
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
# Create the Naive Bayes estimator with all-default parameters; the columns
# were renamed earlier so the estimator finds its 'features' and 'label' inputs.
nbayes = NaiveBayes()
# Display the estimator object as the cell output.
nbayes

In [None]:
# Train the Naive Bayes model on the training split.
nbayes_model = nbayes.fit(trainingData)

In [None]:
# Score the held-out test split. Despite the name, y_pred is the full
# transformed DataFrame (labels, features and prediction columns), not just the
# predicted labels.
y_pred = nbayes_model.transform(testData)

In [None]:
# List the columns present in the scored DataFrame.
y_pred.printSchema()

In [None]:
# Print the true label, predicted label and feature vector side by side for up
# to 200 test rows.
y_pred.select('label', 'prediction', 'features').show(200)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test-set predictions with the area under the ROC curve.
# The evaluator needs the model's continuous scores: feeding it the hard 0/1
# 'prediction' column collapses the ROC curve to a single operating point, so
# the reported area no longer reflects how well the model ranks examples.
# 'rawPrediction' is the per-class score vector that the fitted model's
# transform() adds to its output.
evaluator = BinaryClassificationEvaluator(rawPredictionCol = 'rawPrediction', labelCol = 'label',
                                          metricName = 'areaUnderROC')

evaluation = evaluator.evaluate(y_pred)

# Area under ROC curve
print("Area under ROC = %s" % evaluation)


In [None]:
# Collect the true labels of the test rows into a single-column pandas DataFrame.
y_true = y_pred.select('label').toPandas()

In [None]:
# Collect the predicted labels of the test rows into a single-column pandas DataFrame.
y_p = y_pred.select('prediction').toPandas()

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are true classes, columns are predicted classes.
# Passing labels=[0, 1] pins the matrix to 2x2 even when the test split or the
# predictions contain only one class; without it scikit-learn sizes the matrix
# from the values it happens to see, so code that indexes it as 2x2 can fail.
# Assumes the binary label is encoded as 0/1.
cm = confusion_matrix(y_true, y_p, labels = [0, 1])
cm

In [None]:
# Summarise the confusion matrix: the diagonal holds the correct predictions
# and everything off the diagonal is an error. Using trace()/sum() instead of
# fixed [0, 0]/[1, 1] indices keeps this working when the matrix is not 2x2
# (for example 1x1 when only one class appears in the test data).
correct = int(cm.trace())
total = int(cm.sum())
error = total - correct
print('Correct predictions: {} of {}'.format(correct, total))
print('Errored predictions: {} of {}'.format(error, total))