# Logistic Regression Using the Social Network Advertising Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every Python warning from here on (presumably to keep the
# notebook output uncluttered — this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark.
# findspark.init() runs before any pyspark import in the cells below
# (presumably so the local Spark installation can be located — confirm).
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Create (or reuse) the Spark session for this notebook and keep a handle
# on its SparkContext so the log level can be adjusted.
builder = SparkSession.builder.appName("Logistic Regression Example")
spark = builder.getOrCreate()
sc = spark.sparkContext
# Reduce Spark's console logging to the "Error" level.
sc.setLogLevel("Error")

## Load Data

In [None]:
# Load the CSV dataset into a Spark DataFrame. The first row is treated as
# the header and column types are inferred from the data.
csv_reader = spark.read.format('csv').options(header='true', inferSchema='true')
sdf = csv_reader.load('../datasets/Social_Network_Ads.csv')

In [None]:
# Preview the first 15 rows of the loaded DataFrame.
sdf.show(15)

## Convert String to Numbers

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string column 'Gender' as a numeric index column
# 'Gender_numeric': configure the estimator, fit it on the data, then
# apply the fitted model to add the new column.
gender_indexer = StringIndexer(inputCol='Gender', outputCol="Gender_numeric")
indexer = gender_indexer.fit(sdf)
sdf = indexer.transform(sdf)

# Show the original and encoded columns side by side.
sdf.select('Gender', 'Gender_numeric').show()

In [None]:
# Inspect the schema, which now includes the 'Gender_numeric' column.
sdf.printSchema()

## Select Features

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the three predictor columns into a single vector column named
# 'features', then keep only that vector and the 'Purchased' label.
feature_cols = ['Gender_numeric', 'Age', 'EstimatedSalary']
vassemb = VectorAssembler(inputCols=feature_cols, outputCol='features')
ndf = vassemb.transform(sdf).select(['features', 'Purchased'])
ndf.show(3)

## Standardize the features

In [None]:
from pyspark.ml.feature import StandardScaler

# Scaler configuration: read 'features', write 'stdFeatures'. Scaling by
# the standard deviation is enabled, mean-centering is disabled.
scaler = StandardScaler(
    inputCol="features",
    outputCol="stdFeatures",
    withStd=True,
    withMean=False,
)

In [None]:
# Fit the scaler to obtain the scaling statistics.
# NOTE(review): the scaler is fitted on the full dataset `ndf`, i.e. before
# the train/test split that is later applied to the scaled data. Fitting on
# the training split only would avoid leaking test-set statistics into
# preprocessing — confirm whether that matters for this example.
scalerModel = scaler.fit(ndf)

In [None]:
# Apply the fitted scaler; the result carries the 'stdFeatures' column.
scaledData = scalerModel.transform(ndf)

In [None]:
# Preview five rows without truncating the vector columns.
scaledData.show(5, truncate = False)

## Create Training and Test sets

In [None]:
# Randomly split the scaled data with weights 0.7 / 0.3 (train / test);
# the fixed seed keeps the split repeatable across runs.
(trainingData, testData) = scaledData.randomSplit([0.7, 0.3], seed = 2345)

In [None]:
# Confirm the columns available in the training split.
trainingData.printSchema()

## Model Training using Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic-regression estimator: trains on the scaled feature vector
# against the 'Purchased' label, with at most 10 optimizer iterations.
lr = LogisticRegression(
    featuresCol='stdFeatures',
    labelCol='Purchased',
    maxIter=10,
)

In [None]:
# Fit the estimator on the training split to obtain the trained model.
lr_model = lr.fit(trainingData)


<font color='teal'><h2>ROC Curve</h2></font>
<span style="font-family:times, serif; font-size:16pt; font-style:italic">

<ul>
    <li>The ROC (Receiver Operating Characteristic) curve is a visual way to measure the performance of a binary classifier</li>
    <li>Created by plotting True Positive Rate (TPR or recall) against False Positive Rate (FPR)</li>
</ul>
</span>
<font color='teal'><h2>AUC - Area Under the ROC curve</h2></font>
<span style="font-family:times, serif; font-size:16pt; font-style:italic">
<ul>
    <li>AUC is a good measure of performance of the classifier</li>
    <li>If it is near 0.5, the classifier is not much better than random guessing</li>
    <li>The classifier gets better as the AUC gets closer to 1</li>
    <li>Since our value is close to 1, it indicates that the classifier is good
at minimizing false positives (not purchased classified as purchased) and false negatives
(purchased classified as not purchased)</li> 
</ul>
</span>


In [None]:
# Plot the ROC curve of the fitted model using its training summary.
trainSummary = lr_model.summary
auc = str(np.round(trainSummary.areaUnderROC, 4))
roc = trainSummary.roc.toPandas()

# Diagonal reference line for comparison with the model's curve.
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.plot(roc['FPR'], roc['TPR'], label="Train AUC " + auc)

# The curve is drawn with FPR on the x-axis and TPR on the y-axis, so the
# axis labels must follow the same order (they were previously swapped).
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Plot the precision-recall curve from the training summary:
# recall on the x-axis, precision on the y-axis.
pr = trainSummary.pr.toPandas()
recall_values, precision_values = pr['recall'], pr['precision']
plt.plot(recall_values, precision_values)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

## Print the model statistics based on training data

In [None]:
# Collect the training-summary metrics, each rounded to 4 decimal places.
# weightedFMeasure is a method and is called with its default arguments;
# the other metrics are read as attributes.
recall = np.round(trainSummary.weightedRecall, 4)
precision = np.round(trainSummary.weightedPrecision, 4)
fMeasure = np.round(trainSummary.weightedFMeasure(), 4)
truePositiveRate = np.round(trainSummary.weightedTruePositiveRate, 4)
falsePositiveRate = np.round(trainSummary.weightedFalsePositiveRate, 4)
accuracy = np.round(trainSummary.accuracy, 4)

# Print one metric per line.
report_template = "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
report_values = (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)
print(report_template % report_values)

## Make Predictions

In [None]:
# Score the held-out test split with the trained model, then preview the
# label, predicted class, class probabilities and input features.
predictions = lr_model.transform(testData)
preview_cols = ['Purchased', 'prediction', 'probability', 'stdFeatures']
predictions.select(preview_cols).show(10)

## Model Evaluation

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test-set predictions against the 'Purchased' label.
# No metric is set explicitly, so the evaluator's default is used
# (per the Spark docs this is areaUnderROC, matching the printed label).
evaluator = BinaryClassificationEvaluator(labelCol='Purchased')
test_auc = evaluator.evaluate(predictions)
print('Test Area Under ROC', test_auc)

## Create Confusion Matrix

In [None]:
# NOTE(review): these imports and the warning filter repeat the notebook's
# first cell. They are harmless but redundant when cells run in order.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pull the true labels of the test predictions into a pandas DataFrame.
y_true = predictions.select('Purchased').toPandas()

In [None]:
# Pull the predicted classes of the test set into a pandas DataFrame.
y_p = predictions.select('prediction').toPandas()

In [None]:
from sklearn.metrics import confusion_matrix

# Build the confusion matrix: rows are true classes, columns are predicted
# classes. The bare expression on the last line displays it in the notebook.
cm = confusion_matrix(y_true=y_true, y_pred=y_p)
cm

In [None]:
# Split the 2x2 confusion matrix into its cells: the diagonal holds the
# correct predictions, the off-diagonal cells hold the errors.
true_neg, true_pos = cm[0, 0], cm[1, 1]
false_pos, false_neg = cm[0, 1], cm[1, 0]

correct = true_neg + true_pos
error = false_pos + false_neg
total = correct + error

print('Correct predictions: {} of {}'.format(correct, total))
print('Errored predictions: {} of {}'.format(error, total))

In [None]:
# Plot the confusion matrix as a heatmap: each cell is annotated with its
# value (annot=True) and shaded with matplotlib's Blues colormap.
import seaborn as sn
sn.heatmap(cm, annot=True, cmap=plt.cm.Blues)
plt.show()