# Support Vector Machine - Classification

In [None]:
# Third-party numeric, plotting and dataframe libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
# Suppress all warnings issued through the `warnings` module for the rest of the session
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark
import findspark
# Called before any pyspark import below
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Build the Spark session for this notebook (an existing session is reused
# if one is already running), then set the log level on its SparkContext.
spark = (
    SparkSession.builder
    .appName("Support Vector Machine Classification Example")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Warn")

## Load Data

In [None]:
# Load the CSV dataset: the first row is treated as the header and the
# column types are inferred from the data.
sdf = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('../datasets/Social_Network_Ads.csv')
)

In [None]:
# Preview the loaded DataFrame
sdf.show()

In [None]:
# Remove the 'User ID' column; it is not one of the feature columns assembled below
sdf = sdf.drop('User ID')

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string-valued 'Gender' column as a numeric index column.
gender_indexer = StringIndexer(inputCol='Gender', outputCol="Gender_numeric")
indexer = gender_indexer.fit(sdf)
sdf = indexer.transform(sdf)

# Show the original and encoded columns side by side.
sdf.select('Gender', 'Gender_numeric').show()

## Select Features

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the predictor columns into a single vector column named 'features'.
vassemb = VectorAssembler(
    inputCols=['Gender_numeric', 'Age', 'EstimatedSalary'],
    outputCol='features',
)

# Keep only the assembled feature vector and the label column.
ndf = vassemb.transform(sdf).select(['features', 'Purchased'])
ndf.show(3)

## Standardize Features

In [None]:
from pyspark.ml.feature import StandardScaler

# Scale 'features' into 'stdFeatures' using the standard deviation only
# (withStd=True); the mean is not subtracted (withMean=False).
scaler = StandardScaler(
    inputCol="features",
    outputCol="stdFeatures",
    withStd=True,
    withMean=False,
)

In [None]:
# Fit the scaler on the assembled feature DataFrame
scalerModel = scaler.fit(ndf)

In [None]:
# Apply the fitted scaler; the result carries the 'stdFeatures' output column
scaledData = scalerModel.transform(ndf)

In [None]:
# Display the scaled rows without truncating long column values
scaledData.show(truncate = False)

## Create Training and Test data

In [None]:
# Split with weights 0.7 / 0.3 into training and test sets; the fixed seed
# keeps the split repeatable across runs.
trainingData, testData = scaledData.randomSplit(weights=[0.7, 0.3], seed=2345)

## Using Linear Kernel
## NOTE - Currently there is no support for non-linear kernels in Spark

In [None]:
from pyspark.ml.classification import LinearSVC

# Train on the standardized vectors produced by the StandardScaler step.
# When featuresCol is omitted the estimator reads the default 'features'
# column, which leaves the 'stdFeatures' column unused by the model.
svc = LinearSVC(
    featuresCol='stdFeatures',
    labelCol='Purchased',
    maxIter=10,
    regParam=0.1,
)

In [None]:
# Fit the linear SVM on the training split
svc_model = svc.fit(trainingData)

In [None]:
# Inspect the coefficients of the fitted model
svc_model.coefficients

In [None]:
# Inspect the intercept of the fitted model
svc_model.intercept

In [None]:
# Score the held-out test split with the fitted model
y_pred = svc_model.transform(testData)

In [None]:
# List the columns of the scored DataFrame
y_pred.printSchema()

In [None]:
# Show true labels next to the model's predictions and the scaled features
y_pred.select('Purchased', 'prediction', 'stdFeatures').show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate on the continuous scores in 'rawPrediction'. Passing the hard 0/1
# 'prediction' column instead gives the evaluator only a single threshold,
# so the reported area under ROC would not reflect how well the model ranks
# the examples.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Purchased',
    metricName='areaUnderROC',
)

evaluation = evaluator.evaluate(y_pred)

# Area under ROC curve
print("Area under ROC = %s" % evaluation)


In [None]:
y_true = y_pred.select('Purchased')
y_true = y_true.toPandas()

In [None]:
y_p = y_pred.select('prediction')
y_p = y_p.toPandas()

In [None]:
from sklearn.metrics import confusion_matrix
# Build the confusion matrix: rows are true labels, columns are predicted
# labels (scikit-learn convention)
cm = confusion_matrix(y_true, y_p)
cm

In [None]:
# Flatten the 2x2 matrix row by row: [0,0], [0,1], [1,0], [1,1].
# Diagonal cells are correct predictions, off-diagonal cells are errors.
tn, fp, fn, tp = cm.ravel()
correct = tn + tp
error = fp + fn
total = correct + error
print('Correct predictions: {} of {}'.format(correct, total))
print('Errored predictions: {} of {}'.format(error, total))

In [None]:
# Plot the confusion matrix
import seaborn as sn
# fmt='d' prints the integer counts exactly; the default annotation format
# ('.2g') would render counts of 100 or more in scientific notation.
sn.heatmap(cm, annot=True, fmt='d', cmap=plt.cm.Blues)
# Label the axes so the orientation of the matrix is unambiguous
# (rows = true labels, columns = predicted labels).
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()