# PCA - Principal Component Analysis

# Wine Dataset
## Wines are categorized into 3 customer segments based on the features listed below:
### Features are:

<ol>
    <li>Alcohol</li>
    <li>Malic acid</li>
    <li>Ash</li>
    <li>Alcalinity of ash</li>
    <li>Magnesium</li>
    <li>Total phenols</li>
    <li>Flavanoids</li>
    <li>Nonflavanoid phenols</li>
    <li>Proanthocyanins</li>
    <li>Color intensity</li>
    <li>Hue</li>
    <li>OD280/OD315 of diluted wines</li>
    <li>Proline </li>
    </ol>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd

import warnings
# Ignore all warnings from this point on (no category or module filter is given,
# so this applies to every warning raised later in the notebook).
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark.
# init() is called with no arguments, so it relies on findspark's own defaults
# to locate the Spark installation (presumably SPARK_HOME -- confirm locally).
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Start (or reuse) the Spark session for this notebook, then expose its context
session_builder = SparkSession.builder.appName("PCA Wine Dataset")
spark = session_builder.getOrCreate()
sc = spark.sparkContext

## Data Exploration
1. Create dataframe from the Wine.csv file

In [None]:
# Read Wine.csv with a header row, letting Spark infer each column's type
sdf = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../datasets/Wine.csv')
)

In [None]:
# Preview: fetch only the first 3 rows and show them as a pandas DataFrame
first_rows = sdf.head(3)
df1 = pd.DataFrame(first_rows, columns=sdf.columns)
df1

In [None]:
# Pull the whole Spark DataFrame into a local pandas DataFrame for exploration
df = sdf.toPandas()

In [None]:
# Per-column summary statistics
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# (rows, columns)
df.shape

In [None]:
# Distinct segment labels present in the data
df['Customer_Segment'].unique()

In [None]:
# Pairwise correlation between the columns
corr = df.corr()
corr

In [None]:
# Column names and types as inferred by Spark when the CSV was loaded
sdf.printSchema()

<font color = 'tomato'>
<h2>Data preparation</h2>
1. Create features using Vector Assembler<br>
2. Standardize the data<br>
</font>

In [None]:
# Feature columns: every column except the last one
# (presumably the Customer_Segment label -- confirm against the CSV layout)
cols = sdf.columns[:-1]

In [None]:
# Pack the feature columns into a single vector column named 'features',
# then keep only the label and that vector
from pyspark.ml.feature import VectorAssembler
vassemb = VectorAssembler(outputCol='features', inputCols=cols)
ndf = vassemb.transform(sdf).select(['Customer_Segment', 'features'])
ndf.show(3, truncate=False)

In [None]:
from pyspark.ml.feature import StandardScaler
# Standardize every feature to zero mean AND unit variance.
# PCA is variance-based: with centering alone (withStd=False) the features
# measured on the largest numeric scale would dominate the principal
# components, so scaling to unit standard deviation is required here.
scaler = StandardScaler(inputCol="features", outputCol="stdFeatures",
                        withStd=True, withMean=True)

In [None]:
# Fit the scaler on the assembled feature vectors (the full dataset, before any split)
scalerModel = scaler.fit(ndf)

In [None]:
# Apply the fitted scaler; the result carries the scaler's output column 'stdFeatures'
scaledData = scalerModel.transform(ndf)

In [None]:
# Uncomment to preview the first 3 scaled vectors
#scaledData.select("stdFeatures").show(3, truncate = False)

## Apply Logistic Regression using scaled data (before doing PCA)

In [None]:
# 70/30 train/test split of the scaled data, with a fixed seed
train, test = scaledData.randomSplit([0.7, 0.3], seed=2345)
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")

In [None]:
from pyspark.ml.classification import LogisticRegression

# Baseline classifier on the scaled (pre-PCA) feature vectors, capped at 10 iterations
lr = LogisticRegression(featuresCol = 'stdFeatures', labelCol = 'Customer_Segment', maxIter=10)

In [None]:
# Train on the training split only
lr_model = lr.fit(train)

In [None]:
# Summary object of the fitted model; the metrics read from it describe the TRAINING data
trainSummary = lr_model.summary

In [None]:
# Training-set metrics taken from the baseline model's summary, rounded to 4 decimals
accuracy = np.round(trainSummary.accuracy, 4)
falsePositiveRate = np.round(trainSummary.weightedFalsePositiveRate, 4)
truePositiveRate = np.round(trainSummary.weightedTruePositiveRate, 4)
fMeasure = np.round(trainSummary.weightedFMeasure(), 4)
precision = np.round(trainSummary.weightedPrecision, 4)
recall = np.round(trainSummary.weightedRecall, 4)

# One metric per output line
report_template = "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
report_values = (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)
print(report_template % report_values)

In [None]:
# Score the held-out test split with the baseline model
predictions = lr_model.transform(test)

In [None]:
# Show the first few scored rows: true label, predicted label, class probabilities, raw features
predictions.select('Customer_Segment', 'prediction', 'probability', 'features').toPandas().head()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Score the test predictions against the true segment labels.
# No metricName is passed, so the evaluator's default metric is reported.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Customer_Segment',
    predictionCol='prediction',
)
evaluator.evaluate(predictions)

<font color = 'tomato'>
<h2>Apply Principal Component Analysis (PCA)</h2>
    <ol>
        <li>Create PCA instance (select number of components to 2), use stdFeatures</li>
        <li>Fit the instance to scaled data</li>
        <li>Transform with scaled data</li>
        <li>Once the full processing is completed, change the number of components to 4 and compare results</li>
    </ol>
            
</font>

In [None]:
from pyspark.ml.feature import PCA

In [None]:
# Reduce the scaler's output column to 4 principal components, written to 'pcaFeatures'
pca = PCA(k = 4, inputCol = scaler.getOutputCol(), outputCol = 'pcaFeatures')

In [None]:
# Learn the components from the scaled data.
# NOTE(review): this fits on the full dataset, before the train/test split below,
# so the rows later used for testing also influence the components.
model = pca.fit(scaledData)

In [None]:
# Project every row onto the learned components
transformed_feature = model.transform(scaledData)

In [None]:
# Preview the first 3 projected vectors
transformed_feature.select('pcaFeatures').show(3, truncate = False)

<font color = 'tomato'>
    <h2>Training and Test set </h2>
    <ol>
        <li>Create Training and test set for the transformed data</li>
    </ol>
</font>

In [None]:
# 70/30 train/test split of the PCA-transformed data, with a fixed seed.
# NOTE(review): the seed (100) differs from the pre-PCA split (2345), so the two
# models are likely evaluated on different test rows -- confirm this is intended.
trainingData, testData = transformed_feature.randomSplit([0.7, 0.3], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

In [None]:
# Print the column names and types of the training split
trainingData.printSchema()

<font color = 'tomato'>
    <h2>Use Logistic Regression </h2>
    <ol>
        <li>Create Logistic Regression instance</li>
        <li>Fit the transformed features</li>
        <li>Transform the test data with the fitted model</li>
        <li>Evaluate the model using multi-class classification evaluator</li>
    </ol>
</font>

In [None]:
from pyspark.ml.classification import LogisticRegression

# Same classifier settings as the baseline, but reading the PCA-reduced vectors
lr = LogisticRegression(featuresCol = 'pcaFeatures', labelCol = 'Customer_Segment', maxIter=10)

In [None]:
# Train on the PCA training split (rebinds lr_model, replacing the baseline model)
lr_model = lr.fit(trainingData)

In [None]:
# Summary object of the fitted model; the metrics read from it describe the TRAINING data
trainSummary = lr_model.summary

In [None]:
# Training-set metrics taken from the PCA model's summary, rounded to 4 decimals
accuracy = np.round(trainSummary.accuracy, 4)
falsePositiveRate = np.round(trainSummary.weightedFalsePositiveRate, 4)
truePositiveRate = np.round(trainSummary.weightedTruePositiveRate, 4)
fMeasure = np.round(trainSummary.weightedFMeasure(), 4)
precision = np.round(trainSummary.weightedPrecision, 4)
recall = np.round(trainSummary.weightedRecall, 4)

# One metric per output line
report_template = "Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
report_values = (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall)
print(report_template % report_values)

In [None]:
# Score the held-out PCA test split (rebinds predictions, replacing the baseline results)
predictions = lr_model.transform(testData)

In [None]:
# Show the first few scored rows: true label, predicted label, class probabilities, PCA vector
predictions.select('Customer_Segment', 'prediction', 'probability', 'pcaFeatures').toPandas().head()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Score the PCA model's test predictions against the true segment labels.
# No metricName is passed, so the evaluator's default metric is reported.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Customer_Segment',
    predictionCol='prediction',
)
evaluator.evaluate(predictions)

<font color = 'tomato'>
    <h2>Confusion Matrix </h2>
    <ol>
        <li>Create the predicted values as pandas dataframe</li>
        <li>Create the test values for Customer_Segment</li>
        <li>Create Confusion Matrix</li>
    </ol>
</font>

In [None]:
y_true = predictions.select('Customer_Segment')
y_true = y_true.toPandas()

In [None]:
y_p = predictions.select('prediction')
y_p = y_p.toPandas()

In [None]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate true vs. predicted segments
# (scikit-learn convention: rows are true labels, columns are predictions)
cm = confusion_matrix(y_true=y_true, y_pred=y_p)
cm

In [None]:
# Plot the confusion matrix
import seaborn as sn
sn.heatmap(cm, annot=True, cmap = 'Blues')
plt.show()