# Principal Component Analysis with the Iris Dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress Python warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')

In [None]:
# Set up the environment for using pyspark.
# findspark.init() is called before the pyspark imports in the next cell;
# presumably it locates the local Spark install and makes pyspark importable
# (see the findspark docs to confirm).
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Start (or reuse) the Spark session for this notebook and keep a handle
# on its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("PCA Example")
    .getOrCreate()
)
sc = spark.sparkContext

In [None]:
# NOTE: the CSV file has no header row, so the column names are assigned
# explicitly right after loading.
sdf = spark.read.csv('../datasets/iris.csv').toDF(
    'Sepal_L',
    'Sepal_W',
    'Petal_L',
    'Petal_W',
    'Species',
)

In [None]:
# Print the first rows to eyeball the data and the column names assigned above
sdf.show()

In [None]:
# Inspect the column types of the freshly loaded DataFrame
sdf.printSchema()

In [None]:
# List the distinct values found in the Species column
sdf.select("Species").distinct().show()

### Since the feature columns are read as strings, convert them to floating-point numbers

In [None]:
from pyspark.sql.functions import col

# Every column except the trailing 'Species' label is a numeric feature.
# `cols` is reused later as the input column list for the VectorAssembler.
cols = sdf.columns[:-1]

# The feature columns were loaded as strings, so cast them to numbers.
# - Cast to 'double' rather than 'float': Spark ML vectors hold doubles, and
#   widening a 32-bit float introduces artefacts (5.1 -> 5.099999904632568).
# - Use a single select() instead of withColumn() in a loop: the Spark docs
#   advise against repeated withColumn() calls because each one adds another
#   projection to the query plan.
sdf = sdf.select(
    *[col(col_name).cast('double').alias(col_name) for col_name in cols],
    col('Species'),
)

In [None]:
# Re-check the schema to confirm the feature columns are now numeric
sdf.printSchema()

### Select Features

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the feature columns listed in `cols` into one vector column, 'features'.
vassemb = VectorAssembler(inputCols=cols, outputCol='features')

# Assemble the vectors and keep only that column for the PCA step.
ndf = vassemb.transform(sdf).select(['features'])

# Preview three assembled rows without truncating the vector text.
ndf.show(3, truncate=False)

### Apply Principal Component Analysis (PCA)

In [None]:
from pyspark.ml.feature import PCA

In [None]:
# Configure PCA with k=2, reading from the assembler's output column and
# writing the result to 'pcaFeatures'.
pca = PCA(
    k=2,
    inputCol=vassemb.getOutputCol(),
    outputCol='pcaFeatures',
)

In [None]:
# Fit the PCA estimator on the assembled feature vectors
model = pca.fit(ndf)

In [None]:
# Apply the fitted model to the same data; the result carries the
# 'pcaFeatures' column that is selected further below
transformed_feature = model.transform(ndf)

In [None]:
# Inspect the schema of the transformed DataFrame
transformed_feature.printSchema()

In [None]:
# Show three rows of the PCA output column without truncating the vectors
transformed_feature.select('pcaFeatures').show(3, truncate = False)

<font color='teal'>
<h2>Now this transformed dataset can be used with other ML algorithms to make predictions</h2>
</font>