In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

In [None]:
# Create (or reuse) the Spark context and the SQL entry point used by the
# rest of the notebook.
conf = SparkConf().setMaster("local").setAppName("Transformation")
# getOrCreate() hands back the already-running context when this cell is
# executed again; calling SparkContext(conf=conf) a second time in the same
# kernel raises because only one SparkContext may be active at a time.
sc = SparkContext.getOrCreate(conf=conf)
spark = SQLContext(sc)

In [None]:
# Load the colors dataset; the first line supplies the column names and
# column types are inferred from the values.
data = spark.read.csv(
    './datasets/colors.csv',
    inferSchema=True,
    header=True,
)
data.show()

#### String Indexing


In [None]:
from pyspark.ml.feature import StringIndexer

# Estimator that reads the `color` column and writes `color_indexed`.
indexer = StringIndexer(
    outputCol="color_indexed",
    inputCol="color",
)

In [None]:
# Fit the indexer on the colors frame to obtain the model used for transforming.
indexer_model = indexer.fit(dataset=data)


In [None]:
# Apply the fitted indexer; the output column is `color_indexed`.
indexed_data = indexer_model.transform(dataset=data)

# Print the transformed rows for inspection.
indexed_data.show()

#### One Hot Encoding


In [None]:
from pyspark.ml.feature import OneHotEncoder

# Encoder that reads the indexed color column and writes `color_ohe`.
ohe = OneHotEncoder(
    outputCols=["color_ohe"],
    inputCols=["color_indexed"],
)

In [None]:
# Fit the encoder on the frame that already contains `color_indexed`.
ohe_model = ohe.fit(dataset=indexed_data)


In [None]:
# Apply the fitted encoder (output column `color_ohe`) and print the result.
encoded_data = ohe_model.transform(dataset=indexed_data)

encoded_data.show()

In [None]:
# Same encoder columns as before, but with dropLast=False.
# NOTE: this only rebinds `ohe`; nothing in this cell fits or applies it, so
# `ohe_model` and `encoded_data` still come from the earlier encoder.
ohe = OneHotEncoder(
    dropLast=False,
    outputCols=["color_ohe"],
    inputCols=["color_indexed"],
)

#### Feature Scaling


In [None]:
# Load the wine dataset. header=False: the first line is read as data, not as
# column names. NOTE: this rebinds `data`, replacing the colors frame.
data = spark.read.csv(
    './datasets/wine.csv',
    inferSchema=True,
    header=False,
)
data.show()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine every column except the first into a single `features` vector
# column (presumably the first column is the label -- confirm against the CSV).
assembler = VectorAssembler(
    outputCol="features",
    inputCols=data.columns[1:],
)
data_2 = assembler.transform(dataset=data)

#### StandardScaler

In [None]:
from pyspark.ml.feature import StandardScaler

# Scaler that reads the assembled `features` column and writes
# `scaled_features`; all other parameters are left at the library defaults.
scaler = StandardScaler(
    outputCol="scaled_features",
    inputCol="features",
)

In [None]:
# Fit the scaler on the assembled frame.
scaler_model = scaler.fit(dataset=data_2)


In [None]:
# Apply the fitted scaler; the result is kept in `scaled_data` (not displayed here).
scaled_data = scaler_model.transform(dataset=data_2)


#### MinMaxScaler

In [None]:
from pyspark.ml.feature import MinMaxScaler

# Min-max scaler over `features` with target range min=0, max=1, writing
# `features_minmax`.
# NOTE: `scaler` and `scaler_model` are rebound here, replacing the
# StandardScaler objects created in the earlier cells.
scaler = MinMaxScaler(
    inputCol='features',
    outputCol='features_minmax',
    min=0,
    max=1,
)
scaler_model = scaler.fit(dataset=data_2)
data_3 = scaler_model.transform(dataset=data_2)

#### Principal Component Analysis


In [None]:
from pyspark.ml.feature import VectorAssembler

# Load the digits dataset (first line supplies column names) and assemble
# every column except the first into a `features` vector column.
# NOTE: `data`, `assembler` and `data_2` are rebound here, replacing the
# wine objects from the feature-scaling cells.
data = spark.read.csv(
    './datasets/digits.csv',
    inferSchema=True,
    header=True,
)
assembler = VectorAssembler(
    outputCol='features',
    inputCols=data.columns[1:],
)
data_2 = assembler.transform(dataset=data)

In [None]:
from pyspark.ml.feature import PCA

# PCA estimator with k=2, reading `features` and writing `features_pca`.
pca = PCA(
    inputCol='features',
    outputCol='features_pca',
    k=2,
)

In [None]:
# Fit PCA on the assembled digits frame.
pca_model = pca.fit(dataset=data_2)

In [None]:
# Apply the fitted PCA model and keep only the `features_pca` column.
pca_data = (
    pca_model
    .transform(dataset=data_2)
    .select('features_pca')
)

In [None]:
# Print the `features_pca` column selected in the previous cell.
pca_data.show()