# Data Scaling for Machine Learning

## Import Spark Libraries

In [None]:
# Set up the environment for using pyspark.
# findspark.init() has to run before the `import pyspark` in the next cell.
import findspark

findspark.init()

In [None]:
import pyspark

In [None]:
# Imports needed to create the Spark session (the session itself is built in the next cell)
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Build the application context: obtain (or reuse) a session named
# "Pima Diabetes", keep a handle on its SparkContext and set its log level
spark = (
    SparkSession.builder
    .appName("Pima Diabetes")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

In [None]:
# Load the Pima diabetes CSV; the first line is the header and the column
# types are inferred from the data
sdf = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../datasets/pima_diabetes.csv')
)

In [None]:
# Preview the loaded dataset (show() prints the first 20 rows by default)
sdf.show()

In [None]:
# Summary statistics for the first group of features.
# The 'Summary' column holds the name of each statistic.
first_stats_cols = ['Summary', 'num_preg', 'glucose_conc', 'diastolic_bp', 'thickness']
sdf.describe().select(*first_stats_cols).show()

In [None]:
# Summary statistics for the remaining features
# ('Summary' again holds the name of each statistic)
remaining_stats_cols = ['Summary', 'insulin', 'bmi', 'diab_pred', 'age', 'skin']
sdf.describe().select(*remaining_stats_cols).show()

## Most Spark ML algorithms require the features to be assembled into vectors

In [None]:
# Collect the dataset's column names into a list.
# The bare `cols` on the last line displays the list as the cell output.
cols = sdf.columns
cols

In [None]:
# Remove diabetes because that is our target variable.
# The list is rebuilt with a filter instead of calling cols.remove(): the
# in-place remove raised ValueError when this cell was executed a second time
# (the entry was already gone), so the cell was not safe to re-run.
cols = [name for name in cols if name != 'diabetes']
print(cols)

In [None]:
# Let us import the vector assembler.
# It is configured to combine every column listed in `cols` into a single
# vector column named "features".
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols,outputCol="features")

In [None]:
# Use the assembler's transform method to append the "features" vector column.
# The result is assigned back to `sdf`, so without the guard a second run of
# this cell would fail: the assembler refuses to write an output column that
# already exists. The guard makes the cell safe to re-run.
if "features" not in sdf.columns:
    sdf = assembler.transform(sdf)
sdf.select("features").show(truncate=False)

## MinMaxScaler
<br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">

<ul>
<li>When your data is comprised of attributes with varying scales, many machine learning algorithms
    can benefit from rescaling the attributes to all have the same scale</li>
<li>Often this is referred to as normalization and attributes are often rescaled into the range between 0 and 1</li>
</ul>

</span>

In [None]:
from pyspark.ml.feature import MinMaxScaler
# Scaler that reads the "features" vector column and writes "scaledFeatures".
# No min/max arguments are passed, so the scaler's default range is used.
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

In [None]:
# Fit the scaler on the assembled dataset to obtain a fitted model
scalerModel = scaler.fit(sdf)

In [None]:
# Apply the fitted model; this adds the scaler's output column to the dataset
scaledData = scalerModel.transform(sdf)

In [None]:
# Report the target range configured on the scaler
range_min, range_max = scaler.getMin(), scaler.getMax()
print("Features scaled to range: [%f, %f]" % (range_min, range_max))

In [None]:
# Preview the first five rows of scaled vs. original features.
# limit(5) is applied before toPandas() so only those rows are collected to
# the driver, instead of converting the whole dataset to pandas and then
# discarding everything but the head.
scaledData.select('scaledFeatures', 'features').limit(5).toPandas()

## StandardScaler<br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">

<ul>

<li>The main idea is to standardize (mean = 0 and standard deviation = 1) your features before applying machine learning techniques</li>
<li>Usually a dataset contains variables that are different in scale. </li>
<li>For example, an Employee dataset may contain an AGE column with values in the range 20-70 and a SALARY column with values in the range 10000-80000</li>
<li>As these two columns are different in scale, they are Standardized to have common scale while building machine learning model</li>
</ul>
</span>

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Standardize every feature to zero mean and unit standard deviation.
# withMean=True is what performs the centering; with withMean=False the
# features are only divided by their standard deviation and keep a non-zero
# mean, which contradicts the goal stated for this section.
# Centering builds dense output vectors.
# NOTE: this rebinds `scaler`, which held the MinMaxScaler in the earlier cells.
scaler = StandardScaler(inputCol="features", outputCol="stdFeatures",
                        withStd=True, withMean=True)

In [None]:
# Fit the StandardScaler on the assembled dataset.
# NOTE: this rebinds `scalerModel`, which previously held the fitted MinMaxScaler model.
scalerModel = scaler.fit(sdf)

In [None]:
# Apply the fitted model; this adds the "stdFeatures" column.
# NOTE: this rebinds `scaledData`, which previously held the MinMaxScaler output.
scaledData = scalerModel.transform(sdf)

In [None]:
# Preview the first five rows of standardized vs. original features.
# limit(5) is applied before toPandas() so only those rows are collected to
# the driver, instead of converting the whole dataset to pandas and then
# discarding everything but the head.
scaledData.select('stdFeatures', 'features').limit(5).toPandas()

## Normalizer<br>
<span style="font-family:times, serif; font-size:14pt; font-weight:bold">

<ul>
    <li>Normalizer() rescales each sample (row) independently so that its feature vector has unit norm. For example, rescaling each company's stock price independently of the others</li>
    <li>Some stocks are more expensive than others. To account for this, we normalize it</li>
    <li>The Normalizer will separately transform each company's stock price to a relative scale</li>
</ul>
</span>

In [None]:
from pyspark.ml.feature import Normalizer

In [None]:
# Normalizer that reads "features" and writes "normFeatures".
# p=1.0 selects the L^1 norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)

In [None]:
# The Normalizer is applied directly with transform(); no fit step is used
l1NormData = normalizer.transform(sdf)

In [None]:
# Preview the first five rows of L^1-normalized vs. original features.
# limit(5) is applied before toPandas() so only those rows are collected to
# the driver, instead of converting the whole dataset to pandas and then
# discarding everything but the head.
print("Normalized using L^1 norm")
l1NormData.select('normFeatures', 'features').limit(5).toPandas()

## Train and Test Split

In [None]:
# Split the dataset 70/30 into training and test sets with a fixed seed,
# then report how many rows landed in each split
train, test = sdf.randomSplit([0.7, 0.3], seed=2345)

train_rows = train.count()
print("Training Dataset Count: " + str(train_rows))

test_rows = test.count()
print("Test Dataset Count: " + str(test_rows))