In [131]:
# Optional one-time environment setup: uncomment and run these lines only if
# Java (openjdk), PySpark, or findspark is missing from the conda environment.
# %conda install -y openjdk
# %conda install -y pyspark
# %conda install -y -c conda-forge findspark

In [132]:
from pyspark.sql import SparkSession
from sklearn.datasets import fetch_california_housing
from pyspark.ml.feature import RobustScaler
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
import pandas as pd

In [133]:
# Create the SparkSession used by every cell below, or reuse one that is
# already running in this kernel
spark = (
    SparkSession.builder
    .appName("RobustScaler Example")
    .getOrCreate()
)

In [134]:
# Load the California housing dataset and print its bundled description
housing = fetch_california_housing()
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [135]:
# Build a pandas frame (feature columns plus the 'target' label), then hand it
# to Spark and confirm the inferred schema
df_pandas = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(target=housing.target)
df = spark.createDataFrame(df_pandas)
df.printSchema()

root
 |-- MedInc: double (nullable = true)
 |-- HouseAge: double (nullable = true)
 |-- AveRooms: double (nullable = true)
 |-- AveBedrms: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- AveOccup: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- target: double (nullable = true)



In [136]:
# Report how many columns the Spark DataFrame has, then preview five rows
column_count = len(df.columns)
print("Number of columns:", column_count)
df.show(n=5)

Number of columns: 9
+------+--------+------------------+------------------+----------+------------------+--------+---------+------+
|MedInc|HouseAge|          AveRooms|         AveBedrms|Population|          AveOccup|Latitude|Longitude|target|
+------+--------+------------------+------------------+----------+------------------+--------+---------+------+
|8.3252|    41.0| 6.984126984126984|1.0238095238095237|     322.0|2.5555555555555554|   37.88|  -122.23| 4.526|
|8.3014|    21.0| 6.238137082601054|0.9718804920913884|    2401.0| 2.109841827768014|   37.86|  -122.22| 3.585|
|7.2574|    52.0| 8.288135593220339| 1.073446327683616|     496.0|2.8022598870056497|   37.85|  -122.24| 3.521|
|5.6431|    52.0|5.8173515981735155|1.0730593607305936|     558.0| 2.547945205479452|   37.85|  -122.25| 3.413|
|3.8462|    52.0| 6.281853281853282|1.0810810810810811|     565.0|2.1814671814671813|   37.85|  -122.25| 3.422|
+------+--------+------------------+------------------+----------+-----------------

In [137]:
# Pack the individual feature columns into one vector column, which is the
# input format the scalers below consume
features = housing.feature_names
va = VectorAssembler(inputCols=features, outputCol='features')

# Keep only the assembled vector and the label
va_df = va.transform(df).select('features', 'target')
va_df.show(n=5)

+--------------------+------+
|            features|target|
+--------------------+------+
|[8.3252,41.0,6.98...| 4.526|
|[8.3014,21.0,6.23...| 3.585|
|[7.2574,52.0,8.28...| 3.521|
|[5.6431,52.0,5.81...| 3.413|
|[3.8462,52.0,6.28...| 3.422|
+--------------------+------+
only showing top 5 rows



MinMaxScaler is one of the most popular scaling algorithms. It transforms features by linearly rescaling each feature to a given range, which is generally [0, 1], or [-1, 1] when the data contains negative values.

In [138]:
# Linearly rescale every feature to [0, 1]; min/max are the defaults,
# written out here so the target range is visible
mms = MinMaxScaler(
    inputCol='features',
    outputCol='scaled',
    min=0.0,
    max=1.0,
)
mms_model = mms.fit(va_df)
df_mms = mms_model.transform(va_df)
df_mms.show(n=5)

+--------------------+------+--------------------+
|            features|target|              scaled|
+--------------------+------+--------------------+
|[8.3252,41.0,6.98...| 4.526|[0.53966841836664...|
|[8.3014,21.0,6.23...| 3.585|[0.53802706169570...|
|[7.2574,52.0,8.28...| 3.521|[0.46602805478545...|
|[5.6431,52.0,5.81...| 3.413|[0.35469855588198...|
|[3.8462,52.0,6.28...| 3.422|[0.23077612722583...|
+--------------------+------+--------------------+
only showing top 5 rows



In [139]:
# Print the min-max scaled vectors at full width (20 rows, the default)
df_mms.select(['scaled']).show(n=20, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled                                                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.5396684183666434,0.7843137254901961,0.0435123021381201,0.020468661773009596,0.008940833543541018,0.0014994256786964625,0.5674814027630182,0.21115537848605498] |
|[0.5380270616957008,0.39215686274509803,0.03822395413503622,0.018929263599547087,0.06721040387903249,0.0011407430675557137,0.5653560042507968,0.21215139442231046]|
|[0.46602805478545123,1.0,0.052756463901968496,0.021940108528170434,0.013817651840017937,0.001697957938906389,0.5642933049946866,0.21015936254980092]              |
|[0.354698

RobustScaler scales features in a way that is robust to outliers. Its approach is similar to MinMaxScaler, but it uses the interquartile range (rather than the min-max range used by MinMaxScaler). The algorithm can remove the median and scale the data according to the quantile range. Note that with Spark's default settings only the scaling step is applied (`withCentering=False`), which is why the scaled values below are not centered around zero.

In [140]:
# Divide every feature by its interquartile range. The defaults are written
# out: scaling is on, median-centering is off (so values are not shifted).
rs = RobustScaler(
    inputCol='features',
    outputCol='scaled',
    withCentering=False,
    withScaling=True,
)
rs_model = rs.fit(va_df)
df_rs = rs_model.transform(va_df)
df_rs.show(n=5)

+--------------------+------+--------------------+
|            features|target|              scaled|
+--------------------+------+--------------------+
|[8.3252,41.0,6.98...| 4.526|[3.82574330223794...|
|[8.3014,21.0,6.23...| 3.585|[3.81480630485731...|
|[7.2574,52.0,8.28...| 3.521|[3.33504894076559...|
|[5.6431,52.0,5.81...| 3.413|[2.59321722347318...|
|[3.8462,52.0,6.28...| 3.422|[1.76747392123523...|
+--------------------+------+--------------------+
only showing top 5 rows



In [141]:
# Print the robust-scaled vectors at full width (20 rows, the default)
df_rs.select(['scaled']).show(n=20, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|scaled                                                                                                                                                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|[3.825743302237949,2.1578947368421053,4.337302476446551,10.973171146106555,0.34401709401709407,3.0019800267129773,10.021164021164019,-32.25065963060693]|
|[3.814806304857313,1.1052631578947367,3.8740256983114834,10.41659676460011,2.5651709401709404,2.478405532102114,10.015873015873012,-32.248021108179486] |
|[3.33504894076559,2.7368421052631575,5.147121625265366,11.505177472859062,0.5299145299145299,3.291780604089101,10.01322751322751,-32.253298153034365]   |
|[2.593217223473186,2.7368421052631575,3.6127082956055734,11.501029968