## General Transformations
Import libraries

In [1]:
from pyspark.sql import SparkSession

# Obtain a session for the app named 'cloudanum' through the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('cloudanum')
    .getOrCreate()
)

## Import Data

In [2]:
import pandas as pd

IRIS_CSV_URL = "https://storage.googleapis.com/neurals/data/iris.csv"

# Read the CSV with pandas first, then hand the in-memory frame to Spark.
iris_pdf = pd.read_csv(IRIS_CSV_URL, header='infer')
iris = spark.createDataFrame(iris_pdf)

# View of the data without the "species" column, used by the cells that follow.
dataframe = iris.drop("species")
dataframe.show()


+-----------------+----------------+-----------------+----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|
+-----------------+----------------+-----------------+----------------+
|              5.1|             3.5|              1.4|             0.2|
|              4.9|             3.0|              1.4|             0.2|
|              4.7|             3.2|              1.3|             0.2|
|              4.6|             3.1|              1.5|             0.2|
|              5.0|             3.6|              1.4|             0.2|
|              5.4|             3.9|              1.7|             0.4|
|              4.6|             3.4|              1.4|             0.3|
|              5.0|             3.4|              1.5|             0.2|
|              4.4|             2.9|              1.4|             0.2|
|              4.9|             3.1|              1.5|             0.1|
|              5.4|             3.7|              1.5|          

## Vector Assembler

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns packed into the "features" vector. The remaining measurement column,
# "petal width (cm)", is kept and renamed to "label" below.
feature_cols = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Build from `iris` rather than from `dataframe`: this cell rebinds `dataframe`
# to a frame without the input columns, so reading `dataframe` here would make
# a second run of the cell fail. Sourcing from `iris` keeps the cell re-runnable
# with the same result every time.
output = assembler.transform(iris.drop("species"))
print(" Assembled to vector column 'features'")

# Keep only the label and the assembled vector.
dataframe = (
    output
    .drop(*feature_cols)
    .withColumnRenamed("petal width (cm)", "label")
)
dataframe.show()

 Assembled to vector column 'features'
+-----+-------------+
|label|     features|
+-----+-------------+
|  0.2|[5.1,3.5,1.4]|
|  0.2|[4.9,3.0,1.4]|
|  0.2|[4.7,3.2,1.3]|
|  0.2|[4.6,3.1,1.5]|
|  0.2|[5.0,3.6,1.4]|
|  0.4|[5.4,3.9,1.7]|
|  0.3|[4.6,3.4,1.4]|
|  0.2|[5.0,3.4,1.5]|
|  0.2|[4.4,2.9,1.4]|
|  0.1|[4.9,3.1,1.5]|
|  0.2|[5.4,3.7,1.5]|
|  0.2|[4.8,3.4,1.6]|
|  0.1|[4.8,3.0,1.4]|
|  0.1|[4.3,3.0,1.1]|
|  0.2|[5.8,4.0,1.2]|
|  0.4|[5.7,4.4,1.5]|
|  0.4|[5.4,3.9,1.3]|
|  0.3|[5.1,3.5,1.4]|
|  0.3|[5.7,3.8,1.7]|
|  0.3|[5.1,3.8,1.5]|
+-----+-------------+
only showing top 20 rows



## Normalizer
Normalizer is a Transformer which transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which specifies the p-norm used for normalization. (p=2 by default.) This normalization can help standardize your input data and improve the behavior of learning algorithms.

In [5]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

# Transformer configured for the L^1 norm (p=1.0): reads "features", writes "normFeatures".
normalizer = Normalizer(
    inputCol="features",
    outputCol="normFeatures",
    p=1.0,
)

l1NormData = normalizer.transform(dataframe)
print("Normalized using L^1 norm")
l1NormData.show()

# Reuse the same transformer for the L^inf norm by passing a param map to
# transform() that sets `p` to infinity.
linf_override = {normalizer.p: float("inf")}
lInfNormData = normalizer.transform(dataframe, linf_override)
print("Normalized using L^inf norm")
lInfNormData.show()

Normalized using L^1 norm
+-----+-------------+--------------------+
|label|     features|        normFeatures|
+-----+-------------+--------------------+
|  0.2|[5.1,3.5,1.4]|[0.51,0.35,0.1399...|
|  0.2|[4.9,3.0,1.4]|[0.52688172043010...|
|  0.2|[4.7,3.2,1.3]|[0.51086956521739...|
|  0.2|[4.6,3.1,1.5]|[0.5,0.3369565217...|
|  0.2|[5.0,3.6,1.4]|[0.5,0.36,0.13999...|
|  0.4|[5.4,3.9,1.7]|[0.49090909090909...|
|  0.3|[4.6,3.4,1.4]|[0.48936170212765...|
|  0.2|[5.0,3.4,1.5]|[0.50505050505050...|
|  0.2|[4.4,2.9,1.4]|[0.50574712643678...|
|  0.1|[4.9,3.1,1.5]|[0.51578947368421...|
|  0.2|[5.4,3.7,1.5]|[0.50943396226415...|
|  0.2|[4.8,3.4,1.6]|[0.48979591836734...|
|  0.1|[4.8,3.0,1.4]|[0.52173913043478...|
|  0.1|[4.3,3.0,1.1]|[0.51190476190476...|
|  0.2|[5.8,4.0,1.2]|[0.52727272727272...|
|  0.4|[5.7,4.4,1.5]|[0.49137931034482...|
|  0.4|[5.4,3.9,1.3]|[0.50943396226415...|
|  0.3|[5.1,3.5,1.4]|[0.51,0.35,0.1399...|
|  0.3|[5.7,3.8,1.7]|[0.50892857142857...|
|  0.3|[5.1,3.8,1.5]|[0.4903

## Standard Scaler
StandardScaler transforms a dataset of Vector rows, normalizing each feature to have unit standard deviation and/or zero mean. It takes parameters:

- `withStd`: True by default. Scales the data to unit standard deviation.
- `withMean`: False by default. Centers the data with mean before scaling. It will build a dense output, so take care when applying to sparse input.

StandardScaler is an Estimator which can be fit on a dataset to produce a StandardScalerModel; this amounts to computing summary statistics. The model can then transform a Vector column in a dataset to have unit standard deviation and/or zero mean features.

Note that if the standard deviation of a feature is zero, it will return default 0.0 value in the Vector for that feature.

In [7]:
from pyspark.ml.feature import StandardScaler

# Alias for the label/features frame used by this cell.
dataFrame = dataframe

# Scale to unit standard deviation only; mean-centering is switched off.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Fitting computes the summary statistics and yields the fitted model.
scalerModel = scaler.fit(dataFrame)

# Apply the fitted model; the result is written to "scaledFeatures".
scaledData = scalerModel.transform(dataFrame)
scaledData.show()

+-----+-------------+--------------------+
|label|     features|      scaledFeatures|
+-----+-------------+--------------------+
|  0.2|[5.1,3.5,1.4]|[6.15892840883879...|
|  0.2|[4.9,3.0,1.4]|[5.91740180457060...|
|  0.2|[4.7,3.2,1.3]|[5.67587520030241...|
|  0.2|[4.6,3.1,1.5]|[5.55511189816832...|
|  0.2|[5.0,3.6,1.4]|[6.03816510670469...|
|  0.4|[5.4,3.9,1.7]|[6.52121831524107...|
|  0.3|[4.6,3.4,1.4]|[5.55511189816832...|
|  0.2|[5.0,3.4,1.5]|[6.03816510670469...|
|  0.2|[4.4,2.9,1.4]|[5.31358529390013...|
|  0.1|[4.9,3.1,1.5]|[5.91740180457060...|
|  0.2|[5.4,3.7,1.5]|[6.52121831524107...|
|  0.2|[4.8,3.4,1.6]|[5.79663850243651...|
|  0.1|[4.8,3.0,1.4]|[5.79663850243651...|
|  0.1|[4.3,3.0,1.1]|[5.19282199176604...|
|  0.2|[5.8,4.0,1.2]|[7.00427152377745...|
|  0.4|[5.7,4.4,1.5]|[6.88350822164335...|
|  0.4|[5.4,3.9,1.3]|[6.52121831524107...|
|  0.3|[5.1,3.5,1.4]|[6.15892840883879...|
|  0.3|[5.7,3.8,1.7]|[6.88350822164335...|
|  0.3|[5.1,3.8,1.5]|[6.15892840883879...|
+-----+----

## Bucketizer
Bucketizer transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter:

- `splits`: Parameter for mapping continuous features into buckets. With n+1 splits, there are n buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last bucket, which also includes y. Splits should be strictly increasing. Values at -inf, inf must be explicitly provided to cover all Double values; otherwise, values outside the splits specified will be treated as errors. Two examples of splits are Array(Double.NegativeInfinity, 0.0, 1.0, Double.PositiveInfinity) and Array(0.0, 1.0, 2.0) (in Python: `[-float("inf"), 0.0, 1.0, float("inf")]` and `[0.0, 1.0, 2.0]`).
    
Note that if you have no idea of the upper and lower bounds of the targeted column, you should add Double.NegativeInfinity and Double.PositiveInfinity as the bounds of your splits to prevent a potential out of Bucketizer bounds exception.

Note also that the splits that you provided have to be in strictly increasing order, i.e. s0 < s1 < s2 < ... < sn.

More details can be found in the API docs for Bucketizer.

In [10]:
from pyspark.ml.feature import Bucketizer

# Bucket boundaries: n+1 split points define n buckets.
splits = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10]

bucketizer = Bucketizer(
    inputCol="sepal length (cm)",
    outputCol="bucketedFeatures",
    splits=splits,
)

# Map each sepal length in the original data to the index of its bucket.
bucketedData = bucketizer.transform(iris)

bucket_count = len(bucketizer.getSplits()) - 1
print("Bucketizer output with %d buckets" % bucket_count)
bucketedData.show()

Bucketizer output with 6 buckets
+-----------------+----------------+-----------------+----------------+-----------+----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|    species|bucketedFeatures|
+-----------------+----------------+-----------------+----------------+-----------+----------------+
|              5.1|             3.5|              1.4|             0.2|Iris-setosa|             5.0|
|              4.9|             3.0|              1.4|             0.2|Iris-setosa|             4.0|
|              4.7|             3.2|              1.3|             0.2|Iris-setosa|             4.0|
|              4.6|             3.1|              1.5|             0.2|Iris-setosa|             4.0|
|              5.0|             3.6|              1.4|             0.2|Iris-setosa|             5.0|
|              5.4|             3.9|              1.7|             0.4|Iris-setosa|             5.0|
|              4.6|             3.4|              1.4|    

## Imputer
The Imputer estimator completes missing values in a dataset, using the mean, median or mode of the columns in which the missing values are located. The input columns should be of numeric type. Currently Imputer does not support categorical features and possibly creates incorrect values for columns containing categorical features. Imputer can impute custom values other than ‘NaN’ by .setMissingValue(custom_value). For example, .setMissingValue(0) will impute all occurrences of (0).

In [11]:
from pyspark.ml.feature import Imputer

# Small two-column sample with NaN entries marking the missing values.
missing = float("nan")
sample_rows = [
    (1.0, missing),
    (2.0, missing),
    (missing, 3.0),
    (4.0, 4.0),
    (5.0, 5.0),
]
df = spark.createDataFrame(sample_rows, ["a", "b"])

# No strategy is passed, so the Imputer's default strategy applies.
imputer = Imputer(
    inputCols=["a", "b"],
    outputCols=["out_a", "out_b"],
)
model = imputer.fit(df)

# The filled-in values are written to out_a / out_b next to the original columns.
model.transform(df).show()

+---+---+-----+-----+
|  a|  b|out_a|out_b|
+---+---+-----+-----+
|1.0|NaN|  1.0|  4.0|
|2.0|NaN|  2.0|  4.0|
|NaN|3.0|  3.0|  3.0|
|4.0|4.0|  4.0|  4.0|
|5.0|5.0|  5.0|  5.0|
+---+---+-----+-----+



## To Do:
Apply the following transformations on the dataframe below:

1- Vector Assembler

2- Normalizer

3- Standard Scaler

4- Bucketizer
    

In [12]:
# Exercise data as (id, feature values, label); the values become dense vectors below.
exercise_rows = [
    (1, [1.7, 4.4, 7.6, 5.8, 9.6, 2.3], 3.0),
    (2, [8.8, 7.3, 5.7, 7.3, 2.2, 4.1], 2.0),
    (3, [1.2, 9.5, 2.5, 3.1, 8.7, 2.5], 3.0),
    (4, [3.7, 9.2, 6.1, 4.1, 7.5, 3.8], 2.0),
    (5, [8.9, 5.2, 7.8, 8.3, 5.2, 3.0], 4.0),
    (6, [7.9, 8.5, 9.2, 4.0, 9.4, 2.1], 4.0),
]

df = spark.createDataFrame(
    [(row_id, Vectors.dense(values), label) for row_id, values, label in exercise_rows],
    ["id", "features", "label"],
)