In [1]:
import pyspark
# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once". On a fresh kernel this still
# starts a new local context that uses every available core ('local[*]').
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
# Delete any *.lck files under metastore_db before the SQL context is created.
# NOTE(review): presumably these are lock files left behind by an earlier
# session that was not shut down cleanly -- confirm.
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
# SQL entry point bound to the SparkContext `sc`; the cells below use it to
# build DataFrames.
sqlc = SQLContext(sc)

In [3]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

# Ten rows (id 0-9) with one seeded uniform column and two seeded Gaussian
# columns.
# NOTE(review): in the recorded output several normal2 values reappear as the
# normal1 value of a later row (e.g. row 0's normal2 equals row 1's normal1),
# so the two "normal" columns are not independent samples. The adjacent seeds
# 10 and 11 look like the cause -- consider well-separated seeds and re-running.
dfRandom = (
    sqlc.range(0, 10)
    .select("id")
    .withColumn("uniform", rand(10))
    .withColumn("normal1", randn(10))
    .withColumn("normal2", randn(11))
)

# Pack the three numeric columns into one vector column named "features",
# which is the input column every scaler below reads.
assembler = VectorAssembler(
    inputCols=["uniform", "normal1", "normal2"],
    outputCol="features",
)

dfVec = assembler.transform(dfRandom)

In [5]:
# Convert the generated rows to a pandas DataFrame so the notebook renders
# them as a table.
dfRandom.toPandas()

Unnamed: 0,id,uniform,normal1,normal2
0,0,0.413713,-0.587748,-0.256535
1,1,0.198292,-0.256535,-0.506854
2,2,0.120307,-0.506854,-0.14137
3,3,0.442929,-0.14137,-0.726588
4,4,0.889878,0.965767,0.891697
5,5,0.273107,-0.726588,-1.198539
6,6,0.870794,-1.198539,-0.117111
7,7,0.271493,-0.117111,0.304946
8,8,0.603714,0.304946,0.039339
9,9,0.143567,-1.048001,-0.963547


In [6]:
# Same rows as above, now with the assembled "features" vector column added.
dfVec.toPandas()

Unnamed: 0,id,uniform,normal1,normal2,features
0,0,0.413713,-0.587748,-0.256535,"[0.41371264721, -0.587748239674, -0.256535324205]"
1,1,0.198292,-0.256535,-0.506854,"[0.198291963821, -0.256535324205, -0.506853671..."
2,2,0.120307,-0.506854,-0.14137,"[0.120307152585, -0.506853671746, -0.141369919..."
3,3,0.442929,-0.14137,-0.726588,"[0.442929185213, -0.141369919356, -0.726587521..."
4,4,0.889878,0.965767,0.891697,"[0.889878425389, 0.965766508876, 0.891697335754]"
5,5,0.273107,-0.726588,-1.198539,"[0.273107306848, -0.726587521995, -1.19853855262]"
6,6,0.870794,-1.198539,-0.117111,"[0.870793547001, -1.19853855262, -0.117110926001]"
7,7,0.271493,-0.117111,0.304946,"[0.271493317932, -0.117110926001, 0.304945613282]"
8,8,0.603714,0.304946,0.039339,"[0.603714357844, 0.304945613282, 0.0393394905131]"
9,9,0.143567,-1.048001,-0.963547,"[0.143566883898, -1.04800065723, -0.963546696012]"


## Data Normalization

### Normalizer

In [7]:

from pyspark.ml.feature import Normalizer

In [8]:
# Normalizer works on each feature vector on its own; p=1.0 selects the L1
# norm, so every output vector's absolute values sum to 1.
scaler1 = Normalizer(p=1.0, inputCol="features", outputCol="scaledFeat")

In [9]:
# Normalizer is applied directly (no fit step is used here); print the first
# five rows, with long vectors truncated by show().
scaler1.transform(dfVec.select("id","features")).show(5)

+---+--------------------+--------------------+
| id|            features|          scaledFeat|
+---+--------------------+--------------------+
|  0|[0.41371264720975...|[0.32886636983701...|
|  1|[0.19829196382083...|[0.20619308493718...|
|  2|[0.12030715258495...|[0.15654175655718...|
|  3|[0.44292918521277...|[0.33788519635286...|
|  4|[0.88987842538862...|[0.32390519197407...|
+---+--------------------+--------------------+
only showing top 5 rows



In [10]:
# Same transformation as the previous cell, rendered as a pandas table with
# all ten rows.
scaler1.transform(dfVec.select("id","features")).toPandas()

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[0.328866369837, -0.467209864778, -0.203923765..."
1,1,"[0.198291963821, -0.256535324205, -0.506853671...","[0.206193084937, -0.266757204246, -0.527049710..."
2,2,"[0.120307152585, -0.506853671746, -0.141369919...","[0.156541756557, -0.659509949224, -0.183948294..."
3,3,"[0.442929185213, -0.141369919356, -0.726587521...","[0.337885196353, -0.107842979317, -0.55427182433]"
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]","[0.323905191974, 0.351527554253, 0.324567253773]"
5,5,"[0.273107306848, -0.726587521995, -1.19853855262]","[0.124239450256, -0.330532475816, -0.545228073..."
6,6,"[0.870793547001, -1.19853855262, -0.117110926001]","[0.39826948921, -0.548168206797, -0.0535623039..."
7,7,"[0.271493317932, -0.117110926001, 0.304945613282]","[0.391454651901, -0.168857256307, 0.439688091793]"
8,8,"[0.603714357844, 0.304945613282, 0.0393394905131]","[0.636829853047, 0.321672770525, 0.041497376428]"
9,9,"[0.143566883898, -1.04800065723, -0.963546696012]","[0.0666168323812, -0.486285431727, -0.44709773..."


### Standard Scaler

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Standardise with both options switched on: centring (withMean) and scaling
# by the standard deviation (withStd).
scaler2 = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeat",
)

In [None]:
# Fit the scaler on the feature vectors; the fitted model returned here is
# what performs the transformation in the next cell.
scaler2Model = scaler2.fit(dfVec.select("id","features"))

In [None]:
# Preview the first five standardised rows. limit(5) trims the result inside
# Spark, so only those rows are collected to the driver; the earlier
# toPandas()[:5] collected the whole DataFrame and then discarded the rest.
scaler2Model.transform(dfVec.select("id", "features")).limit(5).toPandas()

### MinMax Scaler

In [None]:
from pyspark.ml.feature import MinMaxScaler

In [None]:
# Rescale the features to the target range [-1.0, 1.0].
scaler3 = MinMaxScaler(
    min=-1.0,
    max=1.0,
    inputCol="features",
    outputCol="scaledFeat",
)

In [None]:
# Fit the min-max scaler on the feature vectors; the fitted model returned
# here is what performs the transformation in the next cell.
scaler3Model = scaler3.fit(dfVec.select("id","features"))

In [None]:
# Preview the first five min-max scaled rows. limit(5) trims the result inside
# Spark, so only those rows are collected to the driver; the earlier
# toPandas()[:5] collected the whole DataFrame and then discarded the rest.
scaler3Model.transform(dfVec.select("id", "features")).limit(5).toPandas()

In [None]:
# Stop the SparkContext `sc` now that the notebook is finished with it.
sc.stop()