In [1]:
sc

<pyspark.context.SparkContext at 0x7f2a9e685790>

In [4]:
!rm -rf metastore_db/*.lck

from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

In [5]:
from pyspark.sql.functions import rand, randn
from pyspark.ml.feature import VectorAssembler

dfRandom = sqlc.range(0, 10).select("id") \
            .withColumn("uniform", rand(10)) \
            .withColumn("normal1", randn(10)) \
            .withColumn("normal2", randn(11))
            
assembler = VectorAssembler(inputCols = ["uniform","normal1","normal2"], outputCol = "features")

dfVec = assembler.transform(dfRandom)

## Data Normalization

### Normalizer

In [6]:
from pyspark.ml.feature import Normalizer

In [7]:
scaler1 = Normalizer().setInputCol("features").setOutputCol("scaledFeat").setP(1.0)

In [8]:
scaler1.transform(dfVec.select("id","features")).show(5)

+---+--------------------+--------------------+
| id|            features|          scaledFeat|
+---+--------------------+--------------------+
|  0|[0.41371264720975...|[0.32886636983701...|
|  1|[0.19829196382083...|[0.20619308493718...|
|  2|[0.12030715258495...|[0.15654175655718...|
|  3|[0.44292918521277...|[0.33788519635286...|
|  4|[0.88987842538862...|[0.32390519197407...|
+---+--------------------+--------------------+
only showing top 5 rows



### Standard Scaler

In [9]:
from pyspark.ml.feature import StandardScaler

In [10]:
scaler2 = StandardScaler().setInputCol("features").setOutputCol("scaledFeat").setWithStd(True).setWithMean(True)

In [11]:
scaler2Model = scaler2.fit(dfVec.select("id","features"))

In [12]:
scaler2Model.transform(dfVec.select("id","features")).toPandas()[:5]

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[-0.0321173330608, -0.400475290492, 0.01770640..."
1,1,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.795199539636, 0.116559162465, -0.38815037517]"
2,2,"[0.120307152585, -0.506853671746, -0.141369919...","[-1.07144423862, -0.274196160897, 0.204431268542]"
3,3,"[0.442929185213, -0.141369919356, -0.726587521...","[0.0713760731298, 0.296336216182, -0.744418595..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]","[1.65459922076, 2.02461325728, 1.87940775681]"


### MinMax Scaler

In [13]:
from pyspark.ml.feature import MinMaxScaler

In [14]:
scaler3 = MinMaxScaler().setInputCol("features").setOutputCol("scaledFeat").setMin(-1.0).setMax(1.0)

In [15]:
scaler3Model = scaler3.fit(dfVec.select("id","features"))

In [16]:
scaler3Model.transform(dfVec.select("id","features")).toPandas()[:5]

Unnamed: 0,id,features,scaledFeat
0,0,"[0.41371264721, -0.587748239674, -0.256535324205]","[-0.237483245559, -0.435578353707, -0.09866323..."
1,1,"[0.198291963821, -0.256535324205, -0.506853671...","[-0.797329203956, -0.12950974872, -0.338175289..."
2,2,"[0.120307152585, -0.506853671746, -0.141369919...","[-1.0, -0.360824965779, 0.0115304584941]"
3,3,"[0.442929185213, -0.141369919356, -0.726587521...","[-0.161553857247, -0.0230872236347, -0.5484231..."
4,4,"[0.889878425389, 0.965766508876, 0.891697335754]","[1.0, 1.0, 1.0]"
