In [1]:
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

# Obtain the Spark entry points in a way that is safe to re-run.
# Calling SparkContext() directly raises "Cannot run multiple SparkContexts
# at once" when this cell is executed a second time (or when the kernel
# already has a context); getOrCreate() returns the existing session instead.
spark = SparkSession.builder.getOrCreate()
# Keep `sc` defined for any cell that works with the low-level context.
sc = spark.sparkContext

## Example Data

In [3]:
import warnings
# The filter is installed before pandas is imported, as in the original cell.
warnings.simplefilter('ignore')
import pandas as pd

# Small example dataset written one row per tuple (six rows, six columns).
example_columns = ['x1', 'x2', 'x3', 'x4', 'y1', 'y2']
example_rows = [
    ('a', 'apple',  1, 2.4, 1, 'yes'),
    ('a', 'orange', 1, 2.5, 0, 'no'),
    ('b', 'orange', 2, 3.5, 1, 'no'),
    ('b', 'orange', 2, 1.4, 0, 'yes'),
    ('b', 'peach',  2, 2.1, 0, 'yes'),
    ('c', 'peach',  4, 1.5, 1, 'yes'),
]
pdf = pd.DataFrame(example_rows, columns=example_columns)

# Hand the pandas frame to Spark and print the resulting Spark DataFrame.
df = spark.createDataFrame(pdf)
df.show()

+---+------+---+---+---+---+
| x1|    x2| x3| x4| y1| y2|
+---+------+---+---+---+---+
|  a| apple|  1|2.4|  1|yes|
|  a|orange|  1|2.5|  0| no|
|  b|orange|  2|3.5|  1| no|
|  b|orange|  2|1.4|  0|yes|
|  b| peach|  2|2.1|  0|yes|
|  c| peach|  4|1.5|  1|yes|
+---+------+---+---+---+---+



## VectorAssembler

To fit an ML model, all the feature columns must be combined into a single column of vectors: the **featureCol**.
<br> The *VectorAssembler* combines multiple **one-hot-encoded** columns and **other continuous columns** into that single **featureCol** column.

### StringIndex and OneHotEncode categorical columns


In [4]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

In [6]:
# Columns to index and one-hot encode (x3 holds integers but is treated
# the same way as the string columns here).
categorical_cols = ['x1', 'x2', 'x3']

# One StringIndexer per column: <col> -> index_<col>.
indexer_stages = [
    StringIndexer(inputCol=name, outputCol='index_' + name)
    for name in categorical_cols
]
# One OneHotEncoder per column: index_<col> -> ohe_<col>.
encoder_stages = [
    OneHotEncoder(inputCol='index_' + name, outputCol='ohe_' + name)
    for name in categorical_cols
]

# Indexers are listed before the encoders, which read the index_* columns
# that the indexers produce.
all_stages = indexer_stages + encoder_stages
all_stages

[StringIndexer_c785569b18c4,
 StringIndexer_bdf5a9f34806,
 StringIndexer_9edd16816800,
 OneHotEncoder_71b8df73d9e3,
 OneHotEncoder_50437fcbcc22,
 OneHotEncoder_c8150493c023]

In [8]:
# Build the pipeline from the prepared stages, fit it on df, then apply the
# fitted pipeline to the same df to append the index_* and ohe_* columns.
encoding_pipeline = Pipeline(stages=all_stages)
fitted_pipeline = encoding_pipeline.fit(df)
df_new = fitted_pipeline.transform(df)
df_new.show()

+---+------+---+---+---+---+--------+--------+--------+-------------+-------------+-------------+
| x1|    x2| x3| x4| y1| y2|index_x1|index_x2|index_x3|       ohe_x1|       ohe_x2|       ohe_x3|
+---+------+---+---+---+---+--------+--------+--------+-------------+-------------+-------------+
|  a| apple|  1|2.4|  1|yes|     1.0|     2.0|     1.0|(2,[1],[1.0])|    (2,[],[])|(2,[1],[1.0])|
|  a|orange|  1|2.5|  0| no|     1.0|     0.0|     1.0|(2,[1],[1.0])|(2,[0],[1.0])|(2,[1],[1.0])|
|  b|orange|  2|3.5|  1| no|     0.0|     0.0|     0.0|(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|
|  b|orange|  2|1.4|  0|yes|     0.0|     0.0|     0.0|(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|
|  b| peach|  2|2.1|  0|yes|     0.0|     1.0|     0.0|(2,[0],[1.0])|(2,[1],[1.0])|(2,[0],[1.0])|
|  c| peach|  4|1.5|  1|yes|     2.0|     1.0|     2.0|    (2,[],[])|(2,[1],[1.0])|    (2,[],[])|
+---+------+---+---+---+---+--------+--------+--------+-------------+-------------+-------------+



### Assemble feature columns into one *featureCol* with VectorAssembler 


In [9]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Inputs for the assembler: the three one-hot columns plus the numeric x4.
feature_inputs = ['ohe_x1', 'ohe_x2', 'ohe_x3', 'x4']
assembler = VectorAssembler(inputCols=feature_inputs, outputCol='featureCol')

# Append featureCol, then discard the intermediate index_* columns.
df_with_features = assembler.transform(df_new)
df_assembled = df_with_features.drop('index_x1', 'index_x2', 'index_x3')
df_assembled.show(truncate=False)

+---+------+---+---+---+---+-------------+-------------+-------------+-----------------------------+
|x1 |x2    |x3 |x4 |y1 |y2 |ohe_x1       |ohe_x2       |ohe_x3       |featureCol                   |
+---+------+---+---+---+---+-------------+-------------+-------------+-----------------------------+
|a  |apple |1  |2.4|1  |yes|(2,[1],[1.0])|(2,[],[])    |(2,[1],[1.0])|(7,[1,5,6],[1.0,1.0,2.4])    |
|a  |orange|1  |2.5|0  |no |(2,[1],[1.0])|(2,[0],[1.0])|(2,[1],[1.0])|[0.0,1.0,1.0,0.0,0.0,1.0,2.5]|
|b  |orange|2  |3.5|1  |no |(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,1.0,0.0,3.5]|
|b  |orange|2  |1.4|0  |yes|(2,[0],[1.0])|(2,[0],[1.0])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,1.0,0.0,1.4]|
|b  |peach |2  |2.1|0  |yes|(2,[0],[1.0])|(2,[1],[1.0])|(2,[0],[1.0])|[1.0,0.0,0.0,1.0,1.0,0.0,2.1]|
|c  |peach |4  |1.5|1  |yes|(2,[],[])    |(2,[1],[1.0])|(2,[],[])    |(7,[3,6],[1.0,1.5])          |
+---+------+---+---+---+---+-------------+-------------+-------------+-----------------------------+

### Convert *sparse vectors* in featureCol into *Dense vectors*

In [13]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.ml.linalg import DenseVector,SparseVector

In [14]:
def _feature_vector(row):
    """Return the value stored in the row's 'featureCol' field."""
    return row['featureCol']


# Peek at the first four feature vectors exactly as they are stored.
df_assembled.rdd.map(_feature_vector).take(4)

[SparseVector(7, {1: 1.0, 5: 1.0, 6: 2.4}),
 DenseVector([0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 2.5]),
 DenseVector([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 3.5]),
 DenseVector([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.4])]

In [15]:
# Expand each feature vector (sparse or dense) into a plain Python list.
# toArray() gives a numpy array; .tolist() converts its elements to native
# Python floats, whereas list(array) would leave numpy.float64 scalars that
# display as np.float64(...) under NumPy 2.
df_assembled.rdd.map(lambda row: row['featureCol'].toArray().tolist()).take(4)

[[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.4],
 [0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 2.5],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 3.5],
 [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.4]]