In [10]:
# This cell is not needed when this jupyter notebook is running on a Sagemaker instance
# This is only needed when running it on local laptop
import pyspark
from pyspark.sql import SparkSession
# Build (or reuse) a local Spark session that runs on all available CPU cores
# and binds the driver to the loopback address.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Demo")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)
print(f"Spark Version: {spark.version}")

Spark Version: 3.0.1


In [53]:
from pyspark.sql.functions import col, split, mean
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
import numpy as np

# Drop Missing Values

In [40]:
# Build a small example frame; the last row has no Color value.
df = spark.createDataFrame(
    data=[(0, "Red"), (1, "Blue"), (2, "Green"), (3, None)],
    schema=["id", "Color"],
)
df.show()

+---+-----+
| id|Color|
+---+-----+
|  0|  Red|
|  1| Blue|
|  2|Green|
|  3| null|
+---+-----+



Drop rows that have a NULL value in ANY column

In [41]:
# how="any": a row is removed as soon as one of its columns is NULL.
df.na.drop(how="any").show(truncate=False)

+---+-----+
|id |Color|
+---+-----+
|0  |Red  |
|1  |Blue |
|2  |Green|
+---+-----+



Drop rows only when ALL of their columns are NULL (no row in this frame qualifies, so nothing is dropped)

In [43]:
# how="all": a row is removed only when every one of its columns is NULL.
df.na.drop(how="all").show(truncate=False)

+---+-----+
|id |Color|
+---+-----+
|0  |Red  |
|1  |Blue |
|2  |Green|
|3  |null |
+---+-----+



Drop rows with NULL values by using `dropna()` (an alias of `na.drop()`, which defaults to `how="any"`)

In [42]:
# dropna() with its default mode (how="any") spelled out explicitly.
df.dropna(how="any").show(truncate=False)

+---+-----+
|id |Color|
+---+-----+
|0  |Red  |
|1  |Blue |
|2  |Green|
+---+-----+



# Fill the NULL Values

In [44]:
# Sample frame whose last row is missing its Color value.
df = spark.createDataFrame(
    data=[
        (0, "Red"),
        (1, "Blue"),
        (2, "Green"),
        (3, None),
    ],
    schema=["id", "Color"],
)
df.show()

+---+-----+
| id|Color|
+---+-----+
|  0|  Red|
|  1| Blue|
|  2|Green|
|  3| null|
+---+-----+



Fill the null values with "NA"

In [45]:
# Replace NULLs with the literal string "NA".
df.na.fill(value="NA").show()

+---+-----+
| id|Color|
+---+-----+
|  0|  Red|
|  1| Blue|
|  2|Green|
|  3|   NA|
+---+-----+



Fill the null values with mean or average

In [46]:
# Sample frame whose last row is missing its numeric id.
df = spark.createDataFrame(
    data=[(0, "Red"), (1, "Blue"), (2, "Green"), (None, "Yellow")],
    schema=["id", "Color"],
)
df.show()

+----+------+
|  id| Color|
+----+------+
|   0|   Red|
|   1|  Blue|
|   2| Green|
|null|Yellow|
+----+------+



In [50]:
# Average of the id column, collected to the driver as a list holding one Row.
mean_val = df.select(mean("id")).collect()
# Use that average as the replacement value for NULLs in the id column only.
# NOTE(review): id is an integer column, so a fractional average would
# presumably be truncated when filled in — confirm before reusing this pattern.
df.fillna(mean_val[0][0], subset=["id"]).show()

+---+------+
| id| Color|
+---+------+
|  0|   Red|
|  1|  Blue|
|  2| Green|
|  1|Yellow|
+---+------+



# One-hot Encoding

In [16]:
# Four rows, each with a distinct Color, used as input for one-hot encoding.
df = spark.createDataFrame(
    data=[(0, "Red"), (1, "Blue"), (2, "Green"), (3, "White")],
    schema=["id", "Color"],
)
df.show()

+---+-----+
| id|Color|
+---+-----+
|  0|  Red|
|  1| Blue|
|  2|Green|
|  3|White|
+---+-----+



In [17]:
# One-hot encode Color with CountVectorizer: split every Color string on
# spaces into a token array, then count tokens against a vocabulary capped
# at 4 terms.
df = df.withColumn("Color_Array", split(df["Color"], " "))
colorVectorizer = CountVectorizer(
    inputCol="Color_Array",
    outputCol="Color_OneHotEncoded",
    vocabSize=4,
    minDF=1.0,
)
colorVectorizer_model = colorVectorizer.fit(df)
df_ohe = colorVectorizer_model.transform(df)
df_ohe.show(truncate=False)

+---+-----+-----------+-------------------+
|id |Color|Color_Array|Color_OneHotEncoded|
+---+-----+-----------+-------------------+
|0  |Red  |[Red]      |(4,[3],[1.0])      |
|1  |Blue |[Blue]     |(4,[1],[1.0])      |
|2  |Green|[Green]    |(4,[0],[1.0])      |
|3  |White|[White]    |(4,[2],[1.0])      |
+---+-----+-----------+-------------------+



In [21]:
# Convert the one-hot encoded column into a 2-D numpy array.
#
# collect() returns one Row per record, each wrapping a single vector, so the
# raw array presumably has shape (rows, 1, vocabulary size).
x_3d = np.array(df_ohe.select("Color_OneHotEncoded").collect())
# Flatten each record into one row. The shape is derived from the data instead
# of being hard-coded to 4 x 4, so the cell keeps working when the number of
# rows or the vocabulary size changes.
X = x_3d.reshape(x_3d.shape[0], -1)
X

array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]])

# Label Index Encoding

In [22]:
# Four rows, each with a distinct Color, used as input for label indexing.
df = spark.createDataFrame(
    data=[(0, "Red"), (1, "Blue"), (2, "Green"), (3, "White")],
    schema=["id", "Color"],
)
df.show()

+---+-----+
| id|Color|
+---+-----+
|  0|  Red|
|  1| Blue|
|  2|Green|
|  3|White|
+---+-----+



In [25]:
# Map each distinct Color string to a numeric category index.
labelIndexer = StringIndexer(
    inputCol="Color",
    outputCol="Color_Index",
)
labelIndexer_model = labelIndexer.fit(df)
df_lie = labelIndexer_model.transform(df)
df_lie.show(truncate=False)

+---+-----+-----------+
|id |Color|Color_Index|
+---+-----+-----------+
|0  |Red  |2.0        |
|1  |Blue |0.0        |
|2  |Green|1.0        |
|3  |White|3.0        |
+---+-----+-----------+



# Feature Assembler

In [26]:
# Sample frame plus a Color_Array column holding Color split on spaces.
df = spark.createDataFrame(
    data=[(0, "Red"), (1, "Blue"), (2, "Green"), (3, "White")],
    schema=["id", "Color"],
).withColumn("Color_Array", split(col("Color"), " "))
df.show()

+---+-----+-----------+
| id|Color|Color_Array|
+---+-----+-----------+
|  0|  Red|      [Red]|
|  1| Blue|     [Blue]|
|  2|Green|    [Green]|
|  3|White|    [White]|
+---+-----+-----------+



In [33]:
# Stage 1: CountVectorizer turns Color_Array into a one-hot style vector.
colorVectorizer = CountVectorizer(
    inputCol="Color_Array",
    outputCol="Color_OneHotEncoded",
    vocabSize=4,
    minDF=1.0,
)
colorVectorizer_model = colorVectorizer.fit(df)
df_ohe = colorVectorizer_model.transform(df)

# Stage 2: StringIndexer adds a numeric Color_Index column; it is fitted on
# and applied to the stage-1 output so both encodings end up in one frame.
labelIndexer = StringIndexer(
    inputCol="Color",
    outputCol="Color_Index",
)
labelIndexer_model = labelIndexer.fit(df_ohe)
df_lie = labelIndexer_model.transform(df_ohe)

In [35]:
# Concatenate the one-hot vector and the label index into a single
# "features" vector column.
vecAssembler = VectorAssembler(
    inputCols=["Color_OneHotEncoded", "Color_Index"],
    outputCol="features",
)
df_va = vecAssembler.transform(df_lie)
df_va.show(truncate=False)

+---+-----+-----------+-------------------+-----------+-------------------+
|id |Color|Color_Array|Color_OneHotEncoded|Color_Index|features           |
+---+-----+-----------+-------------------+-----------+-------------------+
|0  |Red  |[Red]      |(4,[0],[1.0])      |2.0        |(5,[0,4],[1.0,2.0])|
|1  |Blue |[Blue]     |(4,[3],[1.0])      |0.0        |(5,[3],[1.0])      |
|2  |Green|[Green]    |(4,[2],[1.0])      |1.0        |(5,[2,4],[1.0,1.0])|
|3  |White|[White]    |(4,[1],[1.0])      |3.0        |(5,[1,4],[1.0,3.0])|
+---+-----+-----------+-------------------+-----------+-------------------+



# Numerical Scaling

In [61]:
# Two numeric columns that will be scaled in the next cell.
df = spark.createDataFrame(
    data=[(0, 10.0), (1, 11.0), (2, 12.0), (3, 13.0)],
    schema=["Factor", "Color"],
)
df.show()

+------+-----+
|Factor|Color|
+------+-----+
|     0| 10.0|
|     1| 11.0|
|     2| 12.0|
|     3| 13.0|
+------+-----+



In [66]:
# Pack both numeric columns into one vector column, which is the input the
# scaler reads.
va = VectorAssembler(inputCols=["Factor", "Color"], outputCol="Color_VA")
df_tmp = va.transform(df)

# withMean=True and withStd=True: centre each feature on zero and scale it to
# unit standard deviation.
scaler = StandardScaler(
    inputCol="Color_VA",
    outputCol="scaledColor",
    withStd=True,
    withMean=True,
)

# Fitting computes the summary statistics that the transform step applies.
scalerModel = scaler.fit(df_tmp)
scaledData = scalerModel.transform(df_tmp)
scaledData.show(truncate=False)

+------+-----+----------+-----------------------------------------+
|Factor|Color|Color_VA  |scaledColor                              |
+------+-----+----------+-----------------------------------------+
|0     |10.0 |[0.0,10.0]|[-1.161895003862225,-1.161895003862225]  |
|1     |11.0 |[1.0,11.0]|[-0.3872983346207417,-0.3872983346207417]|
|2     |12.0 |[2.0,12.0]|[0.3872983346207417,0.3872983346207417]  |
|3     |13.0 |[3.0,13.0]|[1.161895003862225,1.161895003862225]    |
+------+-----+----------+-----------------------------------------+

