# PYSPARK MACHINE LEARNING LIBRARY NOTES  
ELİF CANSU YILDIZ

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit,col

In [2]:
# Start the Spark session used by every cell below, or reuse the one that is
# already running.
spark = (
    SparkSession.builder
    .appName("MachineLearningNotes")
    .getOrCreate()
)

In [3]:
# Explicit schema for dataset.csv: column order must match the file, and every
# column is nullable.
schema = StructType([
    StructField("age", IntegerType(), True),
    StructField("fnlwgt", FloatType(), True),
    StructField("education", StringType(), True),
    StructField("education_num", FloatType(), True),
    StructField("marital_status", StringType(), True),
    StructField("relationship", StringType(), True),
    StructField("race", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("capital_gain", FloatType(), True),
    StructField("capital_loss", FloatType(), True),
    StructField("hours_per_week", FloatType(), True),
])

# The fields in dataset.csv are separated by ", ", so without trimming every
# string value keeps a leading blank (" Male", " Bachelors", ...), which makes
# equality filters and joins on those columns silently miss.
df = spark.read.csv("dataset.csv", schema=schema, ignoreLeadingWhiteSpace=True)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: float (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: float (nullable = true)
 |-- capital_loss: float (nullable = true)
 |-- hours_per_week: float (nullable = true)



In [184]:
# The table is too wide for one screen: show the first nine columns, then the rest.
df.select(*df.columns[:9]).show()
df.select(*df.columns[9:]).show()

+---+--------+-------------+-------------+--------------------+---------------+-------------------+-------+------------+
|age|  fnlwgt|    education|education_num|      marital_status|   relationship|               race|    sex|capital_gain|
+---+--------+-------------+-------------+--------------------+---------------+-------------------+-------+------------+
| 39| 77516.0|    Bachelors|         13.0|       Never-married|  Not-in-family|              White|   Male|      2174.0|
| 50| 83311.0|    Bachelors|         13.0|  Married-civ-spouse|        Husband|              White|   Male|         0.0|
| 38|215646.0|      HS-grad|          9.0|            Divorced|  Not-in-family|              White|   Male|         0.0|
| 53|234721.0|         11th|          7.0|  Married-civ-spouse|        Husband|              Black|   Male|         0.0|
| 28|338409.0|    Bachelors|         13.0|  Married-civ-spouse|           Wife|              Black| Female|         0.0|
| 37|284582.0|      Masters|    

In [157]:
# String-typed columns that get indexed and one-hot encoded.
categoricalColumns = [
    "education",
    "marital_status",
    "relationship",
    "race",
    "sex",
]

# Numeric columns that can be assembled into a feature vector as they are.
numericCols = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]

## String Indexer and OneHotEncoderEstimator

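1. Index each categorical column with StringIndexer (each category is mapped to a numeric index ordered by frequency, so the most frequent category gets index 0).

2. Use OneHotEncoderEstimator to convert the indexed categories into binary SparseVectors.

3. Add both to the list of stages. They are not run here; the Pipeline runs them all at once later on.
4. (Optional) Convert the label column into label indices with another StringIndexer. This step is not applied in this notebook because the dataset loaded above has no label column.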

In [112]:
# Stages of the preprocessing Pipeline, filled in by the next cell.
stages = list()

In [113]:
# Build the categorical stages from an empty list so this cell can be re-run
# safely: appending to a list left over from a previous run would duplicate
# every stage, and Pipeline.fit then fails because the output columns already
# exist.
stages = []
for categoricalCol in categoricalColumns:
    # Map each category string to a numeric index in "<column>Index".
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")

    # One-hot encode that index into a sparse binary vector in "<column>classVec".
    encoder = OneHotEncoderEstimator(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoricalCol + "classVec"],
    )

    stages += [stringIndexer, encoder]

# No label indexer is added: the schema loaded above has no "income" column.
# For a labelled dataset, append StringIndexer(inputCol=<label column>,
# outputCol="label") to `stages` here.

## Pipeline

In [125]:
# Fit every indexer/encoder stage on df, then apply the fitted model to the
# same data to add the "...Index" and "...classVec" columns.
partialPipeline = Pipeline(stages=stages)
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)

In [115]:
# Original value, index and one-hot vector side by side for two of the columns.
preppedDataDF.select(
    "education",
    "educationIndex",
    "educationclassVec",
    "marital_status",
    "marital_statusIndex",
    "marital_statusclassVec",
).show()

# The remaining generated columns, split in two so each table fits the screen.
preppedDataDF.select(*preppedDataDF.columns[11:15]).show()
preppedDataDF.select(*preppedDataDF.columns[15:]).show()

+-------------+--------------+-----------------+--------------------+-------------------+----------------------+
|    education|educationIndex|educationclassVec|      marital_status|marital_statusIndex|marital_statusclassVec|
+-------------+--------------+-----------------+--------------------+-------------------+----------------------+
|    Bachelors|           0.0|    (8,[0],[1.0])|       Never-married|                1.0|         (3,[1],[1.0])|
|    Bachelors|           0.0|    (8,[0],[1.0])|  Married-civ-spouse|                0.0|         (3,[0],[1.0])|
|      HS-grad|           1.0|    (8,[1],[1.0])|            Divorced|                2.0|         (3,[2],[1.0])|
|         11th|           3.0|    (8,[3],[1.0])|  Married-civ-spouse|                0.0|         (3,[0],[1.0])|
|    Bachelors|           0.0|    (8,[0],[1.0])|  Married-civ-spouse|                0.0|         (3,[0],[1.0])|
|      Masters|           2.0|    (8,[2],[1.0])|  Married-civ-spouse|                0.0|       

## Vector Assembler

Combine the numeric feature columns into a single vector column using VectorAssembler.

In [191]:
# Numeric columns to pack into a single "features" vector column.
numericCols = [
    "age",
    "fnlwgt",
    "education_num",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
]
assemblerInputs = numericCols
assembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)
stages2 = [assembler]

# A one-stage pipeline: fitting it and transforming df appends the "features" column.
partialPipeline = Pipeline(stages=stages2)
pipelineModel = partialPipeline.fit(df)
preppedDataDF2 = pipelineModel.transform(df)

In [118]:
# Keep only the int, float and vector columns, i.e. the assembler's inputs and
# its output.
selectedColumns = [
    name
    for name, dtype in preppedDataDF2.dtypes
    if dtype in ("int", "float", "vector")
]

preppedDataDF2.select(*selectedColumns).show(truncate=False)

+---+--------+-------------+------------+------------+--------------+------------------------------------+
|age|fnlwgt  |education_num|capital_gain|capital_loss|hours_per_week|features                            |
+---+--------+-------------+------------+------------+--------------+------------------------------------+
|39 |77516.0 |13.0         |2174.0      |0.0         |40.0          |[39.0,77516.0,13.0,2174.0,0.0,40.0] |
|50 |83311.0 |13.0         |0.0         |0.0         |13.0          |[50.0,83311.0,13.0,0.0,0.0,13.0]    |
|38 |215646.0|9.0          |0.0         |0.0         |40.0          |[38.0,215646.0,9.0,0.0,0.0,40.0]    |
|53 |234721.0|7.0          |0.0         |0.0         |40.0          |[53.0,234721.0,7.0,0.0,0.0,40.0]    |
|28 |338409.0|13.0         |0.0         |0.0         |40.0          |[28.0,338409.0,13.0,0.0,0.0,40.0]   |
|37 |284582.0|14.0         |0.0         |0.0         |40.0          |[37.0,284582.0,14.0,0.0,0.0,40.0]   |
|49 |160187.0|5.0          |0.0      

## MinMaxScaler

MinMaxScaler operates on vector columns, so the column to be scaled must first be assembled into a vector (with VectorAssembler) before the scaler is fitted.

In [10]:
# MinMaxScaler takes a vector column as input, so wrap hours_per_week in a
# one-element vector first.
assembler = VectorAssembler(inputCols=["hours_per_week"], outputCol="vector")
stage = [assembler]
partialPipeline = Pipeline(stages=stage)
pipelineModel = partialPipeline.fit(df)
newdf = pipelineModel.transform(df)

column2 = "vector"
# One shared name for the scaler's output column, so the select below matches
# it exactly and still resolves when spark.sql.caseSensitive is enabled.
scaledColumn = "scaled_hours_per_week"

# Rescale linearly so the smallest observed value maps to 0 and the largest to 100.
mmScaler = MinMaxScaler(min=0.0, max=100.0, inputCol=column2, outputCol=scaledColumn)
model = mmScaler.fit(newdf)
scaledDF = model.transform(newdf)

scaledDF.select("hours_per_week", column2, scaledColumn).show(truncate=False)


+--------------+------+---------------------+
|hours_per_week|vector|scaled_hours_per_week|
+--------------+------+---------------------+
|40.0          |[40.0]|[40.298507462686565] |
|13.0          |[13.0]|[0.0]                |
|40.0          |[40.0]|[40.298507462686565] |
|40.0          |[40.0]|[40.298507462686565] |
|40.0          |[40.0]|[40.298507462686565] |
|40.0          |[40.0]|[40.298507462686565] |
|16.0          |[16.0]|[4.477611940298507]  |
|45.0          |[45.0]|[47.76119402985074]  |
|50.0          |[50.0]|[55.223880597014926] |
|40.0          |[40.0]|[40.298507462686565] |
|80.0          |[80.0]|[100.0]              |
|40.0          |[40.0]|[40.298507462686565] |
|30.0          |[30.0]|[25.37313432835821]  |
|50.0          |[50.0]|[55.223880597014926] |
|40.0          |[40.0]|[40.298507462686565] |
|45.0          |[45.0]|[47.76119402985074]  |
|35.0          |[35.0]|[32.83582089552239]  |
|40.0          |[40.0]|[40.298507462686565] |
|50.0          |[50.0]|[55.2238805

References:  
https://docs.databricks.com/spark/latest/mllib/binary-classification-mllib-pipelines.html  
http://spark.apache.org/docs/2.4.0/api/python/pyspark.ml.html#module-pyspark.ml.feature