# Chapter 6

## Multilayer perceptron regressor

In [None]:
from pyspark.sql import SparkSession 

# Start (or reuse) a Spark session named "Intro" that runs in local mode.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Intro")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, DoubleType

# Explicit schema for the CO2 emissions CSV: descriptive columns are strings,
# every numeric column (including the CO2 label) is a nullable double.
custom_schema = (
    StructType()
    .add("Make", StringType(), True)
    .add("Model", StringType(), True)
    .add("Vehicle Class", StringType(), True)
    .add("Cylinders", DoubleType(), True)
    .add("Transmission", StringType(), True)
    .add("Fuel Type", StringType(), True)
    .add("Fuel Consumption City (L/100 km)", DoubleType(), True)
    .add("Fuel Consumption Hwy (L/100 km)", DoubleType(), True)
    .add("Fuel Consumption Comb (L/100 km)", DoubleType(), True)
    .add("Fuel Consumption Comb (mpg)", DoubleType(), True)
    .add("CO2", DoubleType(), True)
)


In [None]:
# Load the CO2 emissions CSV with the explicit schema; the first row is a header.
# NOTE(review): with a user-supplied schema the CSV reader is expected to match
# columns by position rather than by header name - confirm that the file's
# column order and count agree with custom_schema.
co2_data = (
    spark.read.format("csv")
    .schema(custom_schema)
    .option("header", True)
    .load("../datasets/CO2_Emissions_Canada.csv")
)

In [None]:
# Return the first two rows as a quick look at the freshly loaded data.
co2_data.take(2)

In [None]:
# Mapping of continuous column name -> value (presumably a per-column fill
# value - TODO confirm). Only the city fuel-consumption column is listed; the
# "Fuel Consumption Hwy (L/100 km)" and "Fuel Consumption Comb (L/100 km)"
# columns were left out.
# NOTE(review): nothing else in the visible notebook reads this dict.
cols_only_continues_values = {
    'Fuel Consumption City (L/100 km)': 0,
}

In [None]:
# Replace missing values with 0.0; a float fill value is applied to numeric
# columns only, so the string columns are left as they are.
co2_data = co2_data.fillna(value=0.0)

In [None]:
# Print the DataFrame's column names, types and nullability.
co2_data.printSchema()

In [None]:
# Look at the first two rows again, now that missing values have been filled.
co2_data.take(2)

# Prep the data for regression

Turn the feature columns into a single hashed feature-vector column:

In [None]:
from pyspark.ml.feature import FeatureHasher
from pyspark.sql.functions import col

# Every candidate input column in the dataset. Kept for reference only: the
# hasher below is fed just the continuous fuel-consumption columns.
cols = ["Make", "Model", "Vehicle Class","Cylinders","Transmission","Fuel Type",
        "Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)",
        "Fuel Consumption Comb (L/100 km)","Fuel Consumption Comb (mpg)"]

# Continuous columns that are actually hashed into the feature vector.
cols_only_continues = ["Fuel Consumption City (L/100 km)", "Fuel Consumption Hwy (L/100 km)",
        "Fuel Consumption Comb (L/100 km)"]

# FeatureHasher packs the chosen columns into one vector column.
hasher = FeatureHasher(inputCols=cols_only_continues, outputCol="hashed_features")

# Drop any "hashed_features" column left over from a previous run of this cell
# before transforming: the transformer refuses to write to an output column
# that already exists, so without the drop the cell fails when re-executed.
# Dropping a column that is not present is a no-op, so the first run is unaffected.
co2_data = hasher.transform(co2_data.drop("hashed_features"))
                       


In [None]:
# Show the first five hashed feature vectors in full (no truncation).
co2_data.select("hashed_features").show(5, truncate=False)

In [None]:
# Return the first row of the hashed feature column (a one-element list).
co2_data.select("hashed_features").take(1)

In [None]:
# Show the first five hashed feature vectors again (same call as two cells earlier).
co2_data.select("hashed_features").show(5, truncate=False)

In [None]:
# Print the schema, which now also contains the "hashed_features" column.
co2_data.printSchema()

# Time to select the most meaningful features:

In [None]:
from pyspark.ml.feature import UnivariateFeatureSelector

# Univariate feature selection over the hashed vector, scored against the CO2
# label; both the features and the label are declared continuous.
# NOTE(review): no selection mode or threshold is set here, so Spark's defaults
# decide how many features are kept - the 50-unit input layer of the network
# trained later in this notebook depends on that number.
selector = (
    UnivariateFeatureSelector(
        featuresCol="hashed_features",
        labelCol="CO2",
        outputCol="selectedFeatures",
    )
    .setFeatureType("continuous")
    .setLabelType("continuous")
)

# Fit the selector, then append the reduced "selectedFeatures" column.
model = selector.fit(co2_data)
output = model.transform(co2_data)

In [None]:
# Show the first five feature vectors produced by the selector, untruncated.
output.select("selectedFeatures").show(5, truncate=False)

Count the number of available classes, since it determines the size of the last layer of the multilayer perceptron classifier.

Classifier trainer based on the Multilayer Perceptron.
Each layer has a sigmoid activation function; the output layer has softmax.

Number of inputs has to be equal to the size of feature vectors. 

Number of outputs has to be equal to the total number of labels.


Specify the layers for the neural network as follows: 

input layer => size 50 (features), two intermediate (i.e. hidden) layers of sizes 20 and 8, and output layer => size 70, as the largest value of CO2 is 69 and the labels array is searched by index (classes).




In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Network topology: 50 inputs (the selected features), hidden layers of 20 and
# 8 units, and 70 output classes. Training is capped at 100 iterations and
# uses a fixed seed.
mlp = MultilayerPerceptronClassifier(
    featuresCol="selectedFeatures",
    labelCol="CO2",
    layers=[50, 20, 8, 70],
    maxIter=100,
    seed=123,
)

# Train on the whole feature-selected dataset (no train/test split is made here).
model = mlp.fit(output)

