## Using Pipeline techniques in Spark

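This notebook applies Spark ML pipelines to the flights dataset: it builds a linear regression model of flight duration, then cross-validates and tunes it with a parameter grid.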

In [76]:
import findspark
findspark.init('/home/rich/spark/spark-2.4.3-bin-hadoop2.7')
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import matplotlib.pyplot as plt

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import round

### Data dictionary of flights data
    mon — month (integer; appears to be stored zero-based, 0 to 11 — the df.head() preview below includes a month of 0)
    dom — day of month (integer between 1 and 31)
    dow — day of week (integer; 1 = Monday and 7 = Sunday)
    org — origin airport (IATA code)
    mile — distance (miles)
    carrier — carrier (IATA code)
    flight — flight number (integer)
    depart — departure time (decimal hour)
    duration — expected duration (minutes)
    delay — delay (minutes)

### Loading flights data

In [77]:
# Relative path to the flights CSV; both the pandas preview and the Spark reader load this file
file_path = './data/flights.csv'

In [78]:
# Load the CSV into a pandas DataFrame for a quick look before handing the same file to Spark
df = pd.read_csv(file_path)

In [79]:
# Preview the first rows of the pandas frame
df.head()

Unnamed: 0,mon,dom,dow,carrier,flight,org,mile,depart,duration,delay
0,11,20,6,US,19,JFK,2153,9.48,351,
1,0,22,2,UA,1107,ORD,316,16.33,82,30.0
2,2,20,4,UA,226,SFO,337,6.17,82,-8.0
3,9,13,1,AA,419,ORD,1236,10.33,195,-5.0
4,4,2,5,AA,325,ORD,258,8.92,65,


In [80]:
# Column dtypes and non-null counts; a non-null count below the row total marks a column with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
mon         50000 non-null int64
dom         50000 non-null int64
dow         50000 non-null int64
carrier     50000 non-null object
flight      50000 non-null int64
org         50000 non-null object
mile        50000 non-null int64
depart      50000 non-null float64
duration    50000 non-null int64
delay       47022 non-null float64
dtypes: float64(2), int64(6), object(2)
memory usage: 3.8+ MB


In [81]:
# Start (or reuse) a Spark session running locally on all available cores
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('FlightsEnsembles')
    .getOrCreate()
)

In [82]:
# Load the flights CSV into a Spark DataFrame, treating the literal 'NA' as null
flights = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
    sep=',',
    nullValue='NA',
)

# Report how many rows were read
record_count = flights.count()
print("The data contain %d records." % record_count)

# Replace the distance in miles with a distance in kilometres, rounded to 0 decimals.
# Note: `round` here is pyspark.sql.functions.round (imported at the top), not the builtin.
km_column = round(flights['mile'] * 1.60934, 0)
flights = flights.withColumn('km', km_column).drop('mile')

The data contain 50000 records.


### Flight duration model: Pipeline stages

Create the stages for the flight duration model pipeline.

In [83]:
# Index the origin airport strings: 'org' -> numeric 'org_idx'
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode both the indexed origin and the day of week with a single encoder
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx', 'dow'],
    outputCols=['org_dummy', 'dow_dummy'],
)

# Assemble the distance and the two one-hot vectors into a single 'features' column
assembler = VectorAssembler(inputCols=['km', 'org_dummy', 'dow_dummy'], outputCol='features')

# Linear regression with 'duration' as the label (reads the default 'features' column)
regression = LinearRegression(labelCol='duration')

In [84]:
# 80/20 train/test split; the seed is fixed so the split can be reproduced
split_weights = [0.8, 0.2]
flights_train, flights_test = flights.randomSplit(split_weights, seed=17)

In [85]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Chain the stages: index -> one-hot encode -> assemble features -> regress
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Fit every stage on the training data. The fitted PipelineModel gets its own
# name so that `pipeline` keeps referring to the unfitted estimator.
pipeline_model = pipeline.fit(flights_train)

# Make predictions on the testing data with the fitted pipeline
predictions = pipeline_model.transform(flights_test)

### Cross validating simple flight duration model

In [86]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Fresh model and evaluator, both using 'duration' as the label column
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# No parameters are added to the grid, so cross-validation only scores the
# model with its default settings
empty_grid = ParamGridBuilder()
params = empty_grid.build()


### Cross validating flight duration model pipeline

In [87]:
# Index the origin airport codes: 'org' -> 'org_idx'
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode the indexed origin
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
)

# Combine the distance and the one-hot encoded origin into 'features'
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy'],
    outputCol='features',
)

# Wrap the stages in a pipeline, then cross-validate the whole pipeline so the
# feature stages are re-fitted on each training fold
duration_stages = [indexer, onehot, assembler, regression]
pipeline = Pipeline(stages=duration_stages)

cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
)

### Optimizing flights linear regression with GridSearch

In [88]:
# Grid over regularisation strength and elastic-net mixing for the regression stage
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

# 5-fold cross-validation over the grid. The seed is fixed (same value as the
# train/test split) so fold assignment, and therefore the averaged metrics and
# the selected model, are reproducible across kernel restarts.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
    seed=17,
)

# Fitting returns a CrossValidatorModel; `cv` is rebound to it because the
# following cells read `cv.avgMetrics` and `cv.bestModel`.
cv = cv.fit(flights_train)

Number of models to be tested:  12


### Dissecting the best flight duration model


In [89]:
# Average cross-validation metric for each parameter combination, in parameter-grid order
cv.avgMetrics

[11.268931315728903,
 11.269233313436072,
 11.269701769380896,
 11.270489131102623,
 11.292339000559178,
 11.33937114874702,
 11.36619394722489,
 11.595104284795871,
 11.78445226157085,
 14.56300085560058,
 17.15785620240054,
 19.32140480958893]

In [90]:
# Get the best model (a fitted pipeline) from cross validation
best_model = cv.bestModel

# Look at the stages in the best model
print(best_model.stages)

# The regression is the last pipeline stage; keep its parameter map in a
# variable so the selected values can be inspected instead of being discarded
best_regression = best_model.stages[-1]
best_regression_params = best_regression.extractParamMap()

# Generate predictions on the testing data with the best model, then score
# them with the evaluator (the cell's displayed result)
predictions = best_model.transform(flights_test)
evaluator.evaluate(predictions)

[StringIndexer_dc1cd8301d6e, OneHotEncoderEstimator_11c4ab7c18fe, VectorAssembler_26ff11b047ad, LinearRegression_f4c38afb1db3]


11.011963880807526

In [91]:
#print(best_model.stages[3].extractParamMap())