# Connexion au cluster

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Airplane ML") \
    .config('spark.executor.memory', '6g') \
    .config('spark.driver.memory', '2g') \
    .getOrCreate()

# Chargement des données

In [2]:
flights = spark.read.parquet('/data/parquet/flights')

In [None]:
flights.groupBy(flights.Origin).count().limit(10).toPandas()

In [3]:
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as F

flights = flights.where((flights.Year == '2008') & (flights.Origin == 'JFK')) \
    .select('Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'FlightNum', 'Origin', 'Dest', 'ArrDelay') \
    .na.drop() \
    .withColumn('Delayed', (F.when(flights.ArrDelay == 'NA', 0).otherwise(flights.ArrDelay).cast('integer') > 20).cast('string')) \
    .withColumn('DepTime', flights.CRSDepTime.cast('integer'))
    
features = ['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrierIndex', 'FlightNumIndex', 'OriginIndex', 'DestIndex']

In [None]:
flights.limit(10) \
    .toPandas()

In [None]:
flights.count()

# Préparation du modèle

## Séparation entrainement / test

In [4]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

(trainingData, testData) = [dataset.cache() for dataset in flights.randomSplit([0.7, 0.3])]

## Feature engineering

In [8]:
labelIndexer = StringIndexer(inputCol='Delayed', outputCol='DelayedIndex')
categoricalIndexers = [ StringIndexer(inputCol=inputColName, outputCol=inputColName + 'Index', handleInvalid='skip') \
                       for inputColName in ['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'UniqueCarrier', 'FlightNum', 'Origin', 'Dest'] ]

In [9]:
assembler = VectorAssembler(
    inputCols=['YearIndex', 'MonthIndex', 'DayOfMonthIndex', 'DayOfWeekIndex', 'DepTime', 'UniqueCarrierIndex', 'FlightNumIndex', 'OriginIndex', 'DestIndex'],
    outputCol='features')

## Paramétrage du modèle

In [10]:
rf = RandomForestClassifier(labelCol="DelayedIndex", featuresCol="features", maxBins=8000)

In [11]:
pipeline = Pipeline(stages=[labelIndexer, *categoricalIndexers, assembler, rf])

model = pipeline.fit(trainingData)

In [None]:
testPredictions = model.transform(testData)
testPredictions.limit(10).toPandas()


In [12]:
singleRow = spark.sql("SELECT '2008',    '2',        '10',        '6',   1050,           'AA',     '1150',   'JFK', 'ORD'").toDF('Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'FlightNum', 'Origin', 'Dest')
singleRow.show()
model.transform(singleRow).toPandas()

+----+-----+----------+---------+-------+-------------+---------+------+----+
|Year|Month|DayOfMonth|DayOfWeek|DepTime|UniqueCarrier|FlightNum|Origin|Dest|
+----+-----+----------+---------+-------+-------------+---------+------+----+
|2008|    2|        10|        6|   1050|           AA|     1150|   JFK| ORD|
+----+-----+----------+---------+-------+-------------+---------+------+----+



Unnamed: 0,Year,Month,DayOfMonth,DayOfWeek,DepTime,UniqueCarrier,FlightNum,Origin,Dest,YearIndex,...,DayOfMonthIndex,DayOfWeekIndex,UniqueCarrierIndex,FlightNumIndex,OriginIndex,DestIndex,features,rawPrediction,probability,prediction


## Évaluation du modèle

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='DelayedIndex')
evaluator.evaluate(testPredictions)


In [None]:
model.stages

In [None]:
import pandas as pd
rfModel = model.stages[10]
featureImportance = pd.DataFrame({'feature': features, 'importance': rfModel.featureImportances.toArray()})
featureImportance

In [None]:
import plotly.plotly as py
import cufflinks as cf
featureImportance.iplot(x='feature', kind='barh')

# Exploitation du modèle

In [None]:
model.save('/data/airplane-model')

# Tuning des hyperparamètres

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
rf.getNumTrees()

In [None]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 50, 100]) \
    .addGrid(rf.maxDepth, [3, 5, 7]) \
    .build()

In [None]:
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol='DelayedIndex'),
                          numFolds=10)

In [None]:
cvModel = crossval.fit(trainingData)

In [None]:
cvModel.bestModel

In [None]:
rfModel = cvModel.bestModel.stages[6]
rfModel.getNumTrees

In [None]:
rfModel = cvModel.bestModel.stages[6]
featureImportance = pd.DataFrame({'feature': features, 'importance': rfModel.featureImportances.toArray()})
featureImportance

In [None]:
featureImportance.iplot(x='feature', kind='barh')

In [None]:
testPredictions = cvModel.transform(testData)

In [None]:
evaluator = BinaryClassificationEvaluator(labelCol='DelayedIndex')
evaluator.evaluate(testPredictions)

In [None]:
print(rf.explainParams())