# Connexion au cluster

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Spark session used by every cell below.
# Executor and driver memory are requested through the builder configuration.
spark = (
    SparkSession.builder
    .appName("Airplane ML")
    .config('spark.executor.memory', '6g')
    .config('spark.driver.memory', '2g')
    .getOrCreate()
)

# Imports

In [None]:
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as F

# Chargement des données

In [None]:
# Location of the flights dataset, stored in Parquet format.
flightsParquetPath = '/data/parquet/flights'

# Load the raw flights table as a Spark DataFrame.
flights = spark.read.parquet(flightsParquetPath)

In [None]:
#flights.groupBy(flights.Origin).count().limit(10).toPandas()

In [None]:
# Columns kept for modelling; everything else is discarded.
selectedColumns = ['Year', 'Month', 'DayOfWeek', 'CRSDepTime', 'UniqueCarrier',
                   'FlightNum', 'Origin', 'Dest', 'ArrDelay']

# Arrival delay as an integer; the literal string 'NA' is counted as a delay of 0.
arrivalDelay = F.when(F.col('ArrDelay') == 'NA', 0).otherwise(F.col('ArrDelay')).cast('integer')

flights = (
    flights
    # Keep only the flights whose Origin is 'ORD'.
    .where(F.col('Origin') == 'ORD')
    .select(*selectedColumns)
    # Drop rows containing nulls in any of the selected columns.
    .na.drop()
    # Label column: string form ('true' / 'false') of "arrival delay > 20".
    .withColumn('Delayed', (arrivalDelay > 20).cast('string'))
    # Scheduled departure time as an integer feature.
    .withColumn('DepTime', F.col('CRSDepTime').cast('integer'))
)

# Display names of the model features, used later to label feature importances.
features = ['Month', 'DayOfWeek', 'DepTime', 'UniqueCarrierIndex', 'FlightNumIndex', 'OriginIndex', 'DestIndex']

In [None]:
# Preview up to ten rows labelled as delayed, rendered as a pandas DataFrame.
delayedSample = flights.filter(flights.Delayed == 'true').limit(10)
delayedSample.toPandas()

In [None]:
# Number of rows left in `flights` after the filtering cell above (triggers a Spark job).
flights.count()

# Préparation du modèle

## Séparation entrainement / test

In [None]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

# Alternative split by year, kept for reference (disabled):
#trainingData = flights.filter(flights.Year.isin('2007'))
#testData = flights.filter(flights.Year == '2008')

# Restrict to 2007-2008, then split randomly with weights 0.7 (training) / 0.3 (test).
recentFlights = flights.filter(flights.Year.isin('2007', '2008'))
trainingData, testData = recentFlights.randomSplit([0.7, 0.3])

# Mark both splits as cached so the cells below reuse them instead of recomputing.
trainingData.cache()
testData.cache()

In [None]:
# Preview up to ten rows of the training split as a pandas DataFrame.
trainingData.limit(10).toPandas()

In [None]:
# Row count per carrier in the training split, largest first.
trainingCarrierCounts = trainingData.groupBy('UniqueCarrier').count()
trainingCarrierCounts.orderBy(F.col('count').desc()).toPandas()

In [None]:
# Preview up to ten rows of the test split as a pandas DataFrame.
testData.limit(10).toPandas()

In [None]:
# Row count per carrier in the test split, largest first.
testCarrierCounts = testData.groupBy('UniqueCarrier').count()
testCarrierCounts.orderBy(F.col('count').desc()).toPandas()

## Feature engineering

In [None]:
# Index the string label 'Delayed' into the numeric column 'DelayedIndex'.
labelIndexer = StringIndexer(inputCol='Delayed', outputCol='DelayedIndex')

# One StringIndexer per categorical column, each writing to '<column>Index'.
# handleInvalid='skip' is set on all of them (per the PySpark docs, rows whose
# value was not seen at fit time are then dropped rather than raising).
categoricalColumns = ['Month', 'DayOfWeek', 'UniqueCarrier', 'FlightNum', 'Origin', 'Dest']
categoricalIndexers = [
    StringIndexer(inputCol=columnName, outputCol=columnName + 'Index', handleInvalid='skip')
    for columnName in categoricalColumns
]

In [None]:
# Columns packed into the single 'features' vector, in this order: the indexed
# categorical columns plus the integer 'DepTime'.
assemblerInputs = ['MonthIndex', 'DayOfWeekIndex', 'DepTime', 'UniqueCarrierIndex',
                   'FlightNumIndex', 'OriginIndex', 'DestIndex']
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')

## Paramétrage du modèle

In [None]:
# Random forest trained on the assembled 'features' vector to predict 'DelayedIndex'.
# NOTE(review): maxBins is presumably raised to 8000 to accommodate the
# high-cardinality indexed columns (e.g. FlightNumIndex) — confirm.
rf = RandomForestClassifier(
    labelCol="DelayedIndex",
    featuresCol="features",
    maxBins=8000,
)

In [None]:
# Stage order: label indexer, the categorical indexers, the vector assembler,
# and finally the random forest classifier.
pipelineStages = [labelIndexer] + categoricalIndexers + [assembler, rf]
pipeline = Pipeline(stages=pipelineStages)

# Fit every stage on the training split.
model = pipeline.fit(trainingData)

In [None]:
# Apply the fitted pipeline to the held-out test split.
testPredictions = model.transform(testData)
# Disabled alternative preview restricted to rows predicted as class 0:
#testPredictions.filter(testPredictions.prediction == 0).limit(10).toPandas()
# Preview up to ten prediction rows as a pandas DataFrame.
testPredictions.limit(10).toPandas()

## Évaluation du modèle

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the test predictions against the 'DelayedIndex' label. No metricName is
# given, so the evaluator's default metric applies (areaUnderROC according to the
# PySpark documentation — confirm for the installed version).
evaluator = BinaryClassificationEvaluator(labelCol='DelayedIndex')
evaluator.evaluate(testPredictions)


In [None]:
# List the fitted stages of the pipeline model, in pipeline order.
model.stages

In [None]:
import pandas as pd
# The random forest is the last stage of the fitted pipeline (label indexer,
# six categorical indexers, assembler, forest).
rfModel = model.stages[-1]

# Pair each importance with its display name; `features` lists the names in the
# same order as the assembler's input columns.
importanceValues = rfModel.featureImportances.toArray()
featureImportance = pd.DataFrame({'feature': features, 'importance': importanceValues})
featureImportance

In [None]:
import plotly.plotly as py
import cufflinks as cf
# Horizontal bar chart of the importances, one bar per feature.
# NOTE(review): `iplot` is not a built-in pandas method; presumably it is added
# to DataFrame by the `cufflinks` import above — confirm.
featureImportance.iplot(x='feature', kind='barh')

# Exploitation du modèle

In [None]:
# Persist the fitted pipeline model to '/data/airplane-model'.
# NOTE(review): presumably this raises if the path already exists (e.g. when the
# notebook is re-run); `model.write().overwrite().save(...)` would replace it — confirm
# which behaviour is wanted.
model.save('/data/airplane-model')

# Tuning des hyperparamètres

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Show the numTrees value currently set on the (unfitted) estimator `rf`; none was
# passed to its constructor, so this is the library default.
rf.getNumTrees()

In [None]:
# Hyperparameter grid for the forest: 4 values of numTrees x 3 values of maxDepth,
# i.e. 12 parameter combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 50, 100])
    .addGrid(rf.maxDepth, [3, 5, 7])
    .build()
)

In [None]:
# Evaluator used to rank the parameter combinations during cross-validation.
cvEvaluator = BinaryClassificationEvaluator(labelCol='DelayedIndex')

# 10-fold cross-validation of the whole pipeline over the parameter grid.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=cvEvaluator,
    numFolds=10,
)

In [None]:
# Run the cross-validated grid search on the training split. This fits the pipeline
# once per parameter combination and fold, so it can take a long time.
cvModel = crossval.fit(trainingData)

In [None]:
# Show the best model selected by cross-validation (a fitted pipeline model).
cvModel.bestModel

In [None]:
# The pipeline has nine stages (label indexer, six categorical indexers, assembler,
# random forest), so index 6 would select a fitted StringIndexer. The forest is the
# last stage, hence [-1].
rfModel = cvModel.bestModel.stages[-1]
# Number of trees in the selected forest.
# NOTE(review): accessed without parentheses as in the original; on the fitted model
# PySpark appears to expose this as a property — confirm for the installed version.
rfModel.getNumTrees

In [None]:
# The pipeline has nine stages (label indexer, six categorical indexers, assembler,
# random forest), so index 6 would select a fitted StringIndexer, which has no
# featureImportances. The forest is the last stage, hence [-1].
rfModel = cvModel.bestModel.stages[-1]

# Pair each importance with its display name; `features` lists the names in the
# same order as the assembler's input columns.
featureImportance = pd.DataFrame({'feature': features, 'importance': rfModel.featureImportances.toArray()})
featureImportance

In [None]:
# Horizontal bar chart of the tuned model's importances, one bar per feature.
# NOTE(review): `iplot` is presumably provided by the `cufflinks` import in an
# earlier cell — confirm.
featureImportance.iplot(x='feature', kind='barh')

In [None]:
# Score the held-out test split with the cross-validated model (replaces the
# predictions computed earlier with the untuned pipeline).
testPredictions = cvModel.transform(testData)

In [None]:
# Evaluate the tuned model's test predictions against 'DelayedIndex', using the
# evaluator's default metric (no metricName is set).
evaluator = BinaryClassificationEvaluator(labelCol='DelayedIndex')
evaluator.evaluate(testPredictions)

In [None]:
# Print the description of every parameter of the `rf` estimator, as returned by
# explainParams().
print(rf.explainParams())