# Connexion au cluster

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Airplane ML") \
    .config('spark.executor.memory', '6g') \
    .getOrCreate()

# Chargement des données

In [2]:
flights = spark.read.parquet('/data/parquet/flights')

In [3]:
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as F

flights = flights.where(flights.Year == '2008') \
    .select('Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'UniqueCarrier', 'FlightNum', 'Origin', 'Dest', 'ArrDelay') \
    .na.drop() \
    .withColumn('Delayed', (F.when(flights.ArrDelay == 'NA', 0).otherwise(flights.ArrDelay).cast('integer') > 20).cast('string')) \
    .withColumn('Year', flights.Year.cast('integer')) \
    .withColumn('Month', flights.Month.cast('integer')) \
    .withColumn('DayOfMonth', flights.DayOfMonth.cast('integer')) \
    .withColumn('DayOfWeek', flights.DayOfWeek.cast('integer')) \
    .withColumn('DepTime', flights.CRSDepTime.cast('integer'))
    
features = ['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrierIndex', 'FlightNumIndex', 'OriginIndex', 'DestIndex']

In [4]:
flights.limit(10) \
    .toPandas()

Unnamed: 0,Year,Month,DayOfMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,Origin,Dest,ArrDelay,Delayed,DepTime
0,2008,4,4,5,1250,AA,1063,DFW,LAS,-2,False,1250
1,2008,4,4,5,1955,WN,609,ABQ,AMA,2,False,1955
2,2008,4,4,5,950,AA,1065,BOS,SJU,14,False,950
3,2008,4,4,5,900,WN,3257,ABQ,BWI,-16,False,900
4,2008,4,4,5,935,AA,1067,ORD,IAH,17,False,935
5,2008,4,4,5,910,WN,77,ABQ,DAL,-1,False,910
6,2008,4,4,5,920,AA,1068,MIA,DCA,2,False,920
7,2008,4,4,5,655,WN,87,ABQ,DAL,-8,False,655
8,2008,4,4,5,1825,AA,1069,CLT,DFW,121,True,1825
9,2008,4,4,5,1315,WN,214,ABQ,DAL,5,False,1315


In [5]:
flights.count()

7009728

# Préparation du modèle

## Séparation entrainement / test

In [6]:
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline

(trainingData, testData) = [dataset.cache() for dataset in flights.randomSplit([0.7, 0.3])]

## Feature engineering

In [7]:
labelIndexer = StringIndexer(inputCol='Delayed', outputCol='DelayedIndex').fit(trainingData)
categoricalIndexers = [ StringIndexer(inputCol=inputColName, outputCol=inputColName + 'Index', handleInvalid='skip').fit(flights) \
                       for inputColName in ['UniqueCarrier', 'FlightNum', 'Origin', 'Dest'] ]

In [8]:
assembler = VectorAssembler(
    inputCols=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrierIndex', 'FlightNumIndex', 'OriginIndex', 'DestIndex'],
    outputCol='features')

## Paramétrage du modèle

In [9]:
rf = RandomForestClassifier(labelCol="DelayedIndex", featuresCol="features", maxBins=8000)

pipeline = Pipeline(stages=[labelIndexer, *categoricalIndexers, assembler, rf])

model = pipeline.fit(trainingData)

In [10]:
testPredictions = model.transform(testData)
testPredictions.limit(10).toPandas()


Unnamed: 0,Year,Month,DayOfMonth,DayOfWeek,CRSDepTime,UniqueCarrier,FlightNum,Origin,Dest,ArrDelay,...,DepTime,DelayedIndex,UniqueCarrierIndex,FlightNumIndex,OriginIndex,DestIndex,features,rawPrediction,probability,prediction
0,2008,1,2,3,100,NW,166,SEA,MSP,15,...,100,0.0,8.0,421.0,18.0,13.0,"[2008.0, 1.0, 2.0, 3.0, 100.0, 8.0, 421.0, 18....","[17.5673631406, 2.43263685936]","[0.878368157032, 0.121631842968]",0.0
1,2008,1,2,3,1000,AA,1599,DFW,IAH,-9,...,1000,0.0,1.0,859.0,2.0,6.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 1.0, 859.0, 2....","[16.6087456552, 3.39125434479]","[0.83043728276, 0.16956271724]",0.0
2,2008,1,2,3,1000,AA,1701,RDU,DFW,1,...,1000,0.0,1.0,861.0,33.0,2.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 1.0, 861.0, 33...","[16.8414534085, 3.15854659149]","[0.842072670425, 0.157927329575]",0.0
3,2008,1,2,3,1000,AA,1956,DFW,MSY,45,...,1000,1.0,1.0,1933.0,2.0,49.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 1.0, 1933.0, 2...","[16.5483341024, 3.45166589763]","[0.827416705119, 0.172583294881]",0.0
4,2008,1,2,3,1000,B6,1052,PIT,JFK,-3,...,1000,0.0,15.0,1206.0,47.0,16.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 15.0, 1206.0, ...","[17.2035186913, 2.7964813087]","[0.860175934565, 0.139824065435]",0.0
5,2008,1,2,3,1000,B6,1083,JFK,CLT,1,...,1000,0.0,15.0,844.0,16.0,14.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 15.0, 844.0, 1...","[17.043240566, 2.95675943402]","[0.852162028299, 0.147837971701]",0.0
6,2008,1,2,3,1000,DL,1177,SLC,PHX,-10,...,1000,0.0,5.0,882.0,10.0,5.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 5.0, 882.0, 10...","[17.390162549, 2.60983745104]","[0.869508127448, 0.130491872552]",0.0
7,2008,1,2,3,1000,DL,1270,BDL,ATL,4,...,1000,0.0,5.0,1098.0,54.0,0.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 5.0, 1098.0, 5...","[17.0089925148, 2.99100748519]","[0.85044962574, 0.14955037426]",0.0
8,2008,1,2,3,1000,DL,1275,EWR,ATL,-19,...,1000,0.0,5.0,961.0,11.0,0.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 5.0, 961.0, 11...","[16.789640262, 3.21035973803]","[0.839482013098, 0.160517986902]",0.0
9,2008,1,2,3,1000,DL,1764,SLC,MSP,-9,...,1000,0.0,5.0,1972.0,10.0,13.0,"[2008.0, 1.0, 2.0, 3.0, 1000.0, 5.0, 1972.0, 1...","[17.438483318, 2.56151668198]","[0.871924165901, 0.128075834099]",0.0


## Évaluation du modèle

In [11]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='DelayedIndex')
evaluator.evaluate(testPredictions)


0.6699356413566553

In [12]:
import pandas as pd
rfModel = model.stages[6]
featureImportance = pd.DataFrame({'feature': features, 'importance': rfModel.featureImportances.toArray()})
featureImportance

Unnamed: 0,feature,importance
0,Year,0.0
1,Month,0.209388
2,DayOfMonth,0.037498
3,DayOfWeek,0.0038
4,DepTime,0.416328
5,UniqueCarrierIndex,0.022214
6,FlightNumIndex,0.12719
7,OriginIndex,0.112112
8,DestIndex,0.07147


In [13]:
import plotly.plotly as py
import cufflinks as cf
featureImportance.iplot(x='feature', kind='barh')

# Tuning des hyperparamètres

In [14]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [15]:
rf.getNumTrees()

20

In [16]:
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 50]) \
    .build()

In [None]:
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(labelCol='DelayedIndex'),
                          numFolds=10)

In [None]:
cvModel = crossval.fit(trainingData)

--- Logging error ---
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44272)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-7d110be435f1>", line 1, in <module>
    cvModel = crossval.fit(trainingData)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/usr/local/spark/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/pipeline.py", line 108, in _fit
    model = stage.fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/wrapper.py"

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44272)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-7d110be435f1>", line 1, in <module>
    cvModel = crossval.fit(trainingData)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/usr/local/spark/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/pipeline.py", line 108, in _fit
    model = stage.fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 236, in _fit
  

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44272)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-7d110be435f1>", line 1, in <module>
    cvModel = crossval.fit(trainingData)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/usr/local/spark/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/pipeline.py", line 108, in _fit
    model = stage.fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 236, in _fit
  

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44272)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-7d110be435f1>", line 1, in <module>
    cvModel = crossval.fit(trainingData)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/usr/local/spark/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/pipeline.py", line 108, in _fit
    model = stage.fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 236, in _fit
  

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44272)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-7d110be435f1>", line 1, in <module>
    cvModel = crossval.fit(trainingData)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/usr/local/spark/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/pipeline.py", line 108, in _fit
    model = stage.fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 236, in _fit
  

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44272)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-7d110be435f1>", line 1, in <module>
    cvModel = crossval.fit(trainingData)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/usr/local/spark/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/pipeline.py", line 108, in _fit
    model = stage.fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 236, in _fit
  