In [43]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.regression import LinearRegression

In [3]:
# Start (or reuse) a Spark session in local mode; 'local[*]' uses as many worker threads as there are logical cores on this machine.
spark = SparkSession.builder.master('local[*]').appName('test').getOrCreate()

In [4]:
# Show the Spark version the session is running.
print(spark.version)

3.0.0


## Load Data

In [5]:
# Load the flights CSV: the first row is the header, column types are inferred,
# and the literal string 'NA' is read as null.
data = spark.read.csv(
    'flights.csv',
    header=True,
    inferSchema=True,
    nullValue='NA',
)

In [6]:
# Total number of records loaded from the CSV.
data.count()

275000

In [7]:
# Column names paired with the types Spark inferred for them.
data.dtypes

[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

In [8]:
# Preview the first 8 rows.
data.show(8)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 10| 10|  1|     OO|  5836|ORD| 157|  8.18|      51|   27|
|  1|  4|  1|     OO|  5866|ORD| 466|  15.5|     102| null|
| 11| 22|  1|     OO|  6016|ORD| 738|  7.17|     127|  -19|
|  2| 14|  5|     B6|   199|JFK|2248| 21.17|     365|   60|
|  5| 25|  3|     WN|  1675|SJC| 386| 12.92|      85|   22|
|  3| 28|  1|     B6|   377|LGA|1076| 13.33|     182|   70|
|  5| 28|  6|     B6|   904|ORD| 740|  9.58|     130|   47|
|  1| 19|  2|     UA|   820|SFO| 679| 12.75|     123|  135|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 8 rows



Data dictionary:
- mon — month (integer between 1 and 12)
- dom — day of month (integer between 1 and 31)
- dow — day of week (integer; 1 = Monday and 7 = Sunday)
- org — origin airport (IATA code)
- mile — distance (miles)
- carrier — carrier (IATA code)
- depart — departure time (decimal hour)
- duration — expected duration (minutes)
- delay — delay (minutes)

## Data Wrangling

### Drop columns and nulls

We want to predict the delay. Let's drop the flight column, because it is uninformative for that purpose.

In [9]:
# Remove the 'flight' column; the result goes into a new DataFrame, so `data` itself is unchanged.
data2 = data.drop('flight')

We want only the records with no null entries in the delay column

In [10]:
# How many records have no delay value?
data2.where('delay IS NULL').count()

16711

In [11]:
# Keep only the records whose delay is known.
data2 = data2.where('delay IS NOT NULL')

In [12]:
# Records remaining after removing rows with a null delay.
data2.count()

258289

In [13]:
# Count after dropping rows that contain a null in any column; if it matches the previous count, no nulls remain.
data2.dropna().count()

258289

There are no more nulls in the data

### Create target column

The Federal Aviation Administration (FAA) considers a flight to be "delayed" when it arrives 15 minutes or more after its scheduled time.

Let's create a boolean column indicating whether or not a flight was delayed

In [14]:
# Binary target: 1 when the delay is at least 15 minutes, 0 otherwise.
data2 = data2.withColumn(
    'delayed',
    (data2['delay'] >= 15).cast('integer'),
)

In [15]:
# Preview the first 5 rows, now including the 'delayed' target column.
data2.show(5)

+---+---+---+-------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|delayed|
+---+---+---+-------+---+----+------+--------+-----+-------+
| 10| 10|  1|     OO|ORD| 157|  8.18|      51|   27|      1|
| 11| 22|  1|     OO|ORD| 738|  7.17|     127|  -19|      0|
|  2| 14|  5|     B6|JFK|2248| 21.17|     365|   60|      1|
|  5| 25|  3|     WN|SJC| 386| 12.92|      85|   22|      1|
|  3| 28|  1|     B6|LGA|1076| 13.33|     182|   70|      1|
+---+---+---+-------+---+----+------+--------+-----+-------+
only showing top 5 rows



### Categorical columns

In [16]:
# Map each carrier code to a numeric index.
# Indices are assigned by frequency, from the most to the least frequent category.
# Use stringOrderType to change that ordering.
data_idx = (
    StringIndexer(inputCol='carrier', outputCol='carrier_idx')
    .fit(data2)
    .transform(data2)
)

In [17]:
# Map each origin airport code to a numeric index, on top of the carrier-indexed frame.
data_idx = (
    StringIndexer(inputCol='org', outputCol='org_idx')
    .fit(data_idx)
    .transform(data_idx)
)

In [18]:
# Preview the first 5 rows with the new carrier_idx and org_idx columns.
data_idx.show(5)

+---+---+---+-------+---+----+------+--------+-----+-------+-----------+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|delayed|carrier_idx|org_idx|
+---+---+---+-------+---+----+------+--------+-----+-------+-----------+-------+
| 10| 10|  1|     OO|ORD| 157|  8.18|      51|   27|      1|        2.0|    0.0|
| 11| 22|  1|     OO|ORD| 738|  7.17|     127|  -19|      0|        2.0|    0.0|
|  2| 14|  5|     B6|JFK|2248| 21.17|     365|   60|      1|        4.0|    2.0|
|  5| 25|  3|     WN|SJC| 386| 12.92|      85|   22|      1|        3.0|    5.0|
|  3| 28|  1|     B6|LGA|1076| 13.33|     182|   70|      1|        4.0|    3.0|
+---+---+---+-------+---+----+------+--------+-----+-------+-----------+-------+
only showing top 5 rows



In [19]:
# One-hot encode the origin index into the 'org_dummy' vector column.
data_1hot = (
    OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])
    .fit(data_idx)
    .transform(data_idx)
)

In [20]:
# One-hot encode the carrier index into the 'carrier_dummy' vector column.
data_1hot = (
    OneHotEncoder(inputCols=['carrier_idx'], outputCols=['carrier_dummy'])
    .fit(data_1hot)
    .transform(data_1hot)
)

In [22]:
# The dummy columns are sparse vectors, printed as (vector length, [indices of the non-zero entries], [their values]).
data_1hot.show(5)

+---+---+---+-------+---+----+------+--------+-----+-------+-----------+-------+-------------+-------------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|delayed|carrier_idx|org_idx|    org_dummy|carrier_dummy|
+---+---+---+-------+---+----+------+--------+-----+-------+-----------+-------+-------------+-------------+
| 10| 10|  1|     OO|ORD| 157|  8.18|      51|   27|      1|        2.0|    0.0|(7,[0],[1.0])|(8,[2],[1.0])|
| 11| 22|  1|     OO|ORD| 738|  7.17|     127|  -19|      0|        2.0|    0.0|(7,[0],[1.0])|(8,[2],[1.0])|
|  2| 14|  5|     B6|JFK|2248| 21.17|     365|   60|      1|        4.0|    2.0|(7,[2],[1.0])|(8,[4],[1.0])|
|  5| 25|  3|     WN|SJC| 386| 12.92|      85|   22|      1|        3.0|    5.0|(7,[5],[1.0])|(8,[3],[1.0])|
|  3| 28|  1|     B6|LGA|1076| 13.33|     182|   70|      1|        4.0|    3.0|(7,[3],[1.0])|(8,[4],[1.0])|
+---+---+---+-------+---+----+------+--------+-----+-------+-----------+-------+-------------+-------------+
only showing top 5 

### Assemble columns
Consolidate all of the predictor columns into a single column.

In [23]:
# Combine every predictor column into a single 'features' vector column.
assembler = VectorAssembler(
    inputCols=[
        'mon', 'dom', 'dow',
        'carrier_dummy', 'org_dummy',
        'mile', 'depart', 'duration',
    ],
    outputCol='features',
)

In [24]:
# Apply the assembler to the one-hot encoded frame, adding the 'features' column.
data_assambled = assembler.transform(data_1hot)

In [29]:
# Show the individual predictors next to the assembled vector and the target.
data_assambled.select(
    'mon', 'dom', 'dow', 'carrier_dummy', 'org_dummy',
    'mile', 'depart', 'duration', 'features', 'delayed',
).show(5, truncate=True)

+---+---+---+-------------+-------------+----+------+--------+--------------------+-------+
|mon|dom|dow|carrier_dummy|    org_dummy|mile|depart|duration|            features|delayed|
+---+---+---+-------------+-------------+----+------+--------+--------------------+-------+
| 10| 10|  1|(8,[2],[1.0])|(7,[0],[1.0])| 157|  8.18|      51|(21,[0,1,2,5,11,1...|      1|
| 11| 22|  1|(8,[2],[1.0])|(7,[0],[1.0])| 738|  7.17|     127|(21,[0,1,2,5,11,1...|      0|
|  2| 14|  5|(8,[4],[1.0])|(7,[2],[1.0])|2248| 21.17|     365|(21,[0,1,2,7,13,1...|      1|
|  5| 25|  3|(8,[3],[1.0])|(7,[5],[1.0])| 386| 12.92|      85|(21,[0,1,2,6,16,1...|      1|
|  3| 28|  1|(8,[4],[1.0])|(7,[3],[1.0])|1076| 13.33|     182|(21,[0,1,2,7,14,1...|      1|
+---+---+---+-------------+-------------+----+------+--------+--------------------+-------+
only showing top 5 rows



## Machine Learning

### Split train-test data

In [30]:
# Random 80/20 train/test split, with the seed fixed at 42.
train, test = data_assambled.randomSplit([0.8,0.2], seed=42)

### Decision tree

In [31]:
# Fit a decision tree on the training split to predict the 'delayed' label.
tree = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='delayed',
).fit(train)

In [32]:
# Score the held-out test split with the fitted tree.
predictions = tree.transform(test)

In [33]:
# Compare the true label with the predicted label and class probabilities.
(
    predictions
    .select('delayed', 'prediction', 'probability')
    .show(5, truncate=False)
)

+-------+----------+----------------------------------------+
|delayed|prediction|probability                             |
+-------+----------+----------------------------------------+
|0      |1.0       |[0.33498860306089223,0.6650113969391078]|
|1      |1.0       |[0.33498860306089223,0.6650113969391078]|
|1      |0.0       |[0.5506902312494505,0.44930976875054957]|
|1      |1.0       |[0.33498860306089223,0.6650113969391078]|
|1      |1.0       |[0.33498860306089223,0.6650113969391078]|
+-------+----------+----------------------------------------+
only showing top 5 rows



#### Evaluation

Confusion matrix

In [34]:
# Count the rows for every (true label, predicted label) combination.
predictions.groupBy('delayed', 'prediction').count().show()

+-------+----------+-----+
|delayed|prediction|count|
+-------+----------+-----+
|      1|       0.0| 8838|
|      0|       0.0|15308|
|      1|       1.0|17180|
|      0|       1.0|10364|
+-------+----------+-----+



In [35]:
# Confusion-matrix cells: the predicted class picks the N/P letter,
# and agreement with the true label picks T (equal) or F (different).
TN = predictions.where('prediction = 0 AND delayed = prediction').count()
TP = predictions.where('prediction = 1 AND delayed = prediction').count()
FN = predictions.where('prediction = 0 AND delayed <> prediction').count()
FP = predictions.where('prediction = 1 AND delayed <> prediction').count()

In [36]:
# Accuracy: share of all test rows that were classified correctly.
accuracy = (TP + TN) / (TP + TN + FP + FN)
print(accuracy)

0.62851615399497


In [37]:
# Precision: share of predicted-delayed flights that really were delayed.
precision = TP / (FP + TP)
# Recall: share of really delayed flights that were predicted as delayed.
recall = TP / (FN + TP)
print(precision, recall)

0.6237293058379321 0.6603120916288723


### Logistic Regression

In [38]:
# Fit a logistic regression on the same training split and label.
log = LogisticRegression(
    featuresCol='features',
    labelCol='delayed',
).fit(train)

In [39]:
# Score the held-out test split with the fitted logistic regression.
preds = log.transform(test)

#### Evaluation

Confusion matrix

In [40]:
# Count the rows for every (true label, predicted label) combination.
preds.groupBy('delayed', 'prediction').count().show()

+-------+----------+-----+
|delayed|prediction|count|
+-------+----------+-----+
|      1|       0.0| 9188|
|      0|       0.0|14693|
|      1|       1.0|16830|
|      0|       1.0|10979|
+-------+----------+-----+



Weighted Precision

In [41]:
# Evaluate the logistic-regression predictions; the metric is chosen per call
# through a param map rather than on the evaluator itself.
multi_evaluator = MulticlassClassificationEvaluator(labelCol='delayed')
weighted_precision = multi_evaluator.evaluate(
    preds,
    {multi_evaluator.metricName: "weightedPrecision"},
)
print(weighted_precision)

0.6101957069746587


AUC (area under ROC)

In [42]:
# Area under the ROC curve for the logistic-regression predictions,
# with the metric passed as a per-call param map.
binary_evaluator = BinaryClassificationEvaluator(labelCol='delayed')
auc = binary_evaluator.evaluate(
    preds,
    {binary_evaluator.metricName: 'areaUnderROC'},
)
print(auc)

0.653047019327488


## Linear Regression to predict the duration

- Start with just one feature: distance

In [44]:
# Feature vector made of a single predictor: the distance in miles.
assembler2 = VectorAssembler(
    outputCol='features',
    inputCols=['mile'],
)

In [45]:
# Build the single-feature 'features' column on the one-hot encoded frame.
df_assambled = assembler2.transform(data_1hot)

In [46]:
# Random 80/20 train/test split, with the seed fixed at 42.
df_train, df_test = df_assambled.randomSplit([0.8,0.2], seed=42)

In [47]:
# Fit duration ~ distance; 'features' is the default features column, spelled out here for clarity.
reg = LinearRegression(
    featuresCol='features',
    labelCol='duration',
).fit(df_train)

In [48]:
# Predict durations for the test split and compare them with the actual values.
pred_dur = reg.transform(df_test)
pred_dur.select('duration', 'prediction').show(5, truncate=False)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|245     |213.57145459809462|
|160     |133.36577053199244|
|155     |137.14445777522826|
|190     |177.73487235579364|
|255     |213.32766832433748|
+--------+------------------+
only showing top 5 rows



In [50]:
# Root mean squared error of the predicted durations ('rmse' is the evaluator's default metric).
RegressionEvaluator(labelCol='duration', metricName='rmse').evaluate(pred_dur)

17.08160179005299

Coefficients

In [52]:
# Intercept of the distance-only model: the predicted duration at zero miles,
# interpreted here as the average minutes spent on the ground.
inter = reg.intercept
print(inter)

44.018101199997126


In [55]:
# Coefficients: only one, because the model has a single feature.
# It is the slope for distance, i.e. the average minutes per mile.
coefs = reg.coefficients
print(coefs)

[0.12189313687857477]


Let's see if it makes sense. The coefficient is in minutes per mile. To convert it to minutes per km, divide by 1.60934. To convert that to hours per km, divide by 60. Then take the inverse to get km/h.

In [57]:
# Sanity-check the fitted slope by converting it to an average speed in km/h.
# The slope is in minutes per mile: dividing by 1.60934 gives minutes per km,
# dividing by 60 gives hours per km, and the reciprocal is km per hour.
# The slope is read from the fitted model instead of pasting its printed value,
# so the check stays correct if the model is refit.
1/(reg.coefficients[0] / 1.60934/60)

792.1725740489371

According to Google, this is a plausible average speed for a commercial flight, so the result makes sense.

- Add Origin

In [58]:
# Second model: distance plus the one-hot encoded origin airport.
assembler3 = VectorAssembler(
    outputCol='features',
    inputCols=['mile', 'org_dummy'],
)
df3_assambled = assembler3.transform(data_1hot)
# Same 80/20 split and seed as for the distance-only model.
df3_train, df3_test = df3_assambled.randomSplit([0.8, 0.2], seed=42)

In [59]:
# Fit duration ~ distance + origin; 'features' is the default features column, spelled out here for clarity.
reg3 = LinearRegression(
    featuresCol='features',
    labelCol='duration',
).fit(df3_train)

In [61]:
# Predict durations with the distance + origin model and compare them with the actual values.
pred_dur2 = reg3.transform(df3_test)
pred_dur2.select('duration', 'prediction').show(5, truncate=False)

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|245     |234.81889722676988|
|160     |150.3855457396908 |
|155     |154.0903953711782 |
|190     |193.88765109005902|
|255     |228.78494439310168|
+--------+------------------+
only showing top 5 rows



In [62]:
# Root mean squared error of the distance + origin model ('rmse' is the evaluator's default metric).
RegressionEvaluator(labelCol='duration', metricName='rmse').evaluate(pred_dur2)

11.124669149751762

coefficients

In [63]:
# Coefficients of the distance + origin model: the first is the slope for 'mile',
# the rest belong to the slots of the 'org_dummy' vector.
coefs = reg3.coefficients
print(coefs)

[0.11951127843507756,28.40886411142309,20.39563635811636,52.69779736409431,46.902867087296265,15.80421429487965,18.154485106121992,18.020384912431833]


To understand them, we need to see how the org column was encoded.

In [65]:
# List each origin airport with its index and its one-hot vector, ordered by index.
(
    data_1hot
    .select('org', 'org_idx', 'org_dummy')
    .distinct()
    .sort('org_idx')
    .show()
)

+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SMF|    4.0|(7,[4],[1.0])|
|SJC|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



So:
- 0: Miles
- 1: ORD
- 2: SFO
- 3: JFK
- 4: LGA
- 5: SMF
- 6: SJC
- 7: TUS

and the baseline is OGG.

In [66]:
# Intercept of the distance + origin model. OGG is the baseline origin (all-zero dummy vector),
# so this is interpreted as the average minutes on ground at OGG.
inter = reg3.intercept
print(inter)

15.880911559482685


In [67]:
# Average speed in km/h implied by the distance slope of the second model:
# minutes per mile -> minutes per km (/1.60934) -> hours per km (/60) -> reciprocal.
1/(reg3.coefficients[0] / 1.60934/60)

807.9605645960416

In [69]:
# Average minutes on ground at JFK: the baseline intercept plus the JFK dummy coefficient.
# Index 3 is JFK because slot 0 of the feature vector is 'mile' and JFK has org_idx 2.
reg3.intercept + reg3.coefficients[3]

68.578708923577

In [None]:
# Closing the connection once the work is done is good practice.
spark.stop()