# Set Up

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=e412eaf595a7b7ffe70d07ebf12119170a42d53439efa593f33ddea9f6e2b199
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

In [None]:
# Build (or fetch the already-running) SparkSession in local mode
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Flight Data Analysis')
    .getOrCreate()
)

# Read Data

In [None]:
# Load the flight records from CSV: the first row is the header, column types
# are inferred, and the literal string 'NA' is read as a missing value
csv_reader = spark.read.options(sep=',', header=True, inferSchema=True, nullValue='NA')
flights = csv_reader.csv('flights.csv')

In [None]:
# Report how many rows were loaded
n_records = flights.count()
print("The data contain %d records." % n_records)

The data contain 50000 records.


In [None]:
# Preview the first five rows of the loaded data
flights.show(n=5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| NULL|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| NULL|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [None]:
# Show each column's name, inferred type and nullability
flights.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: integer (nullable = true)



In [None]:
# Display summary statistics (count, mean, stddev, ...) for every column
summary = flights.describe()
summary.show()

+-------+-----------------+-----------------+-----------------+-------+------------------+-----+----------------+------------------+-----------------+------------------+
|summary|              mon|              dom|              dow|carrier|            flight|  org|            mile|            depart|         duration|             delay|
+-------+-----------------+-----------------+-----------------+-------+------------------+-----+----------------+------------------+-----------------+------------------+
|  count|            50000|            50000|            50000|  50000|             50000|50000|           50000|             50000|            50000|             47022|
|   mean|           5.2351|         15.66196|          2.95236|   NULL|        2054.31344| NULL|       882.40112|14.130952600000064|        151.76582|28.663795670111863|
| stddev|3.437758623534696|8.772488135606777|1.966033503314405|   NULL|2182.4715300582875| NULL|701.232785607705| 4.694052286573998|87.04507290261697|

# Clean Data

In [None]:
# Remove the 'flight' column
# NOTE(review): presumably dropped because the flight number is an identifier
# rather than a useful predictor — confirm.
flights = flights.drop('flight')

In [None]:
# Keep only the rows whose 'delay' value is present
flights = flights.filter(flights.delay.isNotNull())

In [None]:
# Discard every row that still has a null in any column, then report the size
flights = flights.na.drop()
n_clean = flights.count()
print("After cleaning, the data contain %d records." % n_clean)

After cleaning, the data contain 47022 records.


# Feature Engineering

In [None]:
# Add the distance in whole kilometres (miles * 1.60934), then discard 'mile'.
# `round` here is pyspark.sql.functions.round, not the Python builtin.
distance_km = round(flights['mile'] * 1.60934, scale=0)
flights = flights.withColumn('km', distance_km).drop('mile')

In [None]:
# Binary target: 1 when 'delay' is at least 15, otherwise 0
is_delayed = flights['delay'] >= 15
flights = flights.withColumn('label', is_delayed.cast('integer'))

In [None]:
# Map the string columns 'carrier' and 'org' to numeric index columns.
# Declare both indexers first, fit each one, then apply them in sequence so
# that 'carrier_idx' is appended before 'org_idx'.
indexer_carrier = StringIndexer(inputCol='carrier', outputCol='carrier_idx')
indexer_org = StringIndexer(inputCol='org', outputCol='org_idx')

indexer_model_carrier = indexer_carrier.fit(flights)
indexer_model_org = indexer_org.fit(flights)

flights = indexer_model_org.transform(indexer_model_carrier.transform(flights))

# Prepare for Model

In [None]:
# Randomly partition the rows 80/20 into training and testing sets (fixed seed)
flights_train, flights_test = flights.randomSplit(weights=[0.8, 0.2], seed=17)

In [None]:
# Sanity-check the split: share of all rows that landed in the training set
n_train = flights_train.count()
n_total = flights.count()
training_ratio = n_train / n_total
print("Training set contains approximately %.2f%% of the records." % (training_ratio * 100))

In [None]:
# Gather the predictor columns into a single vector column named 'features'
feature_cols = ['mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Apply the same assembler to both partitions
flights_train, flights_test = [
    assembler.transform(part) for part in (flights_train, flights_test)
]

# Decision Tree Model

In [None]:
# Fit a decision tree classifier to the training set
tree = DecisionTreeClassifier(labelCol='label', featuresCol='features')
tree_model = tree.fit(flights_train)

In [None]:
# Score the testing set with the fitted tree and inspect a few results
predictions = tree_model.transform(flights_test)
preview = predictions.select('label', 'prediction', 'probability')
preview.show(5, truncate=False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |0.0       |[0.5297666934835077,0.47023330651649237]|
|0    |1.0       |[0.35275502606105735,0.6472449739389426]|
|0    |0.0       |[0.689616672869371,0.31038332713062894] |
|1    |1.0       |[0.35275502606105735,0.6472449739389426]|
|1    |1.0       |[0.35275502606105735,0.6472449739389426]|
+-----+----------+----------------------------------------+
only showing top 5 rows



In [None]:
# Evaluate the Decision Tree

# Confusion matrix: one row per (label, prediction) combination
predictions.groupBy('label', 'prediction').count().show()

# Count each cell of the confusion matrix (class 1 is the positive class)
predicted_neg = predictions['prediction'] == 0
predicted_pos = predictions['prediction'] == 1
is_correct = predictions['label'] == predictions['prediction']
is_wrong = predictions['label'] != predictions['prediction']

TN = predictions.filter(predicted_neg & is_correct).count()
TP = predictions.filter(predicted_pos & is_correct).count()
FN = predictions.filter(predicted_neg & is_wrong).count()
FP = predictions.filter(predicted_pos & is_wrong).count()

# Accuracy: correct predictions as a fraction of all predictions
n_correct = TN + TP
accuracy = n_correct / (n_correct + FN + FP)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1611|
|    0|       0.0| 2735|
|    1|       1.0| 3299|
|    0|       1.0| 1902|
+-----+----------+-----+

0.6320310045040327


# Logistic Regression Model

In [None]:
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression classifier on the training set, naming the
# feature and label columns explicitly (these are also the defaults)
logistic = LogisticRegression(featuresCol='features', labelCol='label').fit(flights_train)

# Score the testing set and display the confusion matrix
prediction = logistic.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1671|
|    0|       0.0| 2522|
|    1|       1.0| 3239|
|    0|       1.0| 2115|
+-----+----------+-----+



In [None]:
# Evaluate the Logistic Regression model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Count the confusion-matrix cells from the logistic regression output
# (`prediction`). The TP/FP/FN variables defined earlier were counted from the
# decision tree's `predictions`, so reusing them here would report the tree's
# metrics. Separate lr_* names keep the two models' counts apart.
lr_TP = prediction.filter('prediction = 1 AND label = prediction').count()
lr_FP = prediction.filter('prediction = 1 AND label != prediction').count()
lr_FN = prediction.filter('prediction = 0 AND label != prediction').count()

# Calculate precision and recall for the positive class (label 1)
precision = lr_TP / (lr_TP + lr_FP)
recall = lr_TP / (lr_TP + lr_FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName: "areaUnderROC"})

# Display the evaluator metrics so they are not computed and then discarded
print('weighted precision = {:.2f}\nAUC                = {:.2f}'.format(weighted_precision, auc))

precision = 0.63
recall    = 0.67
