In [1]:
from pyspark.sql import SparkSession

In [3]:
# Run Spark locally; 'local[*]' uses as many worker threads as there are logical cores.
spark = SparkSession.builder.master('local[*]').appName('test').getOrCreate()

In [4]:
print(spark.version)  # Spark version the session is running

3.0.0


## Load Data

In [6]:
# Load the flights CSV: the first row supplies column names, column types are
# inferred from the values, and the literal string 'NA' is read as null.
data = (
    spark.read
    .options(header=True, inferSchema=True, nullValue='NA')
    .csv('flights.csv')
)

In [8]:
data.count()  # number of records (rows) loaded from flights.csv

275000

In [10]:
data.dtypes  # (column name, type) pairs; types were inferred while reading the CSV

[('mon', 'int'),
 ('dom', 'int'),
 ('dow', 'int'),
 ('carrier', 'string'),
 ('flight', 'int'),
 ('org', 'string'),
 ('mile', 'int'),
 ('depart', 'double'),
 ('duration', 'int'),
 ('delay', 'int')]

In [11]:
data.show(8)  # preview the first 8 rows as a text table

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 10| 10|  1|     OO|  5836|ORD| 157|  8.18|      51|   27|
|  1|  4|  1|     OO|  5866|ORD| 466|  15.5|     102| null|
| 11| 22|  1|     OO|  6016|ORD| 738|  7.17|     127|  -19|
|  2| 14|  5|     B6|   199|JFK|2248| 21.17|     365|   60|
|  5| 25|  3|     WN|  1675|SJC| 386| 12.92|      85|   22|
|  3| 28|  1|     B6|   377|LGA|1076| 13.33|     182|   70|
|  5| 28|  6|     B6|   904|ORD| 740|  9.58|     130|   47|
|  1| 19|  2|     UA|   820|SFO| 679| 12.75|     123|  135|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 8 rows



Data dictionary:
- mon — month (integer between 1 and 12)
- dom — day of month (integer between 1 and 31)
- dow — day of week (integer; 1 = Monday and 7 = Sunday)
- org — origin airport (IATA code)
- mile — distance (miles)
- carrier — carrier (IATA code)
- depart — departure time (decimal hour)
- duration — expected duration (minutes)
- delay — delay (minutes)

## Data Wrangling

### Drop columns and nulls

We want to predict delay, so let's drop the `flight` column because it is uninformative.

In [14]:
# Keep every column except 'flight' in a new DataFrame.
data2 = data.drop('flight')

We want only the records with no null entries in the delay column

In [20]:
# How many records are missing a delay value?
data2.where(data2['delay'].isNull()).count()

16711

In [21]:
# Keep only the records that have a delay value.
data2 = data2.where(data2['delay'].isNotNull())

In [22]:
data2.count()  # rows remaining after removing records with a null delay

258289

In [33]:
# Row count after discarding any row that has a null in any column.
data2.na.drop().count()

258289

The row count is unchanged after dropping rows with nulls, so there are no more nulls in the data.

### Create target column

The Federal Aviation Administration (FAA) considers a flight to be "delayed" when it arrives 15 minutes or more after its scheduled time.

Let's create a binary (0/1) column indicating whether or not a flight was delayed.

In [35]:
# Flag each flight: 1 when the delay is 15 minutes or more, otherwise 0.
data2 = data2.withColumn(
    'delayed',
    (data2['delay'] >= 15).cast('integer'),  # comparison result stored as 1/0
)

In [36]:
data2.show(5)  # preview the first 5 rows, including the 'delayed' column

+---+---+---+-------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|delayed|
+---+---+---+-------+---+----+------+--------+-----+-------+
| 10| 10|  1|     OO|ORD| 157|  8.18|      51|   27|      1|
| 11| 22|  1|     OO|ORD| 738|  7.17|     127|  -19|      0|
|  2| 14|  5|     B6|JFK|2248| 21.17|     365|   60|      1|
|  5| 25|  3|     WN|SJC| 386| 12.92|      85|   22|      1|
|  3| 28|  1|     B6|LGA|1076| 13.33|     182|   70|      1|
+---+---+---+-------+---+----+------+--------+-----+-------+
only showing top 5 rows



### Categorical columns

In [None]:
spark.stop()  # stopping the Spark session when finished is good practice