In [1]:
import os
import pandas as pd
import numpy as np

# Spark entry points used by this cell
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Create (or reuse) the Spark context.
# getOrCreate() keeps this cell re-runnable: calling SparkContext(conf=conf)
# a second time in the same kernel raises because only one context may be active.
conf = SparkConf().setAppName('dj').setMaster('local[4]')
sc = SparkContext.getOrCreate(conf=conf)

# Create (or reuse) the Spark session on top of that context
spark = SparkSession.builder.appName('dj').getOrCreate()

# Display the id of the running Spark application
sc.applicationId

'local-1698603637136'

In [2]:
# Import the pyspark.sql.types library
from pyspark.sql.types import *
from pyspark.sql import functions as F

# Schema for people records: every field is declared non-nullable (nullable=False)
people_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ('name', StringType()),
        ('age', IntegerType()),
        ('city', StringType()),
    )
])

In [3]:
# Load the 2017 departures CSV (first row is the header), derive a lower-cased
# 'airport' column from 'Destination Airport', then drop the original column.
aa_dfw_df = (
    spark.read
    .format('csv')
    .options(Header=True)
    .load('./AA_DFW_2017_Departures_Short.csv.gz')
    .withColumn('airport', F.lower(F.col('Destination Airport')))
    .drop('Destination Airport')
)

# Preview the first five rows without truncating cell contents
aa_dfw_df.show(n=5, truncate=False)

+-----------------+-------------+-----------------------------+-------+
|Date (MM/DD/YYYY)|Flight Number|Actual elapsed time (Minutes)|airport|
+-----------------+-------------+-----------------------------+-------+
|01/01/2017       |0005         |537                          |hnl    |
|01/01/2017       |0007         |498                          |ogg    |
|01/01/2017       |0037         |241                          |sfo    |
|01/01/2017       |0043         |134                          |dtw    |
|01/01/2017       |0051         |88                           |stl    |
+-----------------+-------------+-----------------------------+-------+
only showing top 5 rows



In [4]:
# Load the 2017 and 2016 departures files with identical reader settings
# (CSV format, first row treated as the header).
df1, df2 = (
    spark.read.format('csv').options(Header=True).load(csv_path)
    for csv_path in (
        './AA_DFW_2017_Departures_Short.csv.gz',
        './AA_DFW_2016_Departures_Short.csv.gz',
    )
)

In [5]:
# Combine both DataFrames into one
df3 = df1.union(df2)

# Report the row count of each input DataFrame
print("df1 Count: %d" % (df1.count(),))
print("df2 Count: %d" % (df2.count(),))



df1 Count: 139358
df2 Count: 140604


In [6]:
# Preview the first five rows of the combined DataFrame, untruncated
df3.show(n=5, truncate=False)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|01/01/2017       |0005         |HNL                |537                          |
|01/01/2017       |0007         |OGG                |498                          |
|01/01/2017       |0037         |SFO                |241                          |
|01/01/2017       |0043         |DTW                |134                          |
|01/01/2017       |0051         |STL                |88                           |
+-----------------+-------------+-------------------+-----------------------------+
only showing top 5 rows



In [7]:
# Save the df3 DataFrame in Parquet format.
# mode('overwrite') replaces output left by a previous run; the writer's default
# mode raises when the target path already exists, which breaks re-running the notebook.
df3.write.format('parquet').mode('overwrite').save('tmp_AA_DFW_ALL.parquet')

# Read the Parquet file into a new DataFrame and run a count
print(spark.read.parquet('tmp_AA_DFW_ALL.parquet').count())

279962


In [8]:
# Give the elapsed-time column a short name that is easy to reference from SQL
df3 = df3.withColumnRenamed(
    existing='Actual elapsed time (Minutes)',
    new='flight_duration',
)

# Inspect the resulting (column name, type) pairs
df3.dtypes

[('Date (MM/DD/YYYY)', 'string'),
 ('Flight Number', 'string'),
 ('Destination Airport', 'string'),
 ('flight_duration', 'string')]

In [9]:
# Read the Parquet file into flights_df.
# The file was written before the elapsed-time column was renamed, so the same
# rename is applied here; this lets the query below find 'flight_duration'
# without falling back to the in-memory df3.
flights_df = (
    spark.read.parquet('tmp_AA_DFW_ALL.parquet')
    .withColumnRenamed('Actual elapsed time (Minutes)', 'flight_duration')
)

# Register the temp table
flights_df.createOrReplaceTempView('flights')

# Run a SQL query of the average flight duration.
# collect()[0] is the single result Row of the aggregate query.
avg_duration = spark.sql('SELECT avg(flight_duration) from flights').collect()[0]

# Index the Row explicitly to get the scalar; %d truncates it to a whole number
print('The average flight time is: %d' % avg_duration[0])

The average flight time is: 151


In [10]:
# Average flight duration as a bare scalar: first column of the first result row
avg_query = 'SELECT avg(flight_duration) from flights'
spark.sql(avg_query).collect()[0][0]

151.60996492381108