In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession

# Get (or create) the SparkSession: local master, application name 'test'
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

In [3]:
# Load the flights CSV: comma-separated, first row is the header,
# column types are inferred, and the string 'NA' is read as null
flights = spark.read.csv(
    'flights-larger.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA',
)


In [4]:
# Remove the 'flight' column. The result is rebound to `flights`, so every
# later cell works with the DataFrame that no longer has this column.
flights = flights.drop('flight')

In [5]:
# Count the records whose 'delay' value is missing (null)
missing_delay = flights.filter('delay IS NULL')
missing_delay.count()

16711

In [6]:
# Remove records with missing 'delay' values: keep only the rows where
# 'delay' is non-null, rebinding the result to `flights`.
flights = flights.filter('delay IS NOT NULL')

In [7]:
# Drop every record that has a missing value in any column,
# then report how many rows are left
flights = flights.dropna()
remaining_rows = flights.count()
print(remaining_rows)

258289


In [10]:
# Preview the first 5 rows of the cleaned flights data
flights.show(5)

+---+---+---+-------+---+----+------+--------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|
+---+---+---+-------+---+----+------+--------+-----+
| 10| 10|  1|     OO|ORD| 157|  8.18|      51|   27|
| 11| 22|  1|     OO|ORD| 738|  7.17|     127|  -19|
|  2| 14|  5|     B6|JFK|2248| 21.17|     365|   60|
|  5| 25|  3|     WN|SJC| 386| 12.92|      85|   22|
|  3| 28|  1|     B6|LGA|1076| 13.33|     182|   70|
+---+---+---+-------+---+----+------+--------+-----+
only showing top 5 rows



In [8]:
# Import the required function
from pyspark.sql.functions import round

In [11]:
# Convert 'mile' to 'km' (rounded to 0 decimal places) and drop the 'mile' column.
# NOTE: `round` here is the Spark column function imported from
# pyspark.sql.functions, not the Python builtin.
KM_PER_MILE = 1.60934  # kilometres in one mile
flights_km = (
    flights
    .withColumn('km', round(flights['mile'] * KM_PER_MILE, 0))
    # Use the column's exact name. 'Mile' only matched because Spark resolves
    # column names case-insensitively by default; with
    # spark.sql.caseSensitive=true the drop would silently do nothing.
    .drop('mile')
)

In [12]:
# Preview the first 5 rows to confirm 'km' was added and 'mile' was removed
flights_km.show(5)

+---+---+---+-------+---+------+--------+-----+------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|
+---+---+---+-------+---+------+--------+-----+------+
| 10| 10|  1|     OO|ORD|  8.18|      51|   27| 253.0|
| 11| 22|  1|     OO|ORD|  7.17|     127|  -19|1188.0|
|  2| 14|  5|     B6|JFK| 21.17|     365|   60|3618.0|
|  5| 25|  3|     WN|SJC| 12.92|      85|   22| 621.0|
|  3| 28|  1|     B6|LGA| 13.33|     182|   70|1732.0|
+---+---+---+-------+---+------+--------+-----+------+
only showing top 5 rows

