In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("BigDataProject").getOrCreate()


def _load_csv(path):
    """Read a CSV with a header row into a DataFrame, inferring column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


# One DataFrame per yearly file (2009.csv ... 2018.csv).
df_09 = _load_csv("2009.csv")
df_10 = _load_csv("2010.csv")
df_11 = _load_csv("2011.csv")
df_12 = _load_csv("2012.csv")
df_13 = _load_csv("2013.csv")
df_14 = _load_csv("2014.csv")
df_15 = _load_csv("2015.csv")
df_16 = _load_csv("2016.csv")
df_17 = _load_csv("2017.csv")
df_18 = _load_csv("2018.csv")

# Separate flights file, compared against df_15 further down.
df_flights = _load_csv("flights.csv")

In [2]:
# Show the column names of flights.csv and of the 2009 file, one list per line,
# separated by a dashed rule, so the two layouts can be compared by eye.
print(df_flights.columns, "----------------", df_09.columns, sep="\n")

['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
----------------
['FL_DATE', 'OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME', 'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'Unnamed: 27']


In [3]:
# Candidate key for df_flights: if these columns identify a row uniquely, the
# number of distinct key tuples equals the total row count.
key_cols = [
    "YEAR", "MONTH", "DAY", "FLIGHT_NUMBER", "AIRLINE",
    "DISTANCE", "WHEELS_ON", "WHEELS_OFF", "TAXI_IN", "TAXI_OUT",
    "SCHEDULED_DEPARTURE",
]

total_rows_in_flight = df_flights.count()
distinct_key_count = df_flights.select(*key_cols).distinct().count()

print("Total rows:", total_rows_in_flight)
print("Distinct rows with key:", distinct_key_count)

Total rows: 5819079
Distinct rows with key: 5819079


In [4]:
# Same uniqueness check for the 2015 yearly file, using its own column names.
key_cols = [
    "FL_DATE", "OP_CARRIER_FL_NUM", "OP_CARRIER",
    "DISTANCE", "WHEELS_ON", "WHEELS_OFF", "TAXI_IN", "TAXI_OUT",
    "CRS_DEP_TIME",
]

total_rows = df_15.count()
distinct_key_count = df_15.select(*key_cols).distinct().count()

print("Total rows:", total_rows)
print("Distinct rows with key:", distinct_key_count)

Total rows: 5819079
Distinct rows with key: 5819079


In [5]:
from pyspark.sql.functions import concat_ws, lpad, col

# Build a string FL_DATE column ("YEAR-MM-DD") on top of df_flights:
#   1. left-pad MONTH and DAY with "0" to two characters (MONTH_PAD, DAY_PAD);
#   2. join YEAR, MONTH_PAD and DAY_PAD with "-".
# The two helper columns remain in the resulting frame.
df_flight_new = (
    df_flights
    .withColumn("MONTH_PAD", lpad(col("MONTH").cast("string"), 2, "0"))
    .withColumn("DAY_PAD", lpad(col("DAY").cast("string"), 2, "0"))
    .withColumn(
        "FL_DATE",
        concat_ws("-", *[col(part) for part in ("YEAR", "MONTH_PAD", "DAY_PAD")]),
    )
)

In [6]:
# Rename the flights.csv columns to the names used by the yearly files so the
# two frames can be compared on identical column names.
column_renames = {
    "AIRLINE": "OP_CARRIER",
    "SCHEDULED_DEPARTURE": "CRS_DEP_TIME",
    "FLIGHT_NUMBER": "OP_CARRIER_FL_NUM",
}
for old_name, new_name in column_renames.items():
    df_flight_new = df_flight_new.withColumnRenamed(old_name, new_name)

In [7]:
# Key columns present (under the same names) in both frames.
key_cols = [
    "FL_DATE", "OP_CARRIER_FL_NUM", "OP_CARRIER",
    "DISTANCE", "WHEELS_ON", "WHEELS_OFF", "TAXI_IN", "TAXI_OUT",
    "CRS_DEP_TIME",
]

# Distinct key tuples from each side.
df1_keys = df_flight_new.select(*key_cols).distinct()
df2_keys = df_15.select(*key_cols).distinct()

# Key tuples that appear on one side but not the other.
only_in_df1 = df1_keys.subtract(df2_keys)
only_in_df2 = df2_keys.subtract(df1_keys)

count1 = only_in_df1.count()
count2 = only_in_df2.count()

# Any non-zero count means the key sets differ.
if count1 or count2:
    print("❌ There are differences!")
    print(f"Rows only in df_flight_new: {count1}")
    print(f"Rows only in df_15: {count2}")
else:
    print("✅ The datasets are 100% identical based on given key columns")

✅ The datasets are 100% identical based on given key columns


In [8]:
# Stack the ten yearly frames into a single DataFrame.
# unionByName aligns columns by header name. Plain union() aligns them by
# position, so a yearly file with reordered columns would be merged into the
# wrong columns without any error. With identical headers the result is the same.
yearly_frames = [df_09, df_10, df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18]

df_all = yearly_frames[0]
for yearly_frame in yearly_frames[1:]:
    df_all = df_all.unionByName(yearly_frame)

In [9]:
# Write the combined frame as CSV with a header row. Spark creates a directory
# named 'all_flights.csv' holding part files rather than a single file.
# mode("overwrite") makes the cell re-runnable: with the default save mode the
# write raises once the output path exists, which breaks Restart & Run All.
df_all.write.mode("overwrite").option("header", True).csv('all_flights.csv')