In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import ttest_ind, chi2_contingency, normaltest

2018 Data

In [None]:
# Step 1: Load the Dataset from Parquet File
# Mounts Google Drive and reads the 2018 flights parquet file into
# `flight_data`, the DataFrame that the later cells clean and analyse.

# NOTE(review): this import only resolves inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# Hard-coded location of the dataset on the mounted Drive.
file_path = "/content/drive/My Drive/DSC 288R/Combined_Flights_2018.parquet"

flight_data = pd.read_parquet(file_path)
# Preview the first rows as a quick sanity check of the load.
print(flight_data.head())

Data Cleaning

In [None]:
# Count the null entries of every column and report the totals
missing_values = flight_data.isna().sum()
print("Missing Values Count:\n", missing_values)

In [None]:
# Fill Missing Values Properly
# Object (categorical) columns are filled with their most frequent value and
# every other column with its median. Columns with nothing missing are skipped,
# and columns that are entirely missing are left untouched because they have no
# mode/median to fill with (`mode()[0]` would raise on them).
for col in flight_data.columns:
    column_values = flight_data[col]
    if not column_values.isna().any():
        continue  # Nothing to fill; avoids computing a mode/median for no reason
    if column_values.notna().sum() == 0:
        continue  # All values missing: no statistic exists to impute from
    if column_values.dtype == "object":  # Categorical Columns
        fill_value = column_values.mode()[0]
    else:  # Numerical Columns
        fill_value = column_values.median()
    flight_data.loc[:, col] = column_values.fillna(fill_value)

In [None]:
# Detect and Handle Outliers
def cap_outliers(df, column, lower_pct=1, upper_pct=99):
    """
    Cap the extreme values of ``df[column]`` in place at two percentiles.

    Values below the ``lower_pct`` percentile are raised to that percentile and
    values above the ``upper_pct`` percentile are lowered to it. The percentiles
    are computed while ignoring NaN, so missing entries neither disable the
    capping nor get modified themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is overwritten with the capped values.
    column : str
        Name of a numeric column in ``df``.
    lower_pct, upper_pct : float, optional
        Percentiles (0-100) used as the lower and upper caps. They default to
        1 and 99.

    Returns
    -------
    None
        The frame is modified in place.
    """
    values = df[column]
    if values.notna().sum() == 0:
        # Empty or all-NaN column: there are no percentiles to cap at.
        return
    lower_limit, upper_limit = np.nanpercentile(values, [lower_pct, upper_pct])
    # A single clip applies both bounds and leaves NaN entries as NaN.
    df[column] = values.clip(lower=lower_limit, upper=upper_limit)

# Apply outlier capping to numerical columns only
# Only columns whose dtype is exactly int64 or float64 are selected; each one is
# capped in place by cap_outliers.
# NOTE(review): this caps every such column, which presumably includes calendar
# fields, scheduled times and identifier-like codes as well as the delay and
# distance measures — confirm that this is intended.
numerical_cols = flight_data.select_dtypes(include=["int64", "float64"]).columns
for col in numerical_cols:
    cap_outliers(flight_data, col)

In [None]:
# Remove Duplicates
# Row count before de-duplication, kept for the comparison displayed below.
before_duplicates = flight_data.shape[0]
# Drops fully identical rows, modifying flight_data in place.
flight_data.drop_duplicates(inplace=True)
after_duplicates = flight_data.shape[0]

# Cell output: (rows before, rows after); the difference is the number removed.
before_duplicates, after_duplicates

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() also echoed a stray "None".
flight_data.info()

In [None]:
# Summary statistics of the columns after the cleaning steps above.
print(flight_data.describe())

Data Formatting

In [None]:
# Convert Columns to Appropriate Data Types
# Best-effort conversion of object columns that actually hold numbers; columns
# that cannot be converted are left exactly as they are.
for col in flight_data.columns:
    if flight_data[col].dtype == "object":  # Check if column is string-based
        try:
            flight_data[col] = pd.to_numeric(flight_data[col])  # Convert to numeric
        except (ValueError, TypeError):
            # ValueError: text that is not a number (e.g. airline codes).
            # TypeError: non-string objects that to_numeric cannot interpret.
            pass  # Skip the column and keep its original values

In [None]:
# Standardize Date Formatting
# Columns listed here are converted to pandas datetimes.
date_columns = ["FlightDate"]  # Names of the date columns to convert

for col in date_columns:
    # errors="coerce" turns values that cannot be parsed into NaT instead of raising
    flight_data[col] = pd.to_datetime(flight_data[col], errors="coerce")  # Convert to datetime format

In [None]:
# Show the dtype of every column to check the conversions made above.
print(flight_data.dtypes)

Data Aggregation

In [None]:
# Overall descriptive statistics of the data set
summary_statistics = flight_data.describe()

# Per-airline, per-month aggregates of the delay and distance columns
grouped_data = (
    flight_data
    .groupby(["Marketing_Airline_Network", "Month"])
    .agg({
        "DepDelayMinutes": ["mean", "median", "sum"],
        "ArrDelayMinutes": ["mean", "median", "sum"],
        "Distance": ["mean", "sum"],
    })
    .reset_index()
)

# Cell output: first rows of the grouped table
grouped_data.head()

Data Discretization

In [None]:
# Binning Departure Delay into Categories: On-time, Moderate Delay, Severe Delay
# Intervals are right-closed: (-10, 0] -> On-time, (0, 15] -> Moderate Delay,
# (15, inf) -> Severe Delay. The top edge is open-ended so that a delay above any
# fixed cut-off is still labelled instead of silently becoming NaN.
flight_data["DepDelayCategory"] = pd.cut(
    flight_data["DepDelayMinutes"],
    bins=[-10, 0, 15, np.inf],  # Define custom bins
    labels=["On-time", "Moderate Delay", "Severe Delay"]
)

# Binning Distance into Short, Medium, and Long Flights
# (0, 500] -> Short, (500, 1500] -> Medium, (1500, inf) -> Long; the open top
# edge keeps distances over the former 3000 limit in the "Long" category.
flight_data["DistanceCategory"] = pd.cut(
    flight_data["Distance"],
    bins=[0, 500, 1500, np.inf],
    labels=["Short", "Medium", "Long"]
)

Data Enrichment

In [None]:
# Creating Time of Day Feature from Scheduled Departure Time
def get_time_of_day(dep_time):
    """
    Map a numeric departure time to a coarse time-of-day label.

    Parameters
    ----------
    dep_time : int or float
        Departure time compared against the thresholds 600, 1200 and 1800
        (presumably an HHMM-encoded clock time — confirm against the data).

    Returns
    -------
    str or float
        "Early Morning" below 600, "Morning" below 1200, "Afternoon" below
        1800 and "Evening" otherwise. A missing input (None/NaN) returns NaN,
        because NaN fails every ``<`` comparison and would otherwise be
        labelled "Evening".
    """
    if pd.isna(dep_time):
        return np.nan
    if dep_time < 600:
        return "Early Morning"
    elif dep_time < 1200:
        return "Morning"
    elif dep_time < 1800:
        return "Afternoon"
    else:
        return "Evening"

# Label every flight by mapping its scheduled departure time through get_time_of_day
flight_data["TimeOfDay"] = flight_data["CRSDepTime"].map(get_time_of_day)

EDA

Descriptive Statistics

In [None]:
# Descriptive statistics of the cleaned data as the starting point of the EDA.
print(flight_data.describe())

In [None]:
# Distribution of Departure Delays
# 50-bin histogram with a KDE overlay, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(flight_data["DepDelayMinutes"], bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Departure Delays")
ax.set_xlabel("Minutes Delayed")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Distribution of Flight Distance
# 10-bin histogram with a KDE overlay, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(flight_data["Distance"], bins=10, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Flight Distance")
ax.set_xlabel("Distance")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Boxplot for Outlier Detection
# Horizontal boxplot of departure delays, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=flight_data["DepDelayMinutes"], color="red", ax=ax)
ax.set_title("Boxplot of Departure Delays")
ax.set_xlabel("Minutes Delayed")
plt.show()

Bivariate Analysis

In [None]:
# Correlation Heatmap
# Pairwise correlations between the delay, distance and air-time columns
corr_matrix = flight_data[["DepDelayMinutes", "ArrDelayMinutes", "Distance", "AirTime"]].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Scatterplot with Trendline
# LOWESS smoothing and one scatter marker per row become very expensive on a
# large frame (presumably millions of flights here), so the figure is drawn from
# a reproducible random sample instead of the whole data set.
LOWESS_SAMPLE_SIZE = 10_000
scatter_sample = flight_data[["DepDelayMinutes", "ArrDelayMinutes"]]
if len(scatter_sample) > LOWESS_SAMPLE_SIZE:
    # Fixed seed so that re-running the notebook reproduces the same figure
    scatter_sample = scatter_sample.sample(n=LOWESS_SAMPLE_SIZE, random_state=42)

plt.figure(figsize=(10, 6))

# Scatterplot with Transparency and Trendline
sns.regplot(
    x=scatter_sample["DepDelayMinutes"],
    y=scatter_sample["ArrDelayMinutes"],
    scatter_kws={"alpha": 0.3},  # Adjust transparency to reduce clutter
    line_kws={"color": "red"},  # Trendline color
    lowess=True  # Smooth trendline for better pattern visualization
)

plt.title("Departure Delay vs Arrival Delay", fontsize=14)
plt.xlabel("Departure Delay (Minutes)", fontsize=12)
plt.ylabel("Arrival Delay (Minutes)", fontsize=12)
plt.grid(True, linestyle="--", alpha=0.7)

plt.show()

Inferential Statistics

In [None]:
# Apply categorization
flight_data["TimeOfDay"] = flight_data["CRSDepTime"].apply(get_time_of_day)

# Departure delays of the two groups being compared: Morning vs Evening flights
is_morning = flight_data["TimeOfDay"] == "Morning"
is_evening = flight_data["TimeOfDay"] == "Evening"
morning_delays = flight_data.loc[is_morning, "DepDelayMinutes"]
evening_delays = flight_data.loc[is_evening, "DepDelayMinutes"]

# Two-sample t-test that does not assume equal variances
t_stat, p_value = ttest_ind(morning_delays, evening_delays, equal_var=False)

# Report the test statistic and p-value
print("\n✅ T-Test: Morning vs Evening Delays")
print(f"t-statistic: {t_stat:.3f}, p-value: {p_value:.5f}")

# Interpretation at the 5% significance level
print(
    "Statistically significant difference in delays between morning and evening flights."
    if p_value < 0.05
    else "No significant difference in delays between morning and evening flights."
)

In [None]:
# Apply categorization
flight_data["TimeOfDay"] = flight_data["CRSDepTime"].apply(get_time_of_day)

# Flag flights delayed by more than 60 minutes (1 = severe delay, 0 = otherwise)
flight_data["SevereDelay"] = flight_data["DepDelayMinutes"].gt(60).astype(int)

# Cross-tabulate the time of day against the severe-delay flag
contingency_table = pd.crosstab(flight_data["TimeOfDay"], flight_data["SevereDelay"])

# Chi-square test on the contingency table
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

# Report the test results
print("\n✅ Chi-Square Test: Time of Day vs Severe Delays")
print(f"Chi-Square Statistic: {chi2_stat:.3f}")
print(f"Degrees of Freedom: {dof}")
print(f"P-Value: {p_value:.5f}")

# Interpretation at the 5% significance level
print(
    "Statistically significant relationship between time of day and severe delays."
    if p_value < 0.05
    else "No significant relationship between time of day and severe delays."
)

# Show the observed counts behind the test
print("\n✅ Contingency Table:\n", contingency_table)


Feature Importance

In [None]:
# Predictor columns and the binary target used for the feature-importance model
features = ["Month", "DayofMonth", "DayOfWeek", "CRSDepTime", "Distance"]
target = "SevereDelay"

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    flight_data[features],
    flight_data[target],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train RandomForest to Find Feature Importance
# 100 trees with a fixed random_state; the model is fitted on the training split
# and kept in `rf_model`, whose feature_importances_ are read afterwards.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Feature Importance Visualization
# Pair each feature name with its importance score, most important first
feature_importance = (
    pd.DataFrame({"Feature": features, "Importance": rf_model.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)

In [None]:
# Horizontal bar chart of the importance scores, one bar per feature.
plt.figure(figsize=(10, 6))
# NOTE(review): recent seaborn releases reportedly deprecate passing `palette`
# without `hue` — check whether this call emits a FutureWarning in your version.
sns.barplot(x="Importance", y="Feature", data=feature_importance, palette="viridis")
plt.title("Feature Importance for Predicting Severe Delays")
plt.xlabel("Importance Score")
plt.ylabel("Feature")
plt.show()

Time Series Analysis

In [None]:
# Make sure FlightDate is a datetime column before grouping on it. The call
# mirrors the earlier "Standardize Date Formatting" conversion: no fixed format
# string is forced (a hard-coded "%Y%m%d" would reject ISO-style dates such as
# 2018-01-23), and values that are already datetimes pass through unchanged.
flight_data["FlightDate"] = pd.to_datetime(flight_data["FlightDate"], errors="coerce")

# Average departure and arrival delay for every calendar day
daily_delays = flight_data.groupby("FlightDate")[["DepDelayMinutes", "ArrDelayMinutes"]].mean().reset_index()

In [None]:
# Daily average departure and arrival delays, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(daily_delays["FlightDate"], daily_delays["DepDelayMinutes"], label="Average Departure Delay", marker="o")
ax.plot(daily_delays["FlightDate"], daily_delays["ArrDelayMinutes"], label="Average Arrival Delay", marker="x")
ax.set_title("Time-Series of Average Flight Delays (Daily)")
ax.set_xlabel("Date")
ax.set_ylabel("Average Delay")
ax.legend()
ax.grid(True)
plt.show()