In [1]:
# ============================================================
# CELL 1: IMPORT LIBRARIES AND INITIALIZE SPARK
# ============================================================

# Import PySpark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, NaiveBayes, MultilayerPerceptronClassifier, LinearSVC
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor, DecisionTreeRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
import time
import pandas as pd

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("AviationTrendAnalysis") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

print("Spark session initialized successfully.")

Spark session initialized successfully.


In [2]:
# ============================================================
# CELL 2: LOAD DATA FROM CSV FILE
# ============================================================

# Load aviation data from CSV file
file_path = "../data/US Airline Flight Routes and Fares 1993-2024.csv"

df = spark.read \
    .option("header", "true") \
    .option("multiLine", "true") \
    .option("inferSchema", "true") \
    .option("quote", '"') \
    .option("escape", '"') \
    .csv(file_path)

print(f"Data loaded successfully: {df.count():,} rows, {len(df.columns)} columns")


Data loaded successfully: 245,955 rows, 23 columns


In [3]:
# ============================================================
# CELL 3: DATA CLEANING - WHITESPACE HANDLING
# ============================================================

# Import functions for data cleaning
from pyspark.sql.functions import col, trim, regexp_replace

# Identify string columns
string_columns = [field.name for field in df.schema.fields if field.dataType.typeName() == 'string']

# Remove leading/trailing whitespace
for col_name in string_columns:
    df = df.withColumn(col_name, trim(col(col_name)))

# Normalize whitespace (replace multiple spaces with single space)
for col_name in string_columns:
    df = df.withColumn(col_name, regexp_replace(col(col_name), "\\s+", " "))

print("Data cleaning completed.")


Data cleaning completed.


In [4]:
# ============================================================
# CELL 4: DATA TYPE CONVERSION
# ============================================================

# Define numeric columns to convert
numeric_columns = ['Year', 'quarter', 'citymarketid_1', 'citymarketid_2', 
                   'airportid_1', 'airportid_2', 'nsmiles', 'passengers', 
                   'fare', 'large_ms', 'fare_lg', 'lf_ms', 'fare_low']

# Convert numeric columns to double
converted_count = 0
for col_name in numeric_columns:
    if col_name in df.columns:
        df = df.withColumn(col_name, col(col_name).cast("double"))
        converted_count += 1

print(f"Data type conversion completed: {converted_count} columns converted to double.")


Data type conversion completed: 13 columns converted to double.


In [5]:
# ============================================================
# CELL 5: MISSING VALUES CHECK
# ============================================================

# Import functions for missing value analysis
from pyspark.sql.functions import isnan, isnull

# Analyze missing values
missing_summary = []
total_missing = 0

for col_name in df.columns:
    # Count null values
    null_count = df.filter(col(col_name).isNull()).count()
    
    # Count NaN values (for numeric columns only)
    nan_count = 0
    if col_name in numeric_columns:
        nan_count = df.filter(isnan(col(col_name))).count()
    
    # Total missing values
    col_missing = null_count + nan_count
    total_missing += col_missing
    
    if col_missing > 0:
        missing_summary.append((col_name, col_missing, null_count, nan_count))

print(f"Missing values analysis completed: {total_missing:,} total missing values in {len(missing_summary)} columns.")


Missing values analysis completed: 87,868 total missing values in 8 columns.


In [6]:
# ============================================================
# CELL 6: MISSING VALUES HANDLING (EXCLUDING GEOCODED COLUMNS)
# ============================================================

# Define columns to exclude from missing value removal
exclude_columns = ['Geocoded_City1', 'Geocoded_City2']

# Create condition to check missing values (excluding excluded columns)
missing_condition = None
checked_columns = []

for col_name in df.columns:
    if col_name not in exclude_columns:
        checked_columns.append(col_name)
        if missing_condition is None:
            missing_condition = col(col_name).isNull() | isnan(col(col_name))
        else:
            missing_condition = missing_condition | col(col_name).isNull() | isnan(col(col_name))

# Remove rows with missing values in important columns
df_clean = df.filter(~missing_condition)

# Calculate statistics
original_count = df.count()
clean_count = df_clean.count()
removed_count = original_count - clean_count
removed_percentage = (removed_count / original_count) * 100

print(f"Missing values handling completed: {removed_count:,} rows removed ({removed_percentage:.2f}%)")


Missing values handling completed: 1,612 rows removed (0.66%)


In [7]:
# ============================================================
# CELL 7: DATA CLEANLINESS VERIFICATION
# ============================================================

# Check missing values in important columns
important_columns = [col for col in df_clean.columns if col not in exclude_columns]
all_clean = True

for col_name in important_columns:
    null_count = df_clean.filter(col(col_name).isNull()).count()
    nan_count = df_clean.filter(isnan(col(col_name))).count()
    total_missing = null_count + nan_count
    
    if total_missing > 0:
        all_clean = False

# Cache data for performance
df_clean.cache()
df_clean.count()  # Trigger caching

print(f"Data verification completed: {df_clean.count():,} rows, {len(df_clean.columns)} columns")


Data verification completed: 244,343 rows, 23 columns


In [8]:
# ============================================================
# CELL 8: PREPARE QUARTERLY DATA FOR MACHINE LEARNING
# ============================================================

# Import ML libraries
from pyspark.sql.functions import *
from pyspark.sql import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
import time

# Aggregate data by Year-Quarter
df_quarterly = df_clean.groupBy('Year', 'quarter').agg(
    # Volume metrics
    count('*').alias('num_routes'),
    sum('passengers').alias('total_passengers'),
    avg('passengers').alias('avg_passengers_per_route'),

    # Price metrics
    avg('fare').alias('avg_fare'),
    stddev('fare').alias('fare_std'),
    min('fare').alias('fare_min'),
    max('fare').alias('fare_max'),

    # Distance metrics
    avg('nsmiles').alias('avg_distance'),
    stddev('nsmiles').alias('distance_std'),

    # Competition metrics
    countDistinct('carrier_lg').alias('num_carriers'),
    avg('large_ms').alias('avg_market_share_large'),
    avg('lf_ms').alias('avg_market_share_lowcost')
).orderBy('Year', 'quarter')

# Create time_period identifier
df_quarterly = df_quarterly.withColumn('time_period',
    concat(col('Year').cast('string'), lit('-Q'), col('quarter').cast('string'))
)

# Create labels (COVID = crisis)
df_quarterly = df_quarterly.withColumn('is_crisis',
    when((col('Year') >= 2020) & (col('Year') <= 2021), 1.0)
    .otherwise(0.0)
)

print(f"Quarterly data preparation completed: {df_quarterly.count()} quarters aggregated.")

Quarterly data preparation completed: 118 quarters aggregated.


In [9]:
# ============================================================
# CELL 9: FEATURE ENGINEERING FOR QUARTERLY DATA
# ============================================================

# Window specs for time-based calculations
window_qoq = Window.orderBy('Year', 'quarter')

# Calculate QoQ (Quarter-over-Quarter) changes
change_cols = ['num_routes', 'total_passengers', 'avg_fare', 'avg_distance']

for col_name in change_cols:
    # Get previous quarter value
    df_quarterly = df_quarterly.withColumn(
        f'{col_name}_prev_q',
        lag(col(col_name), 1).over(window_qoq)
    )

    # Calculate % change
    df_quarterly = df_quarterly.withColumn(
        f'{col_name}_change_qoq',
        when(col(f'{col_name}_prev_q').isNotNull() & (col(f'{col_name}_prev_q') != 0),
             (col(col_name) - col(f'{col_name}_prev_q')) / col(f'{col_name}_prev_q'))
        .otherwise(0.0)
    )

    # Drop temp column
    df_quarterly = df_quarterly.drop(f'{col_name}_prev_q')

# Calculate YoY (Year-over-Year) changes
yoy_cols = ['num_routes', 'total_passengers', 'avg_fare']

for col_name in yoy_cols:
    df_quarterly = df_quarterly.withColumn(
        f'{col_name}_prev_year',
        lag(col(col_name), 4).over(window_qoq)
    )

    df_quarterly = df_quarterly.withColumn(
        f'{col_name}_change_yoy',
        when(col(f'{col_name}_prev_year').isNotNull() & (col(f'{col_name}_prev_year') != 0),
             (col(col_name) - col(f'{col_name}_prev_year')) / col(f'{col_name}_prev_year'))
        .otherwise(0.0)
    )

    df_quarterly = df_quarterly.drop(f'{col_name}_prev_year')

# Create derived features
# Volatility metrics
df_quarterly = df_quarterly.withColumn('fare_volatility',
    when(col('avg_fare') != 0, col('fare_std') / col('avg_fare')).otherwise(0.0)
)

df_quarterly = df_quarterly.withColumn('distance_volatility',
    when(col('avg_distance') != 0, col('distance_std') / col('avg_distance')).otherwise(0.0)
)

# Range metrics
df_quarterly = df_quarterly.withColumn('fare_range',
    col('fare_max') - col('fare_min')
)

# Passenger efficiency
df_quarterly = df_quarterly.withColumn('passenger_efficiency',
    when(col('num_routes') != 0, col('total_passengers') / col('num_routes')).otherwise(0.0)
)

# Handle missing values
df_quarterly = df_quarterly.fillna(0.0)

print("Feature engineering completed.")

Feature engineering completed.


In [10]:
# ============================================================
# CELL 10: SELECT FEATURES AND PREPARE TRAINING DATA
# ============================================================

# Select final features for quarterly ML
feature_cols_quarterly = [
    # Core metrics (8)
    'num_routes', 'total_passengers', 'avg_passengers_per_route',
    'avg_fare', 'avg_distance', 'fare_volatility', 'fare_range',
    'passenger_efficiency',
    
    # Market metrics (3)
    'num_carriers', 'avg_market_share_large', 'avg_market_share_lowcost',
    
    # QoQ changes (4)
    'num_routes_change_qoq', 'total_passengers_change_qoq', 
    'avg_fare_change_qoq', 'avg_distance_change_qoq',
    
    # YoY changes (3)
    'num_routes_change_yoy', 'total_passengers_change_yoy', 
    'avg_fare_change_yoy',
    
    # Time features (2)
    'Year', 'quarter'
]

# Vector Assembler for quarterly features
assembler_quarterly = VectorAssembler(
    inputCols=feature_cols_quarterly,
    outputCol="features"
)

# Standard Scaler for quarterly features
scaler_quarterly = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True
)

# Temporal split for quarterly data
train_data_quarterly = df_quarterly.filter(col('Year') <= 2020)  # Pre-COVID + 2020
test_data_quarterly = df_quarterly.filter(col('Year') > 2020)    # 2021+ (Post-COVID)

# Cache data for performance
train_data_quarterly.cache()
test_data_quarterly.cache()

print(f"Quarterly data preparation completed: {len(feature_cols_quarterly)} features")
print(f"Training quarters: {train_data_quarterly.count():,}")
print(f"Test quarters: {test_data_quarterly.count():,}")

Quarterly data preparation completed: 20 features
Training quarters: 105
Test quarters: 13


In [11]:
# ============================================================
# CELL 11: CLASS IMBALANCE ANALYSIS FOR QUARTERLY DATA
# ============================================================

print("QUARTERLY DATASET OVERVIEW:")
print(f"  Total quarters: {df_quarterly.count():,}")
print(f"  Training quarters: {train_data_quarterly.count():,}")
print(f"  Test quarters: {test_data_quarterly.count():,}")

# Class distribution analysis for quarterly data
quarterly_distribution = df_quarterly.groupBy('is_crisis').count().collect()
total_quarters = df_quarterly.count()

print("\nQUARTERLY CLASS DISTRIBUTION:")
for row in quarterly_distribution:
    class_label = "COVID Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    count = row['count']
    percentage = (count / total_quarters) * 100
    print(f"  - {class_label}: {count} quarters ({percentage:.1f}%)")

# Calculate imbalance ratio for quarterly data
crisis_quarters = next(row['count'] for row in quarterly_distribution if row['is_crisis'] == 1.0)
normal_quarters = next(row['count'] for row in quarterly_distribution if row['is_crisis'] == 0.0)
imbalance_ratio_quarterly = normal_quarters / crisis_quarters

print(f"\nQuarterly imbalance ratio: {imbalance_ratio_quarterly:.1f}:1 (Normal:Crisis)")

if imbalance_ratio_quarterly > 10:
    print("SEVERE CLASS IMBALANCE (>10:1) - Quarterly Level")
elif imbalance_ratio_quarterly > 5:
    print("SIGNIFICANT CLASS IMBALANCE (>5:1) - Quarterly Level")
else:
    print("Class balance is acceptable - Quarterly Level")

print("\nQuarterly analysis completed.")

QUARTERLY DATASET OVERVIEW:
  Total quarters: 118
  Training quarters: 105
  Test quarters: 13

QUARTERLY CLASS DISTRIBUTION:
  - Normal Quarters: 110 quarters (93.2%)
  - COVID Crisis Quarters: 8 quarters (6.8%)

Quarterly imbalance ratio: 13.8:1 (Normal:Crisis)
SEVERE CLASS IMBALANCE (>10:1) - Quarterly Level

Quarterly analysis completed.


In [12]:
# ============================================================
# CELL 12: CLASS WEIGHTING FOR QUARTERLY DATA
# ============================================================

from pyspark.sql.functions import when, col

# Calculate class weights for quarterly data
crisis_quarters_count = crisis_quarters
normal_quarters_count = normal_quarters
total_quarters_count = crisis_quarters_count + normal_quarters_count

# Calculate weights (inverse frequency)
weight_crisis_quarters = total_quarters_count / (2 * crisis_quarters_count)
weight_normal_quarters = total_quarters_count / (2 * normal_quarters_count)

print(f"Quarterly class weights - Crisis: {weight_crisis_quarters:.2f}, Normal: {weight_normal_quarters:.2f}")

# Create weighted dataset for quarterly training
train_data_quarterly_weighted = train_data_quarterly.withColumn(
    "class_weight",
    when(col("is_crisis") == 1.0, weight_crisis_quarters).otherwise(weight_normal_quarters)
)

# Verify the weighting
print(f"Weighted quarterly dataset created: {train_data_quarterly_weighted.count():,} quarters")

# Check the class_weight distribution
print("\nQuarterly class weight distribution:")
train_data_quarterly_weighted.groupBy("is_crisis", "class_weight").count().show()

Quarterly class weights - Crisis: 7.38, Normal: 0.54
Weighted quarterly dataset created: 105 quarters

Quarterly class weight distribution:
+---------+------------------+-----+
|is_crisis|      class_weight|count|
+---------+------------------+-----+
|      0.0|0.5363636363636364|  101|
|      1.0|             7.375|    4|
+---------+------------------+-----+



## MODEL 1: LOGISTIC REGRESSION

In [14]:
# ============================================================
# CELL 13: MODEL 1 - LOGISTIC REGRESSION (QUARTERLY)
# ============================================================

# Create pipeline with class weighting for quarterly data
logistic_reg_quarterly = LogisticRegression(
    featuresCol="scaled_features", 
    labelCol="is_crisis",
    weightCol="class_weight",
    maxIter=100,
    regParam=0.01
)
logistic_pipeline_quarterly = Pipeline(stages=[assembler_quarterly, scaler_quarterly, logistic_reg_quarterly])

# Train model with weighted quarterly data
start_time = time.time()
logistic_model_quarterly = logistic_pipeline_quarterly.fit(train_data_quarterly_weighted)
logistic_time_quarterly = time.time() - start_time

# Predictions on quarterly test data
logistic_predictions_quarterly = logistic_model_quarterly.transform(test_data_quarterly)

# Evaluate with comprehensive metrics
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis", metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="f1")

auc = auc_evaluator.evaluate(logistic_predictions_quarterly)
accuracy = accuracy_evaluator.evaluate(logistic_predictions_quarterly)
precision = precision_evaluator.evaluate(logistic_predictions_quarterly)
recall = recall_evaluator.evaluate(logistic_predictions_quarterly)
f1 = f1_evaluator.evaluate(logistic_predictions_quarterly)

# Results
print("Model: Logistic Regression (Quarterly, Class Weighted)")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Training time: {logistic_time_quarterly:.2f} seconds")

# Check prediction distribution
print("\nQuarterly Prediction Distribution:")
pred_dist = logistic_predictions_quarterly.groupBy("prediction").count().collect()
for row in pred_dist:
    class_name = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {class_name}: {row['count']:,} quarters")

# Confusion Matrix
print("\nQuarterly Confusion Matrix:")
confusion_matrix = logistic_predictions_quarterly.groupBy("is_crisis", "prediction").count().collect()
for row in confusion_matrix:
    actual = "Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    predicted = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {actual} → {predicted}: {row['count']:,} quarters")

Model: Logistic Regression (Quarterly, Class Weighted)
AUC: 0.2778
Accuracy: 0.7692
Precision: 0.8269
Recall: 0.7692
F1-Score: 0.7165
Training time: 2.76 seconds

Quarterly Prediction Distribution:
  Crisis Quarters: 1 quarters
  Normal Quarters: 12 quarters

Quarterly Confusion Matrix:
  Crisis Quarters → Crisis Quarters: 1 quarters
  Crisis Quarters → Normal Quarters: 3 quarters
  Normal Quarters → Normal Quarters: 9 quarters


## MODEL 2: DECISION TREE

In [16]:
# ============================================================
# CELL 14: MODEL 2 - DECISION TREE (QUARTERLY)
# ============================================================

from pyspark.ml.classification import DecisionTreeClassifier

# Create pipeline with Decision Tree for quarterly data
dt_classifier_quarterly = DecisionTreeClassifier(
    featuresCol="features", 
    labelCol="is_crisis",
    weightCol="class_weight",
    maxDepth=15,
    maxBins=64,
    minInstancesPerNode=5
)
dt_pipeline_quarterly = Pipeline(stages=[assembler_quarterly, dt_classifier_quarterly])

# Train model with weighted quarterly data
start_time = time.time()
dt_model_quarterly = dt_pipeline_quarterly.fit(train_data_quarterly_weighted)
dt_time_quarterly = time.time() - start_time

# Predictions on quarterly test data
dt_predictions_quarterly = dt_model_quarterly.transform(test_data_quarterly)

# Evaluate with comprehensive metrics
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis", metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="f1")

auc = auc_evaluator.evaluate(dt_predictions_quarterly)
accuracy = accuracy_evaluator.evaluate(dt_predictions_quarterly)
precision = precision_evaluator.evaluate(dt_predictions_quarterly)
recall = recall_evaluator.evaluate(dt_predictions_quarterly)
f1 = f1_evaluator.evaluate(dt_predictions_quarterly)

# Results
print("Model: Decision Tree (Quarterly, Class Weighted)")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Training time: {dt_time_quarterly:.2f} seconds")

# Check prediction distribution
print("\nQuarterly Prediction Distribution:")
pred_dist = dt_predictions_quarterly.groupBy("prediction").count().collect()
for row in pred_dist:
    class_name = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {class_name}: {row['count']:,} quarters")

# Confusion Matrix
print("\nQuarterly Confusion Matrix:")
confusion_matrix = dt_predictions_quarterly.groupBy("is_crisis", "prediction").count().collect()
for row in confusion_matrix:
    actual = "Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    predicted = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {actual} → {predicted}: {row['count']:,} quarters")

Model: Decision Tree (Quarterly, Class Weighted)
AUC: 0.6250
Accuracy: 0.7692
Precision: 0.8269
Recall: 0.7692
F1-Score: 0.7165
Training time: 0.69 seconds

Quarterly Prediction Distribution:
  Crisis Quarters: 1 quarters
  Normal Quarters: 12 quarters

Quarterly Confusion Matrix:
  Crisis Quarters → Crisis Quarters: 1 quarters
  Crisis Quarters → Normal Quarters: 3 quarters
  Normal Quarters → Normal Quarters: 9 quarters


## MODEL 3: RANDOM FOREST


In [17]:
# ============================================================
# CELL 15: MODEL 3 - RANDOM FOREST (QUARTERLY)
# ============================================================

# Create pipeline with Random Forest for quarterly data
rf_classifier_quarterly = RandomForestClassifier(
    featuresCol="features", 
    labelCol="is_crisis",
    weightCol="class_weight",
    numTrees=200,
    maxDepth=15,
    maxBins=64,
    minInstancesPerNode=5,
    seed=42
)
rf_pipeline_quarterly = Pipeline(stages=[assembler_quarterly, rf_classifier_quarterly])

# Train model with weighted quarterly data
start_time = time.time()
rf_model_quarterly = rf_pipeline_quarterly.fit(train_data_quarterly_weighted)
rf_time_quarterly = time.time() - start_time

# Predictions on quarterly test data
rf_predictions_quarterly = rf_model_quarterly.transform(test_data_quarterly)

# Evaluate with comprehensive metrics
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis", metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="f1")

auc = auc_evaluator.evaluate(rf_predictions_quarterly)
accuracy = accuracy_evaluator.evaluate(rf_predictions_quarterly)
precision = precision_evaluator.evaluate(rf_predictions_quarterly)
recall = recall_evaluator.evaluate(rf_predictions_quarterly)
f1 = f1_evaluator.evaluate(rf_predictions_quarterly)

# Results
print("Model: Random Forest (Quarterly, Class Weighted)")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Training time: {rf_time_quarterly:.2f} seconds")

# Check prediction distribution
print("\nQuarterly Prediction Distribution:")
pred_dist = rf_predictions_quarterly.groupBy("prediction").count().collect()
for row in pred_dist:
    class_name = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {class_name}: {row['count']:,} quarters")

# Confusion Matrix
print("\nQuarterly Confusion Matrix:")
confusion_matrix = rf_predictions_quarterly.groupBy("is_crisis", "prediction").count().collect()
for row in confusion_matrix:
    actual = "Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    predicted = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {actual} → {predicted}: {row['count']:,} quarters")

Model: Random Forest (Quarterly, Class Weighted)
AUC: 0.7500
Accuracy: 0.7692
Precision: 0.8269
Recall: 0.7692
F1-Score: 0.7165
Training time: 0.91 seconds

Quarterly Prediction Distribution:
  Crisis Quarters: 1 quarters
  Normal Quarters: 12 quarters

Quarterly Confusion Matrix:
  Crisis Quarters → Crisis Quarters: 1 quarters
  Crisis Quarters → Normal Quarters: 3 quarters
  Normal Quarters → Normal Quarters: 9 quarters


## MODEL 4: GRADIENT BOOSTING TREES

In [18]:
# ============================================================
# CELL 16: MODEL 4 - GRADIENT BOOSTING TREES (QUARTERLY)
# ============================================================

# Create pipeline with Gradient Boosting for quarterly data
gbt_classifier_quarterly = GBTClassifier(
    featuresCol="features", 
    labelCol="is_crisis",
    weightCol="class_weight",
    maxIter=100,
    maxDepth=8,
    maxBins=64,
    minInstancesPerNode=5,
    seed=42
)
gbt_pipeline_quarterly = Pipeline(stages=[assembler_quarterly, gbt_classifier_quarterly])

# Train model with weighted quarterly data
start_time = time.time()
gbt_model_quarterly = gbt_pipeline_quarterly.fit(train_data_quarterly_weighted)
gbt_time_quarterly = time.time() - start_time

# Predictions on quarterly test data
gbt_predictions_quarterly = gbt_model_quarterly.transform(test_data_quarterly)

# Evaluate with comprehensive metrics
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis", metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="f1")

auc = auc_evaluator.evaluate(gbt_predictions_quarterly)
accuracy = accuracy_evaluator.evaluate(gbt_predictions_quarterly)
precision = precision_evaluator.evaluate(gbt_predictions_quarterly)
recall = recall_evaluator.evaluate(gbt_predictions_quarterly)
f1 = f1_evaluator.evaluate(gbt_predictions_quarterly)

# Results
print("Model: Gradient Boosting Trees (Quarterly, Class Weighted)")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Training time: {gbt_time_quarterly:.2f} seconds")

# Check prediction distribution
print("\nQuarterly Prediction Distribution:")
pred_dist = gbt_predictions_quarterly.groupBy("prediction").count().collect()
for row in pred_dist:
    class_name = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {class_name}: {row['count']:,} quarters")

# Confusion Matrix
print("\nQuarterly Confusion Matrix:")
confusion_matrix = gbt_predictions_quarterly.groupBy("is_crisis", "prediction").count().collect()
for row in confusion_matrix:
    actual = "Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    predicted = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {actual} → {predicted}: {row['count']:,} quarters")

Model: Gradient Boosting Trees (Quarterly, Class Weighted)
AUC: 0.6944
Accuracy: 0.7692
Precision: 0.8269
Recall: 0.7692
F1-Score: 0.7165
Training time: 27.41 seconds

Quarterly Prediction Distribution:
  Crisis Quarters: 1 quarters
  Normal Quarters: 12 quarters

Quarterly Confusion Matrix:
  Crisis Quarters → Crisis Quarters: 1 quarters
  Crisis Quarters → Normal Quarters: 3 quarters
  Normal Quarters → Normal Quarters: 9 quarters


## MODEL 5: NAIVE BAYES

In [19]:
# ============================================================
# CELL 17: MODEL 5 - NAIVE BAYES (QUARTERLY)
# ============================================================

from pyspark.ml.classification import NaiveBayes

# Select features suitable for Naive Bayes (positive values only)
nb_feature_cols_quarterly = [
    'num_routes', 'total_passengers', 'avg_passengers_per_route',
    'avg_fare', 'avg_distance', 'fare_volatility', 'fare_range',
    'passenger_efficiency', 'num_carriers', 'avg_market_share_large',
    'Year', 'quarter'
]

# Vector Assembler for Naive Bayes features
assembler_nb_quarterly = VectorAssembler(
    inputCols=nb_feature_cols_quarterly,
    outputCol="features"
)

# Create pipeline with Naive Bayes for quarterly data
nb_classifier_quarterly = NaiveBayes(
    featuresCol="features", 
    labelCol="is_crisis",
    weightCol="class_weight",
    smoothing=1.0
)
nb_pipeline_quarterly = Pipeline(stages=[assembler_nb_quarterly, nb_classifier_quarterly])

# Train model with weighted quarterly data
start_time = time.time()
nb_model_quarterly = nb_pipeline_quarterly.fit(train_data_quarterly_weighted)
nb_time_quarterly = time.time() - start_time

# Predictions on quarterly test data
nb_predictions_quarterly = nb_model_quarterly.transform(test_data_quarterly)

# Evaluate with comprehensive metrics
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis", metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="f1")

auc = auc_evaluator.evaluate(nb_predictions_quarterly)
accuracy = accuracy_evaluator.evaluate(nb_predictions_quarterly)
precision = precision_evaluator.evaluate(nb_predictions_quarterly)
recall = recall_evaluator.evaluate(nb_predictions_quarterly)
f1 = f1_evaluator.evaluate(nb_predictions_quarterly)

# Results
print("Model: Naive Bayes (Quarterly, Class Weighted)")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Training time: {nb_time_quarterly:.2f} seconds")

# Check prediction distribution
print("\nQuarterly Prediction Distribution:")
pred_dist = nb_predictions_quarterly.groupBy("prediction").count().collect()
for row in pred_dist:
    class_name = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {class_name}: {row['count']:,} quarters")

# Confusion Matrix
print("\nQuarterly Confusion Matrix:")
confusion_matrix = nb_predictions_quarterly.groupBy("is_crisis", "prediction").count().collect()
for row in confusion_matrix:
    actual = "Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    predicted = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {actual} → {predicted}: {row['count']:,} quarters")

Model: Naive Bayes (Quarterly, Class Weighted)
AUC: 0.7500
Accuracy: 0.7692
Precision: 0.8269
Recall: 0.7692
F1-Score: 0.7165
Training time: 0.33 seconds

Quarterly Prediction Distribution:
  Crisis Quarters: 1 quarters
  Normal Quarters: 12 quarters

Quarterly Confusion Matrix:
  Crisis Quarters → Crisis Quarters: 1 quarters
  Crisis Quarters → Normal Quarters: 3 quarters
  Normal Quarters → Normal Quarters: 9 quarters


## MODEL 6: SUPPORT VECTOR MACHINE

In [20]:
# ============================================================
# CELL 18: MODEL 6 - SUPPORT VECTOR MACHINE (QUARTERLY)
# ============================================================

from pyspark.ml.classification import LinearSVC

# Create pipeline with SVM for quarterly data
svm_classifier_quarterly = LinearSVC(
    featuresCol="scaled_features", 
    labelCol="is_crisis",
    weightCol="class_weight",
    maxIter=200,
    regParam=0.01,
    threshold=0.5
)
svm_pipeline_quarterly = Pipeline(stages=[assembler_quarterly, scaler_quarterly, svm_classifier_quarterly])

# Train model with weighted quarterly data
start_time = time.time()
svm_model_quarterly = svm_pipeline_quarterly.fit(train_data_quarterly_weighted)
svm_time_quarterly = time.time() - start_time

# Predictions on quarterly test data
svm_predictions_quarterly = svm_model_quarterly.transform(test_data_quarterly)

# Evaluate with comprehensive metrics
auc_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis", metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="is_crisis", metricName="f1")

auc = auc_evaluator.evaluate(svm_predictions_quarterly)
accuracy = accuracy_evaluator.evaluate(svm_predictions_quarterly)
precision = precision_evaluator.evaluate(svm_predictions_quarterly)
recall = recall_evaluator.evaluate(svm_predictions_quarterly)
f1 = f1_evaluator.evaluate(svm_predictions_quarterly)

# Results
print("Model: Support Vector Machine (Quarterly, Class Weighted)")
print(f"AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Training time: {svm_time_quarterly:.2f} seconds")

# Check prediction distribution
print("\nQuarterly Prediction Distribution:")
pred_dist = svm_predictions_quarterly.groupBy("prediction").count().collect()
for row in pred_dist:
    class_name = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {class_name}: {row['count']:,} quarters")

# Confusion Matrix
print("\nQuarterly Confusion Matrix:")
confusion_matrix = svm_predictions_quarterly.groupBy("is_crisis", "prediction").count().collect()
for row in confusion_matrix:
    actual = "Crisis Quarters" if row['is_crisis'] == 1.0 else "Normal Quarters"
    predicted = "Crisis Quarters" if row['prediction'] == 1.0 else "Normal Quarters"
    print(f"  {actual} → {predicted}: {row['count']:,} quarters")

Model: Support Vector Machine (Quarterly, Class Weighted)
AUC: 0.2500
Accuracy: 0.7692
Precision: 0.8269
Recall: 0.7692
F1-Score: 0.7165
Training time: 26.92 seconds

Quarterly Prediction Distribution:
  Crisis Quarters: 1 quarters
  Normal Quarters: 12 quarters

Quarterly Confusion Matrix:
  Crisis Quarters → Crisis Quarters: 1 quarters
  Crisis Quarters → Normal Quarters: 3 quarters
  Normal Quarters → Normal Quarters: 9 quarters
