In [5]:
# Cell 1: Load Data from CSV file
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) the notebook's Spark session with adaptive query execution
# and adaptive partition coalescing switched on.
session_builder = SparkSession.builder.appName("AviationTrendAnalysis")
for adaptive_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    session_builder = session_builder.config(adaptive_key, "true")
spark = session_builder.getOrCreate()

# Location of the raw CSV, relative to the notebook's working directory.
file_path = "./data/US Airline Flight Routes and Fares 1993-2024.csv"
print(f"Loading data from: {file_path}")

# Reader settings: first line is the header, quoted fields may span several
# lines, column types are inferred, and a doubled quote escapes a quote.
csv_reader_options = {
    "header": "true",
    "multiLine": "true",
    "inferSchema": "true",
    "quote": '"',
    "escape": '"',
}
df = spark.read.options(**csv_reader_options).csv(file_path)

# Basic facts about the loaded frame (the count triggers a full read).
print("Data loaded successfully!")
loaded_rows = df.count()
loaded_columns = df.columns
print(f"Dataset shape: ({loaded_rows:,} rows, {len(loaded_columns)} columns)")
print(f"Columns: {loaded_columns}")

# Preview of the first rows, untruncated.
print("\nFirst 5 rows:")
df.show(5, truncate=False)

# Inferred schema.
print("\nData types:")
df.printSchema()


Loading data from: ./data/US Airline Flight Routes and Fares 1993-2024.csv
Data loaded successfully!
Dataset shape: (245,955 rows, 23 columns)
Columns: ['tbl', 'Year', 'quarter', 'citymarketid_1', 'citymarketid_2', 'city1', 'city2', 'airportid_1', 'airportid_2', 'airport_1', 'airport_2', 'nsmiles', 'passengers', 'fare', 'carrier_lg', 'large_ms', 'fare_lg', 'carrier_low', 'lf_ms', 'fare_low', 'Geocoded_City1', 'Geocoded_City2', 'tbl1apk']

First 5 rows:
+-------+----+-------+--------------+--------------+------------------------------+-----------------------------+-----------+-----------+---------+---------+-------+----------+------+----------+--------+-------+-----------+------+--------+--------------+--------------+---------------------+
|tbl    |Year|quarter|citymarketid_1|citymarketid_2|city1                         |city2                        |airportid_1|airportid_2|airport_1|airport_2|nsmiles|passengers|fare  |carrier_lg|large_ms|fare_lg|carrier_low|lf_ms |fare_low|Geocoded_Cit

In [6]:
# Cell 2: Data Cleaning
#
# Normalizes whitespace in every string column, casts the known numeric columns
# to double, and reports the number of missing values per column.
# Rebinds the notebook-level `df`. Every transformation here is idempotent, so
# re-running the cell yields the same frame.
from pyspark.sql.functions import col, trim, regexp_replace, when, isnan, isnull, count

print("=== DATA CLEANING PROCESS ===")
total_rows = df.count()
print(f"Original dataset: {total_rows:,} rows")

# 1. + 2. Whitespace normalization for string columns.
print("\n1. Cleaning whitespaces...")
string_columns = [field.name for field in df.schema.fields if field.dataType.typeName() == 'string']
print("2. Standardizing string values...")

# 3. Columns that must be numeric for the later aggregation and modelling.
print("3. Converting numeric columns...")
numeric_columns = ['Year', 'quarter', 'citymarketid_1', 'citymarketid_2',
                   'airportid_1', 'airportid_2', 'nsmiles', 'passengers',
                   'fare', 'large_ms', 'fare_lg', 'lf_ms', 'fare_low']

_string_column_set = set(string_columns)
_numeric_column_set = set(numeric_columns)


def _cleaned_column(name):
    """Return the cleaning expression for column `name`, aliased to `name`.

    String columns have every whitespace run collapsed to one space and are
    then trimmed. Collapsing first means leading/trailing tabs or newlines
    (possible because the file is read with multiLine) are removed as well,
    instead of surviving as a single edge space. Columns listed in
    `numeric_columns` are cast to double; a column that is both is normalized
    first and then cast. Any other column passes through unchanged.
    """
    expr = col(name)
    if name in _string_column_set:
        expr = trim(regexp_replace(expr, "\\s+", " "))
    if name in _numeric_column_set:
        expr = expr.cast("double")
    return expr.alias(name)


# One projection for all columns instead of one chained withColumn per column;
# column order is preserved.
df = df.select([_cleaned_column(name) for name in df.columns])

# 4. Data quality metrics, computed in a single aggregation job.
print("\n=== DATA QUALITY BEFORE CLEANING ===")
print("Missing values per column:")
missing_exprs = []
for col_name in df.columns:
    is_missing = col(col_name).isNull()
    if col_name in _numeric_column_set:
        # NaN is only meaningful for the columns cast to double above. null and
        # NaN are mutually exclusive, so counting the OR equals the sum of the
        # two separate counts.
        is_missing = is_missing | isnan(col(col_name))
    # when() without otherwise() yields null for non-matching rows and count()
    # skips nulls, so this counts exactly the missing rows.
    missing_exprs.append(count(when(is_missing, 1)).alias(col_name))

missing_counts = df.agg(*missing_exprs).first().asDict()
for col_name in df.columns:
    total_missing = missing_counts[col_name]
    if total_missing > 0:
        print(f"  {col_name}: {total_missing:,} missing values")

# 5. Sample of cleaned data. The projection above never adds or removes rows,
# so the row count measured at the top of the cell is still exact.
print(f"\nCleaned dataset: {total_rows:,} rows")
print("\nSample of cleaned data:")
df.show(5, truncate=False)


=== DATA CLEANING PROCESS ===
Original dataset: 245,955 rows

1. Cleaning whitespaces...
2. Standardizing string values...
3. Converting numeric columns...

=== DATA QUALITY BEFORE CLEANING ===
Missing values per column:
  carrier_lg: 1,540 missing values
  large_ms: 1,540 missing values
  fare_lg: 1,540 missing values
  carrier_low: 1,612 missing values
  lf_ms: 1,612 missing values
  fare_low: 1,612 missing values
  Geocoded_City1: 39,206 missing values
  Geocoded_City2: 39,206 missing values

Cleaned dataset: 245,955 rows

Sample of cleaned data:
+-------+------+-------+--------------+--------------+------------------------------+-----------------------------+-----------+-----------+---------+---------+-------+----------+------+----------+--------+-------+-----------+------+--------+--------------+--------------+---------------------+
|tbl    |Year  |quarter|citymarketid_1|citymarketid_2|city1                         |city2                        |airportid_1|airportid_2|airport_1|a

In [7]:
# Cell 3: Remove Missing Values (Excluding Geocoded_City columns)
#
# Drops every row that has a null (or, for float/double columns, NaN) in any
# column other than the Geocoded_City pair, then verifies and caches the
# result as `df_clean`.
from pyspark.sql.functions import col, isnan, isnull, count, when


def count_missing(frame, column_names, nan_columns=()):
    """Count missing values per column in a single aggregation job.

    Args:
        frame: Spark DataFrame to inspect.
        column_names: names of the columns to count.
        nan_columns: names for which NaN also counts as missing. Only
            float/double columns should be listed, so that isnan() is never
            applied to a string column through an implicit cast.

    Returns:
        dict mapping each requested column name to its missing-value count;
        an empty dict when no columns are requested.
    """
    column_names = list(column_names)
    if not column_names:
        return {}
    exprs = []
    for name in column_names:
        is_missing = col(name).isNull()
        if name in nan_columns:
            is_missing = is_missing | isnan(col(name))
        # count() ignores the nulls that when() yields for non-matching rows.
        exprs.append(count(when(is_missing, 1)).alias(name))
    return frame.agg(*exprs).first().asDict()


print("=== REMOVING MISSING VALUES ===")
rows_before = df.count()
print(f"Dataset before removing missing values: {rows_before:,} rows")

# NaN can only occur in floating-point columns.
nan_capable_columns = {
    field.name for field in df.schema.fields
    if field.dataType.typeName() in ('float', 'double')
}

# 1. Count missing values in each column
print("\nMissing values analysis:")
missing_by_column = count_missing(df, df.columns, nan_capable_columns)
missing_summary = [(col_name, missing_by_column[col_name]) for col_name in df.columns]
for col_name, total_missing in missing_summary:
    if total_missing > 0:
        print(f"  {col_name}: {total_missing:,} missing values")

# 2. Define columns to exclude from missing value removal
exclude_columns = ['Geocoded_City1', 'Geocoded_City2']
print(f"\nExcluding columns from missing value removal: {exclude_columns}")

# 3. Remove rows with missing values (excluding Geocoded_City columns)
print(f"\nRemoving rows with missing values (excluding {exclude_columns})...")

important_columns = [name for name in df.columns if name not in exclude_columns]

# OR together "is missing" over every required column.
missing_condition = None
for col_name in important_columns:
    column_missing = col(col_name).isNull()
    if col_name in nan_capable_columns:
        column_missing = column_missing | isnan(col(col_name))
    missing_condition = column_missing if missing_condition is None else (missing_condition | column_missing)

# With no required columns there is nothing to filter on.
df_clean = df if missing_condition is None else df.filter(~missing_condition)

# Mark for caching before the repeated passes below, so the CSV is not
# re-parsed for each of them.
df_clean.cache()

rows_after = df_clean.count()
rows_removed = rows_before - rows_after
removed_pct = rows_removed / rows_before * 100 if rows_before else 0.0
print(f"Dataset after removing missing values: {rows_after:,} rows")
print(f"Removed {rows_removed:,} rows ({removed_pct:.2f}%)")

# 4. Verify missing values in important columns (excluding Geocoded_City)
print("\n=== VERIFICATION: Missing values after cleaning ===")
remaining_missing = count_missing(df_clean, important_columns, nan_capable_columns)
all_clean = True
for col_name in important_columns:
    total_missing = remaining_missing[col_name]
    if total_missing > 0:
        print(f"  {col_name}: {total_missing:,} missing values")
        all_clean = False

if all_clean:
    print("✓ No missing values found in important columns!")

# Show status of excluded columns (nulls only)
print(f"\nStatus of excluded columns:")
kept_columns = [name for name in exclude_columns if name in df_clean.columns]
kept_nulls = count_missing(df_clean, kept_columns)
for col_name in kept_columns:
    print(f"  {col_name}: {kept_nulls[col_name]:,} missing values (kept as-is)")

# 5. Show final dataset statistics
print(f"\n=== FINAL DATASET STATISTICS ===")
print(f"Total rows: {rows_after:,}")
print(f"Total columns: {len(df_clean.columns)}")
print(f"Columns: {df_clean.columns}")

# 6. Show sample of final clean data
print("\nSample of final clean data:")
df_clean.show(5, truncate=False)

# 7. The frame was marked for caching in step 3; this count is cheap and
# guarantees it is fully materialized before later cells use it.
print("\nCaching cleaned dataset for better performance...")
df_clean.count()

print("✓ Dataset cleaned and cached successfully!")


=== REMOVING MISSING VALUES ===
Dataset before removing missing values: 245,955 rows

Missing values analysis:
  carrier_lg: 1,540 missing values
  large_ms: 1,540 missing values
  fare_lg: 1,540 missing values
  carrier_low: 1,612 missing values
  lf_ms: 1,612 missing values
  fare_low: 1,612 missing values
  Geocoded_City1: 39,206 missing values
  Geocoded_City2: 39,206 missing values

Excluding columns from missing value removal: ['Geocoded_City1', 'Geocoded_City2']

Removing rows with missing values (excluding ['Geocoded_City1', 'Geocoded_City2'])...
Dataset after removing missing values: 244,343 rows
Removed 1,612 rows (0.66%)

=== VERIFICATION: Missing values after cleaning ===
✓ No missing values found in important columns!

Status of excluded columns:
  Geocoded_City1: 39,154 missing values (kept as-is)
  Geocoded_City2: 39,154 missing values (kept as-is)

=== FINAL DATASET STATISTICS ===
Total rows: 244,343
Total columns: 23
Columns: ['tbl', 'Year', 'quarter', 'citymarketid_1'

In [None]:
# Cell 4: Data Processing and COVID Labeling for Crisis Prediction
#
# Collapses the cleaned route-level data to one row per (Year, quarter), labels
# the 2020-2021 quarters as crisis, and derives rate-of-change and volatility
# features. Defines `df_quarterly` and `feature_cols_final` for later cells.
#
# Spark SQL functions are referenced through the `F` alias instead of a star
# import: `import *` rebinds Python's built-in sum/min/max to Spark column
# functions, which breaks ordinary Python arithmetic.
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline
import time

print("=" * 70)
print("XỬ LÝ DỮ LIỆU VÀ DÁN NHÃN COVID CHO CRISIS PREDICTION")
print("=" * 70)

# Step 1: aggregate to one row per (Year, quarter)
print("\n1. AGGREGATE DATA THEO YEAR-QUARTER...")

# The aggregation is the only step that scans the route-level data, so it is
# cached; everything below works on this small per-quarter frame.
quarterly_agg = (
    df_clean.groupBy('Year', 'quarter')
    .agg(
        # Volume metrics
        F.count('*').alias('num_routes'),
        F.sum('passengers').alias('total_passengers'),
        F.avg('passengers').alias('avg_passengers_per_route'),

        # Price metrics
        F.avg('fare').alias('avg_fare'),
        F.stddev('fare').alias('fare_std'),
        F.min('fare').alias('fare_min'),
        F.max('fare').alias('fare_max'),

        # Distance metrics
        F.avg('nsmiles').alias('avg_distance'),
        F.stddev('nsmiles').alias('distance_std'),

        # Competition metrics
        F.countDistinct('carrier_lg').alias('num_carriers'),
        F.avg('large_ms').alias('avg_market_share_large'),
        F.avg('lf_ms').alias('avg_market_share_lowcost'),
    )
    .orderBy('Year', 'quarter')
    .cache()
)

# Human-readable period id such as "2020-Q1". Year and quarter are doubles
# after the cleaning cell, so they go through int first; a direct string cast
# would render "2020.0-Q1.0".
df_quarterly = quarterly_agg.withColumn(
    'time_period',
    F.concat(
        F.col('Year').cast('int').cast('string'),
        F.lit('-Q'),
        F.col('quarter').cast('int').cast('string'),
    ),
)

num_quarters = df_quarterly.count()
print(f"  ✓ Aggregated to {num_quarters} quarters")

# Step 2: create labels (COVID years = crisis)
print("\n2. TẠO LABELS (COVID = CRISIS)...")

df_quarterly = df_quarterly.withColumn(
    'is_crisis',
    F.when((F.col('Year') >= 2020) & (F.col('Year') <= 2021), 1.0).otherwise(0.0),
)

# Check the label distribution
label_distribution = df_quarterly.groupBy('is_crisis').count()
print("\n  Phân bố labels:")
label_distribution.show()

# Step 3: feature engineering - rate of change
print("\n3. FEATURE ENGINEERING - RATE OF CHANGE...")

# One global, chronologically ordered window (no partitioning).
window_qoq = Window.orderBy('Year', 'quarter')


def pct_change(column_name, periods):
    """Relative change of `column_name` versus the row `periods` rows earlier.

    Rows are ordered by `window_qoq`. Returns a Column that evaluates to
    (current - previous) / previous, or 0.0 when there is no previous row or
    the previous value is zero.
    """
    previous = F.lag(F.col(column_name), periods).over(window_qoq)
    return F.when(
        previous.isNotNull() & (previous != 0),
        (F.col(column_name) - previous) / previous,
    ).otherwise(0.0)


# QoQ (quarter-over-quarter) changes: compare with the previous row.
change_cols = ['num_routes', 'total_passengers', 'avg_fare', 'avg_distance']
for col_name in change_cols:
    df_quarterly = df_quarterly.withColumn(f'{col_name}_change_qoq', pct_change(col_name, 1))

print("  ✓ QoQ changes calculated")

# YoY (year-over-year) changes: compare with the row four positions earlier.
# NOTE(review): this equals "same quarter last year" only if no quarter is
# missing from the series - confirm against the data.
yoy_cols = ['num_routes', 'total_passengers', 'avg_fare']
for col_name in yoy_cols:
    df_quarterly = df_quarterly.withColumn(f'{col_name}_change_yoy', pct_change(col_name, 4))

print("  ✓ YoY changes calculated")

# Step 4: derived features
print("\n4. CREATING DERIVED FEATURES...")

# Volatility metrics (coefficient of variation)
df_quarterly = df_quarterly.withColumn(
    'fare_volatility',
    F.when(F.col('avg_fare') != 0, F.col('fare_std') / F.col('avg_fare')).otherwise(0.0),
)

df_quarterly = df_quarterly.withColumn(
    'distance_volatility',
    F.when(F.col('avg_distance') != 0, F.col('distance_std') / F.col('avg_distance')).otherwise(0.0),
)

# Range metrics
df_quarterly = df_quarterly.withColumn('fare_range', F.col('fare_max') - F.col('fare_min'))

# Passenger efficiency (passengers per route)
df_quarterly = df_quarterly.withColumn(
    'passenger_efficiency',
    F.when(F.col('num_routes') != 0, F.col('total_passengers') / F.col('num_routes')).otherwise(0.0),
)

print("  ✓ Derived features created")

# Step 5: handle missing values
print("\n5. HANDLING MISSING VALUES...")

# Replace any remaining null/NaN in numeric columns with 0 (for example a
# standard deviation that is undefined because a quarter has a single route).
df_quarterly = df_quarterly.fillna(0.0)

# Step 6: select final features
print("\n6. SELECTING FEATURES FOR MODEL...")

feature_cols_final = [
    # Core volume metrics (3)
    'num_routes',
    'total_passengers',
    'avg_passengers_per_route',

    # Core price metrics (2)
    'avg_fare',
    'avg_distance',

    # Volatility metrics (2)
    'fare_volatility',
    'fare_range',

    # QoQ changes - IMPORTANT! (4)
    'num_routes_change_qoq',
    'total_passengers_change_qoq',
    'avg_fare_change_qoq',
    'avg_distance_change_qoq',

    # YoY changes (3)
    'num_routes_change_yoy',
    'total_passengers_change_yoy',
    'avg_fare_change_yoy',

    # Competition (2)
    'num_carriers',
    'avg_market_share_large',

    # Seasonal (1)
    'quarter',
]

print(f"\n  Tổng số features: {len(feature_cols_final)}")
print(f"\n  Danh sách features:")
for i, feat in enumerate(feature_cols_final, 1):
    print(f"    {i:2}. {feat}")

# Show the final dataset
print("\n7. FINAL DATASET:")
df_quarterly.select(['time_period', 'Year', 'quarter', 'is_crisis'] +
                    feature_cols_final[:5]).show(15)

# None of the steps after the aggregation adds or removes rows, so the count
# taken right after it is still exact.
print(f"\n✓ Data preparation completed!")
print(f"  Total quarters: {num_quarters}")
print(f"  Total features: {len(feature_cols_final)}")
print(f"  Target: is_crisis (0 = Normal, 1 = COVID)")


In [None]:
# Cell 5: Simple Models Experiment (Logistic Regression, Linear Regression)
#
# Splits `df_quarterly` into train/test and fits three baselines: logistic
# regression, linear regression (RMSE only, for comparison) and a small random
# forest. Defines `assembler`, `scaler`, `train_data`, `test_data`, the fitted
# models, and their scores/timings for later cells.
#
# The notebook star-imports pyspark.sql.functions, which rebinds the name `max`
# to a Spark column function; the real built-in is reached through `builtins`.
import builtins

print("=" * 70)
print("THÍ NGHIỆM VỚI MÔ HÌNH ĐƠN GIẢN")
print("=" * 70)

# Step 1: prepare data for modeling
print("\n1. PREPARING DATA FOR MODELING...")

# Packs the selected feature columns into one vector column.
assembler = VectorAssembler(
    inputCols=feature_cols_final,
    outputCol="features"
)

# Standardizes each feature to zero mean and unit variance.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True
)

# Step 2: train-test split
print("\n2. TRAIN-TEST SPLIT...")
# NOTE(review): this is a random split of a short quarterly series with few
# crisis rows; the test set may contain very few (or no) crisis quarters, in
# which case AUC is not meaningful - consider a stratified or time-based split.
train_data, test_data = df_quarterly.randomSplit([0.8, 0.2], seed=42)

# Cache before the first action so every later fit/transform sees exactly the
# rows counted here, without recomputing the split.
train_data.cache()
test_data.cache()

print(f"  Training set: {train_data.count()} quarters")
print(f"  Test set:     {test_data.count()} quarters")

# Step 3: Model 1 - Logistic Regression
print("\n3. MODEL 1: LOGISTIC REGRESSION...")
start_time = time.time()

lr_pipeline = Pipeline(stages=[
    assembler,
    scaler,
    LogisticRegression(
        featuresCol="scaled_features",
        labelCol="is_crisis",
        maxIter=100,
        regParam=0.01
    )
])

# Train and score
lr_model = lr_pipeline.fit(train_data)
lr_predictions = lr_model.transform(test_data)

# Evaluate with the evaluator's default metric
lr_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")
lr_auc = lr_evaluator.evaluate(lr_predictions)

lr_time = time.time() - start_time

print(f"  ✓ Logistic Regression - AUC: {lr_auc:.4f} (Time: {lr_time:.2f}s)")

# Step 4: Model 2 - Linear Regression (for comparison)
print("\n4. MODEL 2: LINEAR REGRESSION...")
start_time = time.time()

linear_pipeline = Pipeline(stages=[
    assembler,
    scaler,
    LinearRegression(
        featuresCol="scaled_features",
        labelCol="is_crisis",
        maxIter=100,
        regParam=0.01
    )
])

# Train and score
linear_model = linear_pipeline.fit(train_data)
linear_predictions = linear_model.transform(test_data)

# Evaluate as a regression on the 0/1 label
linear_evaluator = RegressionEvaluator(
    labelCol="is_crisis",
    predictionCol="prediction",
    metricName="rmse"
)
linear_rmse = linear_evaluator.evaluate(linear_predictions)

linear_time = time.time() - start_time

print(f"  ✓ Linear Regression - RMSE: {linear_rmse:.4f} (Time: {linear_time:.2f}s)")

# Step 5: Model 3 - Simple Random Forest (baseline, unscaled features)
print("\n5. MODEL 3: SIMPLE RANDOM FOREST...")
start_time = time.time()

rf_simple_pipeline = Pipeline(stages=[
    assembler,
    RandomForestClassifier(
        featuresCol="features",
        labelCol="is_crisis",
        numTrees=50,
        maxDepth=10,
        seed=42
    )
])

# Train and score
rf_simple_model = rf_simple_pipeline.fit(train_data)
rf_simple_predictions = rf_simple_model.transform(test_data)

# Evaluate
rf_simple_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")
rf_simple_auc = rf_simple_evaluator.evaluate(rf_simple_predictions)

rf_simple_time = time.time() - start_time

print(f"  ✓ Simple Random Forest - AUC: {rf_simple_auc:.4f} (Time: {rf_simple_time:.2f}s)")

# Step 6: results summary
print("\n6. SIMPLE MODELS SUMMARY:")
print("=" * 50)
print(f"{'Model':<25} {'Metric':<10} {'Score':<10} {'Time(s)':<10}")
print("-" * 50)
print(f"{'Logistic Regression':<25} {'AUC':<10} {lr_auc:<10.4f} {lr_time:<10.2f}")
print(f"{'Linear Regression':<25} {'RMSE':<10} {linear_rmse:<10.4f} {linear_time:<10.2f}")
print(f"{'Simple Random Forest':<25} {'AUC':<10} {rf_simple_auc:<10.4f} {rf_simple_time:<10.2f}")
print("=" * 50)

# Step 7: feature importance of the random forest (last pipeline stage)
print("\n7. FEATURE IMPORTANCE (Simple Random Forest):")
rf_simple_feature_importance = rf_simple_model.stages[-1].featureImportances.toArray()
feature_importance_df = spark.createDataFrame([
    (feature_cols_final[i], float(rf_simple_feature_importance[i]))
    for i in range(len(feature_cols_final))
], ["feature", "importance"]).orderBy(col("importance").desc())

print("Top 10 most important features:")
feature_importance_df.show(10, truncate=False)

# Only the two classifiers are compared; the linear model reports RMSE.
print(f"\n✓ Simple models experiment completed!")
print(f"  Best simple model: {'Logistic Regression' if lr_auc > rf_simple_auc else 'Simple Random Forest'}")
print(f"  Best AUC: {builtins.max(lr_auc, rf_simple_auc):.4f}")


In [None]:
# Cell 6: Advanced Models Experiment (Random Forest, Gradient Boosting)
#
# Fits four tree-ensemble classifiers (with and without feature scaling) on
# `train_data`, scores them on `test_data`, and records every classifier's AUC
# in `all_models`.
#
# The notebook star-imports pyspark.sql.functions, which rebinds the name `max`
# to a Spark column function; the real built-in is reached through `builtins`.
import builtins

print("=" * 70)
print("THÍ NGHIỆM VỚI MÔ HÌNH NÂNG CAO")
print("=" * 70)


def fit_and_score_classifier(pipeline, evaluator):
    """Fit `pipeline` on train_data and evaluate it on test_data.

    Args:
        pipeline: unfitted pyspark.ml Pipeline ending in a classifier.
        evaluator: evaluator applied to the test-set predictions.

    Returns:
        (fitted pipeline model, test predictions, evaluator score,
        elapsed wall-clock seconds for fit + transform + evaluate).
    """
    started = time.time()
    fitted = pipeline.fit(train_data)
    predictions = fitted.transform(test_data)
    score = evaluator.evaluate(predictions)
    return fitted, predictions, score, time.time() - started


def importance_frame(importances):
    """Pair `feature_cols_final` with `importances`; return a DataFrame sorted by importance, descending."""
    return spark.createDataFrame(
        [(feature_cols_final[i], float(importances[i])) for i in range(len(feature_cols_final))],
        ["feature", "importance"],
    ).orderBy(col("importance").desc())


# Step 1: Model 4 - Advanced Random Forest (more and deeper trees)
print("\n1. MODEL 4: ADVANCED RANDOM FOREST...")
rf_advanced_pipeline = Pipeline(stages=[
    assembler,
    RandomForestClassifier(
        featuresCol="features",
        labelCol="is_crisis",
        numTrees=200,
        maxDepth=15,
        maxBins=32,
        subsamplingRate=0.8,
        seed=42
    )
])
rf_advanced_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")
rf_advanced_model, rf_advanced_predictions, rf_advanced_auc, rf_advanced_time = \
    fit_and_score_classifier(rf_advanced_pipeline, rf_advanced_evaluator)

print(f"  ✓ Advanced Random Forest - AUC: {rf_advanced_auc:.4f} (Time: {rf_advanced_time:.2f}s)")

# Step 2: Model 5 - Gradient Boosting Trees
print("\n2. MODEL 5: GRADIENT BOOSTING TREES...")
gbt_pipeline = Pipeline(stages=[
    assembler,
    GBTClassifier(
        featuresCol="features",
        labelCol="is_crisis",
        maxIter=100,
        maxDepth=6,
        stepSize=0.1,
        seed=42
    )
])
gbt_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")
gbt_model, gbt_predictions, gbt_auc, gbt_time = \
    fit_and_score_classifier(gbt_pipeline, gbt_evaluator)

print(f"  ✓ Gradient Boosting Trees - AUC: {gbt_auc:.4f} (Time: {gbt_time:.2f}s)")

# Step 3: Model 6 - Random Forest on standardized features
print("\n3. MODEL 6: RANDOM FOREST WITH SCALING...")
rf_scaled_pipeline = Pipeline(stages=[
    assembler,
    scaler,
    RandomForestClassifier(
        featuresCol="scaled_features",
        labelCol="is_crisis",
        numTrees=150,
        maxDepth=12,
        maxBins=32,
        subsamplingRate=0.9,
        seed=42
    )
])
rf_scaled_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")
rf_scaled_model, rf_scaled_predictions, rf_scaled_auc, rf_scaled_time = \
    fit_and_score_classifier(rf_scaled_pipeline, rf_scaled_evaluator)

print(f"  ✓ Random Forest with Scaling - AUC: {rf_scaled_auc:.4f} (Time: {rf_scaled_time:.2f}s)")

# Step 4: Model 7 - Gradient Boosting on standardized features
print("\n4. MODEL 7: GRADIENT BOOSTING WITH SCALING...")
gbt_scaled_pipeline = Pipeline(stages=[
    assembler,
    scaler,
    GBTClassifier(
        featuresCol="scaled_features",
        labelCol="is_crisis",
        maxIter=150,
        maxDepth=8,
        stepSize=0.05,
        seed=42
    )
])
gbt_scaled_evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")
gbt_scaled_model, gbt_scaled_predictions, gbt_scaled_auc, gbt_scaled_time = \
    fit_and_score_classifier(gbt_scaled_pipeline, gbt_scaled_evaluator)

print(f"  ✓ Gradient Boosting with Scaling - AUC: {gbt_scaled_auc:.4f} (Time: {gbt_scaled_time:.2f}s)")

# Step 5: results summary
print("\n5. ADVANCED MODELS SUMMARY:")
print("=" * 60)
print(f"{'Model':<30} {'AUC':<10} {'Time(s)':<10}")
print("-" * 60)
print(f"{'Advanced Random Forest':<30} {rf_advanced_auc:<10.4f} {rf_advanced_time:<10.2f}")
print(f"{'Gradient Boosting Trees':<30} {gbt_auc:<10.4f} {gbt_time:<10.2f}")
print(f"{'Random Forest + Scaling':<30} {rf_scaled_auc:<10.4f} {rf_scaled_time:<10.2f}")
print(f"{'Gradient Boosting + Scaling':<30} {gbt_scaled_auc:<10.4f} {gbt_scaled_time:<10.2f}")
print("=" * 60)

# Step 6: feature importance analysis (classifier is the last pipeline stage)
print("\n6. FEATURE IMPORTANCE ANALYSIS:")

print("\n  Random Forest Feature Importance:")
rf_advanced_feature_importance = rf_advanced_model.stages[-1].featureImportances.toArray()
rf_importance_df = importance_frame(rf_advanced_feature_importance)

print("  Top 10 most important features (Random Forest):")
rf_importance_df.show(10, truncate=False)

print("\n  Gradient Boosting Feature Importance:")
gbt_feature_importance = gbt_model.stages[-1].featureImportances.toArray()
gbt_importance_df = importance_frame(gbt_feature_importance)

print("  Top 10 most important features (Gradient Boosting):")
gbt_importance_df.show(10, truncate=False)

# Step 7: best classifier so far, by test AUC (includes the simple-model cell)
all_models = {
    'Logistic Regression': lr_auc,
    'Simple Random Forest': rf_simple_auc,
    'Advanced Random Forest': rf_advanced_auc,
    'Gradient Boosting Trees': gbt_auc,
    'Random Forest + Scaling': rf_scaled_auc,
    'Gradient Boosting + Scaling': gbt_scaled_auc
}

best_model_name = builtins.max(all_models, key=all_models.get)
best_auc = all_models[best_model_name]

print(f"\n7. BEST MODEL SO FAR:")
print(f"  Model: {best_model_name}")
print(f"  AUC: {best_auc:.4f}")

print(f"\n✓ Advanced models experiment completed!")
print(f"  Best advanced model: {best_model_name}")
print(f"  Best AUC: {best_auc:.4f}")


In [None]:
# Cell 7: Hyperparameter Tuning for Random Forest
#
# Grid-searches a random forest with 3-fold cross validation on `train_data`,
# evaluates the winner on `test_data`, compares it with the earlier models and
# saves it. Rebinds `best_auc` to the tuned model's test AUC (the final
# comparison cell reads it under that name).
#
# The notebook star-imports pyspark.sql.functions, which rebinds `max`/`min`
# to Spark column functions; the real built-ins are reached through `builtins`.
import builtins

print("=" * 70)
print("HYPERPARAMETER TUNING CHO RANDOM FOREST")
print("=" * 70)

# Step 1: prepare for cross validation
print("\n1. PREPARING FOR CROSS VALIDATION...")

# Untuned classifier whose params the grid below overrides.
base_rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="is_crisis",
    seed=42
)

rf_tuning_pipeline = Pipeline(stages=[assembler, base_rf])

# Step 2: define the parameter grid
print("\n2. DEFINING PARAMETER GRID...")

paramGrid = ParamGridBuilder() \
    .addGrid(base_rf.numTrees, [100, 200, 300]) \
    .addGrid(base_rf.maxDepth, [10, 15, 20]) \
    .addGrid(base_rf.maxBins, [16, 32, 64]) \
    .addGrid(base_rf.subsamplingRate, [0.8, 0.9, 1.0]) \
    .build()

print(f"  Total combinations: {len(paramGrid)}")
print("  Parameters to test:")
print("    - numTrees: [100, 200, 300]")
print("    - maxDepth: [10, 15, 20]")
print("    - maxBins: [16, 32, 64]")
print("    - subsamplingRate: [0.8, 0.9, 1.0]")

# Step 3: cross validation setup
print("\n3. SETTING UP CROSS VALIDATION...")

evaluator = BinaryClassificationEvaluator(labelCol="is_crisis")

cv = CrossValidator(
    estimator=rf_tuning_pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,  # 3-fold CV
    seed=42
)

print("  ✓ Cross Validator created with 3-fold CV")

# Step 4: run hyperparameter tuning
print("\n4. RUNNING HYPERPARAMETER TUNING...")
print("  This may take several minutes...")

start_time = time.time()

cv_model = cv.fit(train_data)

tuning_time = time.time() - start_time

print(f"  ✓ Hyperparameter tuning completed in {tuning_time:.2f} seconds")

# Step 5: get the best model
print("\n5. GETTING BEST MODEL...")

best_model = cv_model.bestModel
best_params = best_model.stages[-1].extractParamMap()

# CrossValidatorModel exposes the per-combination average metrics but no
# `bestIndex` attribute; the winning score is the extreme of avgMetrics in the
# direction the evaluator considers better.
pick_best = builtins.max if evaluator.isLargerBetter() else builtins.min
best_score = pick_best(cv_model.avgMetrics)

print(f"  ✓ Best model found!")
print(f"  Best AUC: {best_score:.4f}")

# Step 6: show the best classifier's full parameter map
print("\n6. BEST PARAMETERS:")
print("=" * 40)

for param, value in best_params.items():
    print(f"  {param.name}: {value}")

# Step 7: evaluate the best model on the held-out test set
print("\n7. EVALUATING BEST MODEL ON TEST SET...")

best_predictions = best_model.transform(test_data)

best_auc = evaluator.evaluate(best_predictions)

print(f"  ✓ Best model test AUC: {best_auc:.4f}")

# Step 8: compare with previous models
print("\n8. COMPARISON WITH PREVIOUS MODELS:")
print("=" * 60)
print(f"{'Model':<30} {'AUC':<10} {'Improvement':<15}")
print("-" * 60)

# Skip the tuned model's own entry: the final comparison cell adds it to
# `all_models`, so it would otherwise be compared with itself on a re-run.
previous_best_auc = builtins.max(
    auc for name, auc in all_models.items() if name != 'Tuned Random Forest'
)
improvement = best_auc - previous_best_auc

print(f"{'Previous Best':<30} {previous_best_auc:<10.4f} {'-':<15}")
print(f"{'Tuned Random Forest':<30} {best_auc:<10.4f} {improvement:+.4f}")
print("=" * 60)

if improvement > 0:
    print(f"  ✓ Improvement: +{improvement:.4f} AUC")
else:
    print(f"  ⚠ No improvement: {improvement:.4f} AUC")

# Step 9: feature importance of the best model
print("\n9. FEATURE IMPORTANCE OF BEST MODEL:")

best_feature_importance = best_model.stages[-1].featureImportances.toArray()
best_importance_df = spark.createDataFrame([
    (feature_cols_final[i], float(best_feature_importance[i]))
    for i in range(len(feature_cols_final))
], ["feature", "importance"]).orderBy(col("importance").desc())

print("Top 10 most important features (Tuned Random Forest):")
best_importance_df.show(10, truncate=False)

# Step 10: save the best model (overwrites any previous save at this path)
print("\n10. SAVING BEST MODEL...")

best_model.write().overwrite().save("./model/best_random_forest_model")

print("  ✓ Best model saved to ./model/best_random_forest_model")

print(f"\n✓ Hyperparameter tuning completed!")
print(f"  Best model: Tuned Random Forest")
print(f"  Best AUC: {best_auc:.4f}")
print(f"  Tuning time: {tuning_time:.2f} seconds")


In [None]:
# Cell 8: Final Model Comparison and Selection
#
# Ranks every classifier trained in the notebook by test AUC, prints summary
# statistics, and shows the feature importances of the winner when it is a
# tree ensemble.
#
# The notebook star-imports pyspark.sql.functions, which rebinds `sum`, `min`
# and `max` to Spark column functions; the real built-ins are reached through
# `builtins`.
import builtins

print("=" * 70)
print("SO SÁNH KẾT QUẢ VÀ CHỌN MÔ HÌNH TỐT NHẤT")
print("=" * 70)


def performance_level(auc):
    """Map an AUC to 'Excellent' (>= 0.9), 'Good' (>= 0.8), 'Fair' (>= 0.7) or 'Poor'."""
    if auc >= 0.9:
        return "Excellent"
    if auc >= 0.8:
        return "Good"
    if auc >= 0.7:
        return "Fair"
    return "Poor"


# Step 1: collect all results
print("\n1. COLLECTING ALL MODEL RESULTS...")

# `best_auc` holds the tuned random forest's test AUC from the tuning cell.
all_models['Tuned Random Forest'] = best_auc

results_data = [(model_name, auc_score) for model_name, auc_score in all_models.items()]

results_df = spark.createDataFrame(results_data, ["Model", "AUC"]).orderBy(col("AUC").desc())

print("  ✓ All model results collected")

# Step 2: display the ranked results table
print("\n2. FINAL MODEL COMPARISON:")
print("=" * 70)
print(f"{'Rank':<5} {'Model':<35} {'AUC':<10} {'Performance':<15}")
print("-" * 70)

ranked_results = results_df.collect()
for i, row in enumerate(ranked_results, 1):
    model_name = row['Model']
    auc_score = row['AUC']
    performance = performance_level(auc_score)
    print(f"{i:<5} {model_name:<35} {auc_score:<10.4f} {performance:<15}")

print("=" * 70)

# Step 3: summary statistics over the AUC scores (population std deviation)
print("\n3. STATISTICAL ANALYSIS:")

auc_scores = [row['AUC'] for row in ranked_results]
mean_auc = builtins.sum(auc_scores) / len(auc_scores)
max_auc = builtins.max(auc_scores)
min_auc = builtins.min(auc_scores)
std_auc = (builtins.sum((x - mean_auc) ** 2 for x in auc_scores) / len(auc_scores)) ** 0.5

print(f"  Mean AUC: {mean_auc:.4f}")
print(f"  Max AUC:  {max_auc:.4f}")
print(f"  Min AUC:  {min_auc:.4f}")
print(f"  Std AUC:  {std_auc:.4f}")

# Step 4: best model analysis
print("\n4. BEST MODEL ANALYSIS:")

best_model_name = ranked_results[0]['Model']
best_auc_score = ranked_results[0]['AUC']

print(f"  🏆 BEST MODEL: {best_model_name}")
print(f"  🎯 BEST AUC: {best_auc_score:.4f}")

# Interpretation line for each performance level
performance_messages = {
    "Excellent": "  📈 Performance: EXCELLENT - Model can reliably predict crisis periods",
    "Good": "  📈 Performance: GOOD - Model shows strong predictive capability",
    "Fair": "  📈 Performance: FAIR - Model has moderate predictive power",
    "Poor": "  📈 Performance: POOR - Model needs improvement",
}
print(performance_messages[performance_level(best_auc_score)])

# Step 5: feature importance of the best model
print("\n5. FEATURE IMPORTANCE ANALYSIS (BEST MODEL):")

# Every tree-ensemble pipeline (random forest and gradient boosting alike)
# ends in a classifier that exposes featureImportances; logistic regression
# has no entry here.
tree_models = {
    'Simple Random Forest': rf_simple_model,
    'Advanced Random Forest': rf_advanced_model,
    'Gradient Boosting Trees': gbt_model,
    'Random Forest + Scaling': rf_scaled_model,
    'Gradient Boosting + Scaling': gbt_scaled_model,
    'Tuned Random Forest': best_model,
}

winning_tree_model = tree_models.get(best_model_name)
if winning_tree_model is None:
    print("  Feature importance not available for this model type")
    best_importance_df = None
else:
    winning_importances = winning_tree_model.stages[-1].featureImportances.toArray()
    best_importance_df = spark.createDataFrame([
        (feature_cols_final[i], float(winning_importances[i]))
        for i in range(len(feature_cols_final))
    ], ["feature", "importance"]).orderBy(col("importance").desc())

if best_importance_df is not None:
    print("\n  Top 10 Most Important Features:")
    best_importance_df.show(10, truncate=False)

    # Top 5 features
    top_5_features = best_importance_df.limit(5).collect()
    print("\n  🎯 Top 5 Most Critical Features for Crisis Prediction:")
    for i, row in enumerate(top_5_features, 1):
        print(f"    {i}. {row['feature']} (Importance: {row['importance']:.4f})")

# Step 6: model recommendations
print("\n6. MODEL RECOMMENDATIONS:")

print("  📋 RECOMMENDATIONS:")
print("    ✓ Use the best performing model for production")
print("    ✓ Monitor model performance over time")
print("    ✓ Retrain model with new data periodically")
print("    ✓ Consider ensemble methods for improved accuracy")

if best_auc_score < 0.8:
    print("    ⚠ Consider collecting more features or data")
    print("    ⚠ Try different algorithms or ensemble methods")

# Step 7: business impact analysis
print("\n7. BUSINESS IMPACT ANALYSIS:")

print("  💼 BUSINESS IMPACT:")
# AUC measures how well the model ranks crisis above normal quarters; it is
# not a classification accuracy, so it is reported as AUC.
print(f"    • Model separates crisis from normal quarters with a test AUC of {best_auc_score:.4f}")
print("    • Early warning system for aviation industry")
print("    • Helps in strategic planning and risk management")
print("    • Supports decision making during uncertain times")

# Step 8: next steps
print("\n8. NEXT STEPS:")

print("  🚀 NEXT STEPS:")
print("    1. Deploy the best model to production")
print("    2. Set up monitoring and alerting systems")
print("    3. Create dashboard for real-time crisis prediction")
print("    4. Integrate with existing business systems")
print("    5. Plan for model retraining schedule")

# Step 9: final summary
print("\n9. FINAL SUMMARY:")
print("=" * 50)

# Wall-clock seconds spent fitting and scoring all models, including tuning.
total_experiment_time = builtins.sum([
    lr_time, linear_time, rf_simple_time, rf_advanced_time,
    gbt_time, rf_scaled_time, gbt_scaled_time, tuning_time,
])

print(f"🎯 EXPERIMENT COMPLETED SUCCESSFULLY!")
print(f"📊 Total models tested: {len(all_models)}")
print(f"🏆 Best model: {best_model_name}")
print(f"🎯 Best AUC: {best_auc_score:.4f}")
print(f"⏱️ Total experiment time: ~{total_experiment_time:.0f} seconds")

print("\n" + "=" * 50)
print("✅ CRISIS PREDICTION MODEL READY FOR DEPLOYMENT!")
print("=" * 50)
