# Flight Fare Prediction - Exploratory Data Analysis

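This notebook performs exploratory data analysis (EDA) on the Bangladesh flight price dataset, then trains and evaluates several fare-prediction models (Sections 8–10).
It uses reusable functions from `ml.src` and saves all plots to `output/figures/`.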

In [None]:
import os
import sys

# Add project root to path
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.insert(0, PROJECT_ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ml.src.data_loader import load_data, load_raw_data
from ml.src.evaluate import evaluate_all_models, save_metrics
from ml.src.preprocessing import prepare_features, split_data
from ml.src.train import train_all_models
from ml.src.visualize import (
    plot_fare_distribution,
    plot_fare_by_airline,
    plot_avg_fare_by_season,
    plot_avg_fare_by_month,
    plot_correlation_heatmap,
    plot_predicted_vs_actual,
    plot_feature_importance,
    plot_model_comparison,
)

%matplotlib inline
sns.set_theme(style='whitegrid', font_scale=1.1)

FIGURES_DIR = os.path.join(PROJECT_ROOT, 'output', 'figures')
os.makedirs(FIGURES_DIR, exist_ok=True)

print(f'Project root: {PROJECT_ROOT}')
print(f'Figures will be saved to: {FIGURES_DIR}')

## 1. Load Data

In [None]:
# Load the raw dataset through the project loader (per the original note,
# all columns are kept for EDA).
df = load_raw_data()
print(f'Dataset shape: {df.shape}')
# Last expression of the cell: renders the first rows as a preview.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage, as printed by DataFrame.info().
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) via
# DataFrame.describe() with its default column selection.
df.describe()

In [None]:
# Count nulls per column, then display only the columns that have at least one.
missing = df.isna().sum()
missing.loc[missing.gt(0)]

## 2. Fare Distribution

In [None]:
# Fare distribution plot from ml.src.visualize. FIGURES_DIR is passed as
# output_dir — presumably the helper also saves the figure there (confirm in ml.src.visualize).
fig = plot_fare_distribution(df, output_dir=FIGURES_DIR)
plt.show()

## 3. Fare by Airline (Boxplot)

In [None]:
# Per-airline fare plot (a boxplot, per the section heading) from ml.src.visualize.
# output_dir=FIGURES_DIR — presumably the helper also saves the figure there (confirm).
fig = plot_fare_by_airline(df, output_dir=FIGURES_DIR)
plt.show()

## 4. Average Fare by Season

In [None]:
# Average-fare-by-season plot from ml.src.visualize.
# output_dir=FIGURES_DIR — presumably the helper also saves the figure there (confirm).
fig = plot_avg_fare_by_season(df, output_dir=FIGURES_DIR)
plt.show()

## 5. Average Fare by Month

In [None]:
# Average-fare-by-month plot from ml.src.visualize.
# output_dir=FIGURES_DIR — presumably the helper also saves the figure there (confirm).
fig = plot_avg_fare_by_month(df, output_dir=FIGURES_DIR)
plt.show()

## 6. Correlation Heatmap

In [None]:
# Correlation heatmap from ml.src.visualize, computed on the raw EDA frame.
# output_dir=FIGURES_DIR — presumably the helper also saves the figure there (confirm).
fig = plot_correlation_heatmap(df, output_dir=FIGURES_DIR)
plt.show()

## 7. KPI Exploration

In [None]:
# KPI 1: mean fare and number of fare records per airline, most expensive first.
print('=== Average Fare per Airline ===')
avg_by_airline = (
    df.groupby('airline')['total_fare_bdt']
    .agg(avg_fare='mean', flight_count='count')
    .sort_values('avg_fare', ascending=False)
)
print(avg_by_airline.to_string())

# KPI 2: the ten (source, destination) pairs with the most rows.
print('\n=== Most Popular Routes ===')
routes = (
    df.groupby(['source', 'destination'])
    .size()
    .sort_values(ascending=False)
    .head(10)
)
print(routes.to_string())

# KPI 3: the five (source, destination) pairs with the highest mean fare.
print('\n=== Top 5 Most Expensive Routes ===')
expensive = (
    df.groupby(['source', 'destination'])['total_fare_bdt']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)
print(expensive.to_string())

# KPI 4: mean, spread and row count of fares per seasonality label.
print('\n=== Seasonal Fare Variation ===')
seasonal = (
    df.groupby('seasonality')['total_fare_bdt']
    .agg(['mean', 'std', 'count'])
    .sort_values('mean', ascending=False)
)
print(seasonal.to_string())

## 8. Model Training & Evaluation

Train the candidate models, evaluate them on the held-out test split, and generate predicted-vs-actual and feature-importance plots for the best one.

In [None]:
# The preprocessing / training / evaluation helpers used from here on are
# imported in the notebook's first cell, alongside the other project imports,
# so that all dependencies are declared in one place.

# Load the modelling dataset from CSV. Per the original note this variant
# carries no leakage columns; that filtering lives in ml.src.data_loader.
ml_df = load_data(source='csv')
print(f'ML data shape: {ml_df.shape}')

# Build the feature matrix and target, then hold out a test split.
# `scaler` is returned by prepare_features but is not referenced again in
# the cells shown here.
X, y, scaler, feature_names = prepare_features(ml_df)
X_train, X_test, y_train, y_test = split_data(X, y)

print(f'Train: {X_train.shape}, Test: {X_test.shape}')
print(f'Features: {len(feature_names)}')

In [None]:
# Train all models on the training split.
# NOTE(review): cv=5 and n_iter=10 are passed straight to train_all_models —
# presumably the number of CV folds and of hyper-parameter search iterations;
# confirm in ml.src.train. `cv_scores` is not used again in the cells shown here.
trained_models, cv_scores = train_all_models(X_train, y_train, cv=5, n_iter=10)

# Evaluate every trained model on the held-out test split; `predictions` is
# later indexed by model name.
metrics_df, predictions = evaluate_all_models(trained_models, X_test, y_test)
# Last expression of the cell: renders the metrics table.
metrics_df

In [None]:
# Model comparison plot built from the metrics table produced by the evaluation cell.
# output_dir=FIGURES_DIR — presumably the helper also saves the figure there (confirm).
fig = plot_model_comparison(metrics_df, output_dir=FIGURES_DIR)
plt.show()

## 9. Predicted vs Actual (Best Model)

In [None]:
# Take the model named in the first row of the metrics table.
# NOTE(review): this assumes evaluate_all_models returns metrics_df sorted
# best-first — confirm in ml.src.evaluate.
best_name = metrics_df['model'].iloc[0]
best_model = trained_models[best_name]
best_preds = predictions[best_name]

# Scatter of test-set targets against that model's predictions.
fig = plot_predicted_vs_actual(
    y_test,
    best_preds,
    best_name,
    output_dir=FIGURES_DIR,
)
plt.show()

## 10. Feature Importance

In [None]:
# Pick whichever importance signal the best model exposes:
#   * `feature_importances_` is used as-is;
#   * otherwise the absolute value of `coef_` stands in as a magnitude-based proxy.
# NOTE(review): if train_all_models returns wrapper objects (e.g. a search or
# pipeline object) rather than bare estimators, neither attribute may be
# exposed and the fallback message is printed — confirm in ml.src.train.
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    importances = np.abs(best_model.coef_)
else:
    importances = None

if importances is None:
    print('Best model does not expose feature importances.')
else:
    # Single plotting path for both importance sources; shows the top 20 features.
    fig = plot_feature_importance(
        importances, feature_names,
        best_name, top_n=20, output_dir=FIGURES_DIR
    )
    plt.show()

## Summary

**Key Findings:**
- Analyzed the fare distribution, the most popular and most expensive routes, seasonal fare patterns, and average fare levels by airline.
- Trained and compared multiple ML models; see the metrics table and model comparison plot in Section 8.
- All plots saved to `output/figures/` for report integration.