# Flight Price Prediction

References:
- model selection: https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html
- learning paths: https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.html

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.stattools import durbin_watson

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_columns', 50)

SIMPLIFIED_PLOTS = True
PLOT_SUMSAMPLE = 2000 # Used for heavy plots like scatter

In [3]:
df_raw = pd.read_csv("./data/Clean_Dataset.csv", index_col=0)
df_raw_sample = df_raw.sample(n=PLOT_SUMSAMPLE)
df_raw 

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,UK-822,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,UK-826,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,UK-832,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,UK-828,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


## Exploratory Data Analysis
- Top 5 Highest correlation predictors: class, duration, days_left, stops, airline
- Target: Bimodal due to classes. 
- duration class:Business: Log, heteroskedasticity 
- duration class:Economy: Linear, heteroskedasticity
- days_left class:economy: 1/x

In [4]:
# Create EDA report IF it doesn't exist
path_eda_report = Path("EDA/profiling_report.html")
path_eda_report.parent.mkdir(exist_ok=True)
if not path_eda_report.exists():
    profile = ProfileReport(df_raw, title="Profiling Report")
    profile.to_file(path_eda_report)

In [5]:
df_raw.describe()

Unnamed: 0,duration,days_left,price
count,300153.0,300153.0,300153.0
mean,12.221021,26.004751,20889.660523
std,7.191997,13.561004,22697.767366
min,0.83,1.0,1105.0
25%,6.83,15.0,4783.0
50%,11.25,26.0,7425.0
75%,16.17,38.0,42521.0
max,49.83,49.0,123071.0


In [None]:
px.histogram(df_raw, x="price", color="class")
# df_raw.price.hist(bins=20)

In [7]:
if SIMPLIFIED_PLOTS:
    fig = px.scatter(df_raw_sample, x="days_left", y="price", color="class")
else: 
    fig = px.scatter(df_raw, x="days_left", y="price", color="class", marginal_y="violin", marginal_x="box", trendline="ols", template="simple_white")
fig.show()

In [8]:
if SIMPLIFIED_PLOTS:
    fig = px.scatter(df_raw_sample, x="duration", y="price", color="class")
else:
    fig = px.scatter(df_raw, x="duration", y="price", color="class", marginal_y="violin", marginal_x="box", trendline="ols", template="simple_white")
fig.show()

## Data Cleaning and Preprocessing

In [9]:
df = df_raw.copy()

df.drop(columns=["flight"], inplace=True)

y = df['price']
X = df.drop(columns=['price'])

X_encoded = pd.get_dummies(X, drop_first=True)

# Step 5: Add intercept
X_encoded = sm.add_constant(X_encoded)

# Make sure all data is numeric
X_encoded = X_encoded.astype(float)
y = y.astype(float)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,const,duration,days_left,airline_Air_India,airline_GO_FIRST,airline_Indigo,airline_SpiceJet,airline_Vistara,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai,departure_time_Early_Morning,departure_time_Evening,departure_time_Late_Night,departure_time_Morning,departure_time_Night,stops_two_or_more,stops_zero,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai,class_Economy
148417,1.0,19.42,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
36879,1.0,7.00,13.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
274531,1.0,21.17,44.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
166397,1.0,10.25,11.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
272722,1.0,26.50,5.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,1.0,20.50,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
259178,1.0,25.42,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
131932,1.0,13.67,29.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
146867,1.0,8.33,39.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


## Lasso, Statsmodels Implementation

In [10]:
def evaluate_predictions(y_true, y_pred):
    # Metrics
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    print(f"R²: {r2:.3f}")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")

def show_nonzero_coefficients(coef:pd.Series):
    # Filter non-zero coefficients
    nonzero_coef = coef[coef != 0]
    print(f"\nSelected Coefficients (non-zero): {len(nonzero_coef)}")
    # Build DataFrame with sign and absolute importance
    coef_df = pd.DataFrame({
        "Selected Feature": nonzero_coef.index,
        "Coefficient": nonzero_coef.values,
        "Importance": nonzero_coef.abs().values
    })

    # Sort by absolute importance (descending)
    coef_df = coef_df.sort_values(by="Importance", ascending=False).reset_index(drop=True)
    coef_df.drop(columns="Importance", inplace=True)
    return coef_df

In [11]:
# Step 4: Scale numeric features
X_preprocessed = X_train.copy()
NUMERICAL_COLS = ['duration', 'days_left']
scaler = StandardScaler()
X_preprocessed[NUMERICAL_COLS] = scaler.fit_transform(X_preprocessed[NUMERICAL_COLS])

# Step 6: Fit Lasso using statsmodels (fit_regularized)
model = sm.OLS(y_train, X_preprocessed)
lasso_result = model.fit_regularized(method='elastic_net', alpha=1.0, L1_wt=1.0)

# Predictions
y_pred = lasso_result.predict(X_preprocessed)
print("Metrics Training:")
evaluate_predictions(y_train, y_pred)

# Step 7: Display coefficients
coef = pd.Series(lasso_result.params, index=X_preprocessed.columns)
show_nonzero_coefficients(coef)

Metrics Training:
R²: 0.911
MSE: 45875614.68
RMSE: 6773.15
MAE: 4594.10

Selected Coefficients (non-zero): 25


Unnamed: 0,Selected Feature,Coefficient
0,const,50748.899859
1,class_Economy,-44722.005244
2,stops_zero,-7510.561076
3,airline_Vistara,3178.035956
4,days_left,-1782.591587
5,source_city_Kolkata,1706.785602
6,destination_city_Hyderabad,-1675.968602
7,source_city_Hyderabad,-1639.671488
8,destination_city_Delhi,-1507.124571
9,destination_city_Kolkata,1493.95061


## Lasso, ScikitLearn Implementation

- Same training R2 as Statsmodels but slightly different coefficients and MSE for some reason. Scaling difference?
- Statsmodels doesn't give information table, so we might as well use Scikitlearn
- It's also x10 times faster

In [12]:
# 4. Build pipeline: scale numericals and fit Lasso
# Preprocessing: scale numericals, pass-through the rest
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERICAL_COLS)
    ],
    remainder='passthrough'  # leave one-hot encoded dummies unchanged
)

lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Lasso(alpha=1.0))
])

lasso_pipeline.fit(X_train, y_train)

# 5. Predictions and metrics
y_pred = lasso_pipeline.predict(X_train)
print("Metrics Training:")
evaluate_predictions(y_train, y_pred )

# 6. Extract coefficients
lasso = lasso_pipeline.named_steps['model']
coef = pd.Series(lasso.coef_, index=X_encoded.columns)
show_nonzero_coefficients(coef)

Metrics Training:
R²: 0.911
MSE: 45592781.12
RMSE: 6752.24
MAE: 4573.16

Selected Coefficients (non-zero): 29


Unnamed: 0,Selected Feature,Coefficient
0,class_Economy,-44927.039846
1,stops_zero,-7594.554556
2,airline_Vistara,3959.834212
3,airline_SpiceJet,2158.949125
4,stops_two_or_more,2066.661971
5,airline_Indigo,2010.066314
6,duration,-1769.48933
7,destination_city_Hyderabad,-1683.962152
8,source_city_Hyderabad,-1647.005015
9,source_city_Kolkata,1608.552126


## CrossValidation

In [13]:
# 3. Define alpha (lambda) values to search
alphas = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30 ,100, 300, 1000]

# 4. Create pipeline with LassoCV
lassocv_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso_cv', LassoCV(alphas=alphas, cv=10, max_iter=10000))
])

# 6. Fit pipeline
lassocv_pipeline.fit(X_train, y_train)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




In [14]:
# 7. Best lambda
best_alpha = lassocv_pipeline.named_steps['lasso_cv'].alpha_
print(f"Best lambda (alpha) from CV: {best_alpha}")

print("Metrics Train:")
y_pred = lassocv_pipeline.predict(X_train)
evaluate_predictions(y_train, y_pred)

print("\nMetrics Test:")
y_pred = lassocv_pipeline.predict(X_test)
evaluate_predictions(y_test, y_pred)

# 6. Extract coefficients
lasso = lassocv_pipeline.named_steps['lasso_cv']
coef = pd.Series(lasso.coef_, index=X_encoded.columns)
show_nonzero_coefficients(coef)

Best lambda (alpha) from CV: 0.001
Metrics Train:
R²: 0.911
MSE: 45591331.05
RMSE: 6752.14
MAE: 4573.98

Metrics Test:
R²: 0.911
MSE: 45720771.19
RMSE: 6761.71
MAE: 4553.29

Selected Coefficients (non-zero): 30


Unnamed: 0,Selected Feature,Coefficient
0,class_Economy,-44929.059741
1,stops_zero,-7613.315572
2,airline_Vistara,4068.801384
3,airline_SpiceJet,2303.840003
4,airline_Indigo,2124.210815
5,stops_two_or_more,2105.203233
6,duration,-1769.90824
7,destination_city_Hyderabad,-1712.043004
8,airline_GO_FIRST,1691.971492
9,source_city_Hyderabad,-1675.405911


In [15]:
import plotly.graph_objects as go

# Extract model and aligned data
lasso_cv = lassocv_pipeline.named_steps['lasso_cv']
alphas_used = lasso_cv.alphas_  # Correct order (descending)
mean_mse = lasso_cv.mse_path_.mean(axis=1)
std_mse = lasso_cv.mse_path_.std(axis=1)
best_alpha = lasso_cv.alpha_

# Step 2: Find minimum mean MSE and threshold
min_index = np.argmin(mean_mse)
mse_min = mean_mse[min_index]
mse_threshold = mse_min + std_mse[min_index]

# Find largest alpha within 1-SE threshold (first from the top that satisfies condition)
alpha_1se = alphas_used[mean_mse <= mse_threshold][0]

print(f"Best alpha (min MSE): {best_alpha}")
print(f"MSE_min: {mse_min:.2f}")
print(f"1_SE: {mse_min:.2f}")
print(f"MSE_min + 1-SE threshold: {mse_threshold:.2f}")
print(f"Alpha from 1-SE rule: {alpha_1se}")

# Plot
fig = go.Figure()

# Mean MSE with error bars
fig.add_trace(go.Scatter(
    x=alphas_used,
    y=mean_mse,
    error_y=dict(
        type='data',
        array=std_mse,
        visible=True
    ),
    mode='lines+markers',
    name='Mean CV MSE',
    line=dict(color='royalblue'),
    marker=dict(size=8)
))

# Vertical line for best alpha
fig.add_trace(go.Scatter(
    x=[best_alpha, best_alpha],
    y=[min(mean_mse) * 0.95, max(mean_mse) * 1.05],
    mode='lines',
    name=f'Best alpha = {best_alpha:.4f}',
    line=dict(color='red', dash='dash')
))

# Layout
fig.update_layout(
    title='LassoCV: Mean Cross-Validation Error vs Alpha',
    xaxis=dict(title='Alpha (log scale)', type='log'),
    yaxis=dict(title='Mean CV MSE'),
    legend=dict(x=0.01, y=0.99),
    template='plotly_white',
    width=800,
    height=500
)

# Add vertical line for 1-SE rule alpha
fig.add_trace(go.Scatter(
    x=[alpha_1se, alpha_1se],
    y=[min(mean_mse) * 0.95, max(mean_mse) * 1.05],
    mode='lines',
    name=f'1-SE alpha = {alpha_1se:.4f}',
    line=dict(color='green', dash='dot')
))

fig.show()

Best alpha (min MSE): 0.001
MSE_min: 45602697.61
1_SE: 45602697.61
MSE_min + 1-SE threshold: 46236502.63
Alpha from 1-SE rule: 30.0


In [17]:
import numpy as np
import plotly.graph_objects as go
from sklearn.linear_model import lasso_path

# Compute Lasso path
X_pipeline = preprocessor.fit_transform(X_train)
alphas_lasso, coefs_lasso, _ = lasso_path(X_pipeline, y_train, eps=5e-4)
feature_names = X_train.columns.to_list()

# Count non-zero coefficients per alpha
nonzero_counts = (coefs_lasso != 0).sum(axis=0)

# Find where non-zero count changes
change_indices = np.where(np.diff(nonzero_counts) != 0)[0] + 1
change_alphas = alphas_lasso[change_indices]
change_counts = nonzero_counts[change_indices]

# Create base plot: coefficient paths
fig = go.Figure()

for coef, name in zip(coefs_lasso, feature_names):
    fig.add_trace(go.Scatter(
        x=alphas_lasso,
        y=coef,
        mode='lines',
        name=name
    ))

# Add vertical dotted lines where non-zero feature count changes
for alpha, count in zip(change_alphas, change_counts):
    fig.add_trace(go.Scatter(
        x=[alpha, alpha],
        y=[coefs_lasso.min(), coefs_lasso.max()],
        mode='lines',
        line=dict(dash='dot', color='gray'),
        showlegend=False,
        hovertemplate=f"Alpha: {alpha:.4f}<br>Non-zero features: {count}<extra></extra>"
    ))

# Final layout
fig.update_layout(
    title='Lasso Regularization Path (with Feature Selection Steps)',
    xaxis=dict(title='Alpha (log scale)', type='log'),
    yaxis=dict(title='Coefficient Value'),
    template='plotly_white',
    legend_title_text='Features',
    width=900,
    height=600
)

fig.show()


## Final Selected Model

In [18]:
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Lasso(alpha=alpha_1se))
])

lasso_pipeline.fit(X_train, y_train)

print(f"Selected Alpha from 1-SE rule: {alpha_1se}")

# 5. Predictions and metrics
print("\nMetrics Training:")
y_pred = lasso_pipeline.predict(X_train)
evaluate_predictions(y_train, y_pred )

print("\nMetrics Test:")
y_pred = lasso_pipeline.predict(X_test)
evaluate_predictions(y_test, y_pred )

# 6. Extract coefficients
lasso = lasso_pipeline.named_steps['model']
coef = pd.Series(lasso.coef_, index=X_encoded.columns)
show_nonzero_coefficients(coef)

Selected Alpha from 1-SE rule: 30.0

Metrics Training:
R²: 0.911
MSE: 45938135.87
RMSE: 6777.77
MAE: 4546.78

Metrics Test:
R²: 0.911
MSE: 46065085.54
RMSE: 6787.13
MAE: 4526.19

Selected Coefficients (non-zero): 19


Unnamed: 0,Selected Feature,Coefficient
0,class_Economy,-44712.037512
1,stops_zero,-7233.533484
2,airline_Vistara,2688.257792
3,duration,-1753.843322
4,source_city_Kolkata,1534.823157
5,destination_city_Hyderabad,-1364.36319
6,destination_city_Kolkata,1315.960962
7,source_city_Hyderabad,-1279.974772
8,destination_city_Delhi,-1244.439433
9,stops_two_or_more,1171.393483
