In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate with Kaggle before the competition download in the next cell.
# NOTE(review): presumably prompts interactively for credentials — confirm this
# is acceptable for unattended "Run All" executions.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition data and keep the value returned by kagglehub.
# NOTE(review): this variable is not read again in the notebook; the loading
# cell uses hard-coded "/kaggle/input/..." paths instead.
mlp_term_2_2025_kaggle_assignment_1_path = kagglehub.competition_download('mlp-term-2-2025-kaggle-assignment-1')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Modules

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

# Loading Train and Test datasets

In [None]:
# Directory that holds the competition CSV files.
DATA_DIR = "/kaggle/input/mlp-term-2-2025-kaggle-assignment-1"

try:
    train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
    test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
    submission_df = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
except FileNotFoundError as err:
    # Stop here with a clear message. Merely printing would let the cell
    # continue and fail below with an unrelated NameError on `train_df`.
    raise FileNotFoundError(f"Recheck your dataset paths: {err}") from err

# Work on copies so the raw frames stay untouched and every later
# preprocessing step can be re-derived from them.
train_df_processed = train_df.copy()
test_df_processed = test_df.copy()

In [None]:
# Print the column labels of both frames so they can be compared side by side.
print(train_df_processed.columns)
print(test_df_processed.columns)

# Listing Datatype of Each Column in the train dataset

In [None]:
# Summary of the training frame: column dtypes and non-null counts.
train_df_processed.info()

# Listing Datatype of Each Column in the test dataframe

In [None]:
# Summary of the test frame: column dtypes and non-null counts.
test_df_processed.info()

# Descriptive Statistics of Numerical Columns of Train Dataframe

In [None]:
# Descriptive statistics of the training frame's numeric columns.
# The 'id' column has not been dropped yet, so it is included in this summary.
train_df_processed.describe()

# Descriptive Statistics of Numerical Columns of Test Dataframe

In [None]:
# Descriptive statistics of the test frame's numeric columns.
# The 'id' column has not been dropped yet, so it is included in this summary.
test_df_processed.describe()

# Identifying the number of missing/null values in the train dataframe

In [None]:
# Number of missing values per column in the training frame.
train_df_processed.isnull().sum()

# Identifying the number of missing/null values in the test dataframe

In [None]:
# Number of missing values per column in the test frame.
test_df_processed.isnull().sum()

In [None]:
# Record the test frame's (rows, columns) before any preprocessing, as a baseline for later checks.
test_df_processed.shape

# Handling Missing/null values in numerical columns

In [None]:
num_cols = ['duration', 'days_left']

# Fill missing numeric values with a KNN imputer (5 neighbours).
# The imputer is fitted on the training rows only and then reused as-is on the
# test rows, so no test-set statistics enter the fit.
knn_imputer = KNNImputer(n_neighbors=5)

train_df_processed[num_cols] = knn_imputer.fit_transform(train_df_processed[num_cols])
test_df_processed[num_cols] = knn_imputer.transform(test_df_processed[num_cols])

# Remaining null counts per numeric column for train, then test.
print(train_df_processed[num_cols].isnull().sum())
print(test_df_processed[num_cols].isnull().sum())

In [None]:
# Preview the first 10 training rows after numeric imputation.
train_df_processed.head(10)

# Replacing the scientific notations in 'flight' column with 'NaN'

In [None]:
# Text values that look like numbers in scientific notation (e.g. "6.00E+123")
# are treated as corrupted entries and replaced with NaN. The same cleanup is
# applied to every object-typed categorical column of both frames.
sci_notation_pattern = r'^\d+\.?\d*E[+-]?\d+$'
text_columns = ['airline', 'flight', 'source', 'departure', 'stops', 'arrival', 'destination', 'class']

for frame in (train_df_processed, test_df_processed):
    for col in text_columns:
        # Skip columns that are absent or not stored as strings.
        if col not in frame or frame[col].dtype != 'object':
            continue
        frame[col] = frame[col].replace(sci_notation_pattern, np.nan, regex=True)

# Handling missing/null values in categorical columns

In [None]:
# Preview the first 10 training rows after the scientific-notation cleanup.
train_df_processed.head(10)

In [None]:
# Fill missing categorical values with each column's most frequent training value.
cat_cols = ['airline', 'departure', 'stops', 'flight']
simple_imputer = SimpleImputer(strategy='most_frequent')

# Learn the modes from the training rows only ...
simple_imputer.fit(train_df_processed[cat_cols])

# ... then apply them to both frames and report the remaining nulls per column.
for frame in (train_df_processed, test_df_processed):
    frame[cat_cols] = simple_imputer.transform(frame[cat_cols])
    print(frame.isnull().sum())

In [None]:
# Sanity check: (rows, columns) of the test frame after imputation.
test_df_processed.shape

# Checking for duplicates in the train and test dataframes

In [None]:
# Count fully duplicated rows in the training frame.
train_df_processed.duplicated().sum()

In [None]:
# Count fully duplicated rows in the test frame.
test_df_processed.duplicated().sum()

There are no duplicates in these dataframes

In [None]:
# Preview the training rows before the outlier analysis.
train_df_processed.head()

# Checking for Outliers

In [None]:
# One boxplot per numeric column, side by side, to spot outliers visually.
cols = ['duration', 'days_left', 'price']

fig, axis = plt.subplots(1, 3, figsize = (15,5))
for ax, col in zip(axis, cols):
    ax.boxplot(train_df_processed[col])
    ax.set_title(f'Boxplot for {col}')

Outliers are detected in the 'duration' and 'price' columns.

# Removing rows with outliers from the train dataframe

In [None]:
# Drop training rows outside the 1.5 * IQR fences, one column at a time.
# The filters are applied sequentially: the 'price' quartiles are computed on
# the rows that already survived the 'duration' filter.
# NOTE(review): this cell reassigns `train_df_processed`, so re-running it
# trims the frame again — run it once per kernel session.
for col in ['duration', 'price']:
    q1 = train_df_processed[col].quantile(0.25)
    q3 = train_df_processed[col].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5*iqr
    upper_fence = q3 + 1.5*iqr

    # `between` is inclusive on both ends by default.
    within_fences = train_df_processed[col].between(lower_fence, upper_fence)
    train_df_processed = train_df_processed[within_fences]

print(train_df_processed.shape)

In [None]:
# Sanity check: outlier removal is applied to the training frame only, so the test shape is shown for comparison.
test_df_processed.shape

The outliers were removed because they can mislead the model, reduce accuracy, and cause it to learn patterns that don’t represent normal data. Removing them helps the model perform better and generalize well to unseen inputs.

Now, there are only 39634 rows in the train dataframe (after removing the rows with outliers).

# VISUALIZATION 1: COUNT OF FLIGHTS BY AIRLINE

In [None]:
sns.set(style = "whitegrid")

# Order the bars from the most to the least frequent airline.
airline_order = train_df_processed['airline'].value_counts().index

plt.figure(figsize = (8,5))
sns.countplot(
    data = train_df_processed,
    x = 'airline',
    order = airline_order,
)

# Rotate the tick labels so long airline names do not overlap.
plt.xticks(rotation = 45)
plt.xlabel("Airline")
plt.ylabel("Number of Flights")
plt.title("Count of Flights by Airline")

**Observations:**
* Vistara has the highest number of flights (nearly 20000). Therefore, they are the largest contributors to the dataset.
* Air India has about 9000 flights, which is less than half of Vistara's count, but it still ranks 2nd in number of flights.
* Indigo is in 3rd place with around 5000 flights.
* GO_FIRST, AirAsia and SpiceJet have lower counts than Indigo with less than 3000 flights.

**Inference:**
* Vistara dominates the dataset, so there is a chance the model might become biased towards Vistara.
* Airlines like SpiceJet, AirAsia and GO_FIRST have fewer entries maybe because they serve fewer routes or operate regionally.

# VISUALIZATION 2: PRICE VS DURATION OF FLIGHTS

In [None]:
sns.set(style = "whitegrid")

# Scatter of ticket price against flight duration, coloured by travel class.
plt.figure(figsize = (10,6))
sns.scatterplot(
    data = train_df_processed,
    x = 'duration',
    y = 'price',
    hue = 'class',
)

plt.title("Price vs Duration of Flights")
plt.xlabel("Duration (hours)")
plt.ylabel("Price (₹)")
plt.legend(title="Class")

**Observation:**
* Business class (blue) prices are much higher than the economy class (orange) prices. There is very little overlap between the two.
* The price does not strongly increase with duration of the flight.
* Most economy class tickets are below ₹20,000 whereas business class tickets are spread from around ₹20,000 up to almost ₹100,000

**Inference:**
* Since there is a very little overlap between blue and orange, it is clear that 'class' is a major driver of price.
* Since both business and economy prices are fairly scattered across all durations, there is weak correlation between price and duration. So, duration is not one of the major factors that affect price.
* Economy ticket prices are more consistent and affordable, regardless of how long the flight is. Business ticket prices, on the other hand, are highly variable.

# VISUALIZATION 3: AVERAGE FLIGHT PRICE PER ROUTE (SOURCE -> DESTINATION)

In [None]:
# Mean ticket price for every (source, destination) pair, in long format.
route_prices = (
    train_df_processed
    .groupby(['source', 'destination'])['price']
    .mean()
    .reset_index()
)

# Reshape to a source x destination grid so it can be drawn as a heatmap.
route_matrix = route_prices.pivot(
    index = 'source',
    columns = 'destination',
    values = 'price',
)

plt.figure(figsize = (10, 6))
sns.heatmap(route_matrix, annot = True, fmt = ".0f")  # annotate cells with rounded means

plt.title("Average flight price per route (Source -> Destination)")
plt.xlabel("Destination")
plt.ylabel("Source")

**Observation:**
* The route Chennai -> Bangalore has the highest average flight price (₹ 25,692).
* The route Hyderabad -> Delhi has the lowest average flight price (₹ 17,235).
* Flights originating from Chennai generally show higher prices, especially to Bangalore, Kolkata and Delhi.
* Flights originating from Delhi generally show lower prices.

**Inference:**
* The high cost of Chennai -> Bangalore suggests that, even though the route may be small, it is in high demand.
* Delhi's lower prices across routes indicate that there may be strong competition among airlines there.
* Flights coming to Delhi also have low prices, indicating price has a dependency on source and destination.

# Scaling Numerical Features and Encoding Categorical Features

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import pandas as pd

num_cols = ['duration', 'days_left']
cat_cols = ['airline', 'flight', 'source', 'departure', 'stops', 'arrival', 'destination', 'class']

# Separate the features from the target; 'id' is an identifier, not a feature.
X = train_df_processed.drop(columns=['id', 'price'])
y = train_df_processed['price']
X_test = test_df_processed.drop(columns=['id'])

# Numeric columns: mean-impute any remaining gaps, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4. Try the current keyword first and fall back for older installations so
# the cell runs on either side of the rename.
try:
    dense_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    dense_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: mode-impute, then one-hot encode. Categories that only
# appear in the test data are encoded as all zeros (handle_unknown='ignore').
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', dense_encoder)
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ],
    remainder='passthrough'  # columns not listed above are kept unchanged
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training features only; the test features reuse the fitted transformers.
X_transformed = pipeline.fit_transform(X)
X_test_transformed = pipeline.transform(X_test)

feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# NOTE: these frames get a fresh 0..n-1 index, while `y` keeps the index left
# over from outlier removal. Downstream code must pair them by position.
X_processed_df = pd.DataFrame(X_transformed, columns=feature_names)
X_test_processed_df = pd.DataFrame(X_test_transformed, columns=feature_names)

# The transformation must neither drop nor duplicate rows.
assert X_processed_df.shape[0] == len(y), "feature/target row counts diverged"
assert X_test_processed_df.shape[0] == len(test_df_processed), "test row count changed"

print("Train rows:", X_processed_df.shape[0])
print("Test rows: ", X_test_processed_df.shape[0])

In [None]:
# Sanity check: (rows, columns) of the test frame after the preprocessing cell.
test_df_processed.shape

**We scaled the numerical features in the dataset:**
* To ensure numerical features 'days_left' and 'duration' are on a similar scale.
* To prevent features with large values from dominating the learning process.
* To improve performance of models sensitive to feature magnitude.

**We encoded the categorical features in the dataset:**
* Because machine learning models require numerical inputs and cannot directly work with categorical strings.
* To ensure consistent handling of categorical variables, especially when categories are unordered.

# Train - Val Split

In [None]:
# Hold out 20% of the processed training rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(
    X_processed_df,
    y,
    test_size = 0.2,
    random_state = 42,
)

# Report the shape of every split.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("y_train", y_train),
    ("y_val", y_val),
):
    print(f"{split_name}: ", split_data.shape)

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Drop near-constant features (variance <= 0.01). The selector is fitted on
# the training split only and then applied unchanged to the validation split.
selector = VarianceThreshold(threshold=0.01)

# Wrapping the arrays in DataFrames yields positional integer column labels.
X_reduced = pd.DataFrame(selector.fit_transform(X_train))
X_val_reduced = pd.DataFrame(selector.transform(X_val))

for reduced_frame in (X_reduced, X_val_reduced):
    print(reduced_frame.shape)

# Model Building

In [None]:
# Fit a set of baseline regressors with default hyperparameters on the reduced
# training split, then score each one on the validation split.
model_result = []
models = {
    # Seeded like the other stochastic models so the comparison table is
    # reproducible under Restart & Run All.
    "SGD Regressor": SGDRegressor(random_state = 42),
    "Ridge Regression": Ridge(random_state = 42),
    "Lasso Regression": Lasso(random_state = 42),
    "ElasticNet Regession": ElasticNet(random_state = 42),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state = 42),
    "Random Forest Regressor": RandomForestRegressor(random_state = 42, n_jobs = -1),
    "K Neighbors Regressor": KNeighborsRegressor(n_neighbors = 3),
}

for name, model in models.items():
    model.fit(X_reduced, y_train)
    y_pred = model.predict(X_val_reduced)

    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    # RMSE is derived from MSE so it is expressed in the same unit as the target.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    model_result.append({
        "Model": name,
        "R2": round(r2, 3),
        "MAE": round(mae, 3),
        "RMSE": round(rmse, 3)
    })

In [None]:
# Show floats with three decimals in every pandas display from here on.
pd.set_option("display.float_format", "{:.3f}".format)

# Rank the models from best to worst validation R2.
model_result_df = (
    pd.DataFrame(model_result, columns = ['Model', 'R2', 'MAE', 'RMSE'])
    .sort_values(by = 'R2', ascending = False)
    .reset_index(drop = True)
)
print(model_result_df)

# Hyperparameter Tuning

Random Forest

In [None]:
# Candidate values explored for the Random Forest.
rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20]}

# 3-fold grid search. The forest is seeded so the search result is
# reproducible and comparable with the seeded baseline model above.
grid_rf = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    rf_params,
    cv=3,
    n_jobs=-1,
    verbose=0
)

grid_rf.fit(X_reduced, y_train)

# `best_score_` is the mean cross-validated score of the best candidate
# (R2 for regressors when no `scoring` is given), not the validation-split score.
print("Best Random Forest Parameters:", grid_rf.best_params_)
print("Best Random Forest Estimator:", grid_rf.best_estimator_)
print("Best Random Forest r2 score: ", grid_rf.best_score_)

Decision Tree

In [None]:
# Candidate values explored for the Decision Tree.
dt_params = dict(
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# 3-fold grid search over all combinations, run in parallel.
grid_dt = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_params,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
grid_dt.fit(X_reduced, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best Decision Tree Parameters:", grid_dt.best_params_)
print("Best Decision Tree Estimator:", grid_dt.best_estimator_)
print("Best Decision Tree r2 score:", grid_dt.best_score_)

K Neighbors

In [None]:
# Candidate values explored for K Neighbors.
knn_params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# 3-fold grid search over all combinations, run in parallel.
grid_knn = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=knn_params,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
grid_knn.fit(X_reduced, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best KNN Parameters:", grid_knn.best_params_)
print("Best KNN Estimator:", grid_knn.best_estimator_)
print("Best KNN r2 score:", grid_knn.best_score_)

# Comparison of Models

Before Hyperparameter Tuning:
* Random Forest Regressor (r2 score: 0.972) - highest performance
* Decision Tree Regressor (r2 score: 0.950) - next highest performance
* K Neighbors Regressor (r2 score: 0.941) - followed closely
* Lasso Regression (r2 score: 0.909)
* Ridge Regression (r2 score: 0.909)
* SGD Regressor (r2 score: 0.909)
* ElasticNet Regession (r2 score: 0.671)

After Hyperparameter Tuning:
* Random Forest Regressor (r2 score: 0.967) - slight dip
* Decision Tree Regressor (r2 score: 0.950) - same
* K Neighbors Regressor (r2 score: 0.948) - slight increase

Highest r2 score is of Random Forest Regressor (before tuning).
Therefore, Random Forest Regressor is the best model.

In [None]:
# Final model: refit the chosen Random Forest on ALL cleaned training rows
# (train + validation) with preprocessing inside one pipeline, then predict
# the test set and write the submission file.
# NOTE(review): unlike the evaluation above, this pipeline does not apply the
# VarianceThreshold selection, so the submitted model sees more features than
# the one that was scored — confirm this is intended.
num_cols = ['duration', 'days_left']
cat_cols = ['airline', 'flight', 'source', 'departure', 'stops', 'arrival', 'destination', 'class']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42, n_jobs=-1))
])

# A distinct name is used for the full training matrix so the `X_train` split
# created earlier is not overwritten; re-running the feature-selection or
# model cells after this one would otherwise pick up raw, unencoded data.
X_full = X[num_cols + cat_cols]
X_test = test_df_processed[num_cols + cat_cols]

pipeline.fit(X_full, y)
final_pred = pipeline.predict(X_test)

# Predictions are paired with the sample submission by position, so the row
# counts must match. NOTE(review): row order is assumed to match as well.
assert len(final_pred) == len(submission_df), (
    f"Expected {len(submission_df)} predictions, got {len(final_pred)}"
)

submission_df['price'] = final_pred
submission_df.to_csv("submission.csv", index=False)

print("Submission file generated successfully")
print("Predictions shape:", final_pred.shape)
print("Submission shape:", submission_df.shape)