In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

In [None]:
import os

# List the entries of the "Colab Notebooks" folder on the mounted Drive
# (only this one folder is listed, not everything the runtime can see).
print(os.listdir('/content/drive/MyDrive/Colab Notebooks'))

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to the plots drawn below.
sns.set()

In [None]:
# Load the training set from the mounted Drive folder.
train_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Data_train.csv")

In [None]:
# Remove the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

In [None]:
# First rows of the raw training data.
train_data.head()

In [None]:
# Column dtypes and non-null counts of the raw training data.
train_data.info()

In [None]:
# Frequency of each distinct Duration string.
train_data["Duration"].value_counts()


In [None]:

# Drop every row that has at least one missing value (mutates train_data in place).
train_data.dropna(inplace = True)

In [None]:

# Per-column count of missing values still present in train_data.
train_data.isnull().sum()

In [None]:
# Parse the journey date; values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): no explicit format is passed, so pandas infers one. Confirm the CSV's date
# layout and pass `format=` so that valid dates are not coerced to NaT and silently dropped.
train_data["Date_of_Journey"] = pd.to_datetime(train_data["Date_of_Journey"], errors='coerce')
# Remove rows whose date failed to parse. `.copy()` makes the result an independent frame,
# matching the test-set preprocessing, so the column assignments in later cells do not
# trigger SettingWithCopyWarning.
train_data = train_data.dropna(subset=["Date_of_Journey"]).copy()


In [None]:
# Day of month taken from the parsed journey date.
train_data["Journey_day"] = train_data["Date_of_Journey"].dt.day

In [None]:
# Month number taken from the parsed journey date. The column is already datetime
# (it was converted with pd.to_datetime in an earlier cell), so it is read through
# `.dt` directly instead of being re-parsed with a format string that no longer applies.
train_data["Journey_month"] = train_data["Date_of_Journey"].dt.month


In [None]:
# Check that Journey_day and Journey_month were added.
train_data.head()


In [None]:
# The day and month have been extracted, so the original date column is removed in place.
train_data.drop(columns=["Date_of_Journey"], inplace=True)


In [None]:

# Confirm Date_of_Journey is no longer among the columns.
train_data.head()

In [None]:
# Parse Dep_Time once as "HH:MM" (%H = 24-hour clock, %M = minutes) and derive
# both departure features from the parsed values.
dep_timestamps = pd.to_datetime(train_data["Dep_Time"], format="%H:%M")

# Departure hour and minute become separate numeric columns
train_data["Dep_hour"] = dep_timestamps.dt.hour
train_data["Dep_min"] = dep_timestamps.dt.minute

# The original string column is no longer needed
train_data.drop(columns=["Dep_Time"], inplace=True)

In [None]:
# Dep_hour and Dep_min should now appear in place of Dep_Time.
train_data.head()

In [None]:
# Parse Arrival_Time once as "HH:MM" and derive both arrival features from the parsed values.
# NOTE(review): the strict format raises if any value carries extra text beyond HH:MM;
# confirm the column is plain HH:MM, otherwise a different format or errors='coerce' is needed.
arrival_timestamps = pd.to_datetime(train_data["Arrival_Time"], format="%H:%M")

# Arrival hour and minute become separate numeric columns
train_data["Arrival_hour"] = arrival_timestamps.dt.hour
train_data["Arrival_min"] = arrival_timestamps.dt.minute

# The original string column is no longer needed
train_data.drop(columns=["Arrival_Time"], inplace=True)

In [None]:
# Arrival_hour and Arrival_min should now appear in place of Arrival_Time.
train_data.head()

In [None]:
# Duration is the time the plane takes to reach its destination,
# i.e. the difference between departure time and arrival time.


def _pad_duration(text):
    """Return `text` with both an hour and a minute token, e.g. "19h" -> "19h 0m"."""
    if len(text.split()) == 2:
        return text                      # already "<H>h <M>m"
    if "h" in text:
        return text.strip() + " 0m"      # hours only: add 0 minutes
    return "0h " + text                  # minutes only: add 0 hours


# Normalised duration strings, one per row of train_data
duration = [_pad_duration(raw) for raw in train_data["Duration"]]

# Hours: the integer before "h". Minutes: the last whitespace-separated token before "m".
duration_hours = [int(entry.split(sep = "h")[0]) for entry in duration]
duration_mins = [int(entry.split(sep = "m")[0].split()[-1]) for entry in duration]

In [None]:
# Attach the parsed hour and minute lists to train_data as new columns
# (the lists were built from this frame's Duration column, one entry per row).

train_data["Duration_hours"] = duration_hours
train_data["Duration_mins"] = duration_mins

In [None]:
# Duration_hours and Duration_mins replace the original string column.
train_data.drop(columns=["Duration"], inplace=True)


In [None]:
# Duration_hours and Duration_mins should now appear in place of Duration.
train_data.head()

In [None]:
# Number of rows per airline.
train_data["Airline"].value_counts()

In [None]:
# Airline vs Price: boxen plot of the fare distribution per airline,
# with rows ordered from the most to the least expensive fare.
# NOTE(review): an earlier remark here said "Jet Airways Business" has the highest price and
# the other airlines share a similar median; verify against this dataset's plot before relying on it.
airline_plot_data = train_data.sort_values("Price", ascending=False)
sns.catplot(data=airline_plot_data, x="Airline", y="Price", kind="boxen", height=6, aspect=3)
plt.show()

In [None]:
# Airline is a nominal category, so it is one-hot encoded.
# drop_first=True omits the first category's indicator; astype(int) stores the indicators as 0/1.
# Selecting with double brackets keeps a DataFrame, so the dummy columns are named "Airline_<value>".
Airline = pd.get_dummies(train_data[["Airline"]], drop_first=True).astype(int)

Airline.head()

In [None]:
# Number of rows per departure airport.
train_data["Source"].value_counts()

In [None]:
# Source vs Price: boxen plot of the fare distribution per departure airport,
# with rows ordered from the most to the least expensive fare.
source_plot_data = train_data.sort_values("Price", ascending=False)
sns.catplot(data=source_plot_data, x="Source", y="Price", kind="boxen", height=4, aspect=3)
plt.show()

In [None]:
# Source is a nominal category, so it is one-hot encoded
# (first category omitted, indicators stored as 0/1, columns named "Source_<value>").
Source = pd.get_dummies(train_data[["Source"]], drop_first=True).astype(int)

Source.head()

In [None]:
# Number of rows per arrival airport.
train_data["Destination"].value_counts()

In [None]:
# Destination is a nominal category, so it is one-hot encoded
# (first category omitted, indicators stored as 0/1, columns named "Destination_<value>").
Destination = pd.get_dummies(train_data[["Destination"]], drop_first=True).astype(int)

Destination.head()

In [None]:
# Inspect the Route column (it is dropped in a later cell).
train_data["Route"]

In [None]:
# Author's rationale for removing these two columns (not re-verified in this cell):
#   - Additional_Info is "no_info" for almost 80% of rows
#   - Route carries the same information as Total_Stops
unused_columns = ["Route", "Additional_Info"]
train_data.drop(columns=unused_columns, inplace=True)

In [None]:
# Number of rows per Total_Stops label.
train_data["Total_Stops"].value_counts()

In [None]:
# Total_Stops is an ordinal category, so each label is mapped by hand to its number of stops
# (an explicit mapping, not sklearn's LabelEncoder). Values absent from the mapping are left as-is.
# The replacement is applied to the Total_Stops column only, so a matching literal in any
# other column is never rewritten.
stops_mapping = {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
train_data["Total_Stops"] = train_data["Total_Stops"].replace(stops_mapping)

In [None]:
# Total_Stops should now hold numeric stop counts.
train_data.head()

In [None]:
# Join the cleaned frame with the three one-hot blocks column-wise:
# train_data + Airline + Source + Destination (rows are matched on the index).
train_parts = [train_data, Airline, Source, Destination]
data_train = pd.concat(train_parts, axis="columns")

In [None]:
# Combined frame: original columns followed by the one-hot columns.
data_train.head()

In [None]:
# The one-hot columns replace the original categorical columns.
data_train.drop(columns=["Airline", "Source", "Destination"], inplace=True)

In [None]:
# Training frame after the original categorical columns were removed.
data_train.head()

In [None]:
# (rows, columns) of the encoded training frame.
data_train.shape

In [None]:
# Load the test set from the mounted Drive folder.
test_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Test_set.csv")

In [None]:
# First rows of the raw test data.
test_data.head()

In [None]:
# Preprocessing

print("Test data Info")
print("-"*75)
# DataFrame.info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
test_data.info()

print()
print()

print("Null values :")
print("-"*75)
# Rows with any missing value are dropped first, so the counts printed below are post-drop.
test_data.dropna(inplace = True)
print(test_data.isnull().sum())

# Check what the Date_of_Journey column actually contains
print("\nSample Date_of_Journey values:")
print(test_data["Date_of_Journey"].head())
print("Data type:", test_data["Date_of_Journey"].dtype)

# EDA

# Date_of_Journey is parsed as ISO YYYY-MM-DD; values that do not match become NaT.
test_data["Date_of_Journey"] = pd.to_datetime(test_data["Date_of_Journey"], format="%Y-%m-%d", errors='coerce')
# Drop the unparseable rows and take an explicit copy so the assignments below act on an independent frame.
test_data = test_data.dropna(subset=["Date_of_Journey"]).copy()
test_data["Journey_day"] = test_data["Date_of_Journey"].dt.day
test_data["Journey_month"] = test_data["Date_of_Journey"].dt.month
test_data.drop(["Date_of_Journey"], axis = 1, inplace = True)

# Dep_Time: parse "HH:MM" once, keep the hour and minute, drop the string column
dep_parsed = pd.to_datetime(test_data["Dep_Time"], format="%H:%M")
test_data["Dep_hour"] = dep_parsed.dt.hour
test_data["Dep_min"] = dep_parsed.dt.minute
test_data.drop(columns=["Dep_Time"], inplace=True)

# Arrival_Time: same treatment
arrival_parsed = pd.to_datetime(test_data["Arrival_Time"], format="%H:%M")
test_data["Arrival_hour"] = arrival_parsed.dt.hour
test_data["Arrival_min"] = arrival_parsed.dt.minute
test_data.drop(columns=["Arrival_Time"], inplace=True)

# Duration: split strings such as "2h 50m" into integer hour and minute columns.
print("\nSample Duration values:")
print(test_data["Duration"].head())


def _normalize_duration(value):
    """Return the duration as a string that has both an hour and a minute token.

    "19h" becomes "19h 0m" and "45m" becomes "0h 45m". A value with neither marker
    becomes "0h 0m". Non-string values are converted with str() first.
    """
    text = str(value)
    if len(text.split()) != 2:
        if "h" in text:
            text = text.strip() + " 0m"
        elif "m" in text:
            text = "0h " + text
        else:
            text = "0h 0m"
    return text


def _parse_duration(text):
    """Return (hours, minutes) parsed from a normalized duration string.

    Both parts are parsed before anything is returned, so a failure in either part
    yields exactly one (0, 0) pair. Appending the hour before parsing the minutes
    would record two hour entries for a row whose minute part is malformed and leave
    the hour and minute lists with different lengths.
    """
    try:
        hours = int(text.split("h")[0])
        if "m" in text:
            # Everything after the first "h" and before the following "m"
            min_part = text.split("h")[1].split("m")[0].strip()
            minutes = int(min_part) if min_part else 0
        else:
            minutes = 0
    except (ValueError, IndexError) as e:
        print(f"Error parsing duration: {text}, error: {e}")
        return 0, 0
    return hours, minutes


duration = [_normalize_duration(value) for value in test_data["Duration"]]
parsed_durations = [_parse_duration(text) for text in duration]
duration_hours = [hours for hours, _ in parsed_durations]
duration_mins = [minutes for _, minutes in parsed_durations]

# Adding Duration column to test set
test_data["Duration_hours"] = duration_hours
test_data["Duration_mins"] = duration_mins
test_data.drop(["Duration"], axis = 1, inplace = True)


# Categorical data
#
# Each column is one-hot encoded from a one-column DataFrame (double brackets) so the dummy
# columns are named "<Column>_<value>", as in the training set. Encoding a bare Series yields
# unprefixed names, which makes Source and Destination dummies for the same airport collide
# after concatenation and no longer match the training feature names.
# NOTE(review): the test set may contain a different set of categories than the training set;
# align data_test to the training feature columns before predicting.

print("\nAirline")
print("-"*75)
print(test_data["Airline"].value_counts())
Airline = pd.get_dummies(test_data[["Airline"]], drop_first= True).astype(int)

print()

print("Source")
print("-"*75)
print(test_data["Source"].value_counts())
Source = pd.get_dummies(test_data[["Source"]], drop_first= True).astype(int)

print()

print("Destination")
print("-"*75)
print(test_data["Destination"].value_counts())
Destination = pd.get_dummies(test_data[["Destination"]], drop_first = True).astype(int)

# Author's rationale for the drop (not re-verified here):
# Additional_Info contains almost 80% no_info; Route and Total_Stops are related to each other
test_data.drop(["Route", "Additional_Info"], axis = 1, inplace = True)

# Map the Total_Stops labels to their stop counts, touching only that column
test_data["Total_Stops"] = test_data["Total_Stops"].replace(
    {"non-stop": 0, "1 stop": 1, "2 stops": 2, "3 stops": 3, "4 stops": 4}
)

# Concatenate dataframe --> test_data + Airline + Source + Destination
data_test = pd.concat([test_data, Airline, Source, Destination], axis = 1)

# The one-hot columns replace the original categorical columns
data_test.drop(["Airline", "Source", "Destination"], axis = 1, inplace = True)

print()
print()

print("Shape of test data : ", data_test.shape)


In [None]:
def _cast_bool_columns_to_int(frame):
    """Convert every bool-dtype column of `frame` to int, modifying `frame` in place."""
    for bool_col in frame.select_dtypes(include='bool').columns:
        frame[bool_col] = frame[bool_col].astype(int)


# Boolean indicator columns become 0/1 integers in both the training and the test frame
_cast_bool_columns_to_int(data_train)
_cast_bool_columns_to_int(data_test)

print("Data_train after boolean to int conversion:")
display(data_train.head())

print("\nData_test after boolean to int conversion:")
display(data_test.head())

In [None]:
# First rows of the encoded test frame.
data_test.head()

In [None]:
# (rows, columns) of the encoded training frame.
data_train.shape

In [None]:
# Column names available for building the feature matrix.
data_train.columns

In [None]:
# Feature matrix: the stop count, the date/time parts, the duration parts and the
# one-hot airline / source / destination indicators. Price is deliberately left out.
feature_columns = [
    'Total_Stops', 'Journey_day', 'Journey_month',
    'Dep_hour', 'Dep_min', 'Arrival_hour', 'Arrival_min',
    'Duration_hours', 'Duration_mins',
    'Airline_Cebu Pacific', 'Airline_Philippine Airlines',
    'Source_Cagayan de Oro (CGY)', 'Source_Caticlan (MPH)',
    'Source_Cebu (CEB)', 'Source_Davao (DVO)',
    'Source_General Santos (GES)', 'Source_Iloilo (ILO)',
    'Source_Kalibo (KLO)', 'Source_Legazpi (LGP)', 'Source_Manila (MNL)',
    'Source_Puerto Princesa (PPS)', 'Source_Tacloban (TAC)',
    'Source_Zamboanga (ZAM)',
    'Destination_Cagayan de Oro (CGY)', 'Destination_Caticlan (MPH)',
    'Destination_Cebu (CEB)', 'Destination_Clark (CRK)',
    'Destination_Davao (DVO)', 'Destination_General Santos (GES)',
    'Destination_Iloilo (ILO)', 'Destination_Kalibo (KLO)',
    'Destination_Legazpi (LGP)', 'Destination_Manila (MNL)',
    'Destination_Puerto Princesa (PPS)', 'Destination_Tacloban (TAC)',
    'Destination_Zamboanga (ZAM)',
]
X = data_train.loc[:, feature_columns]
X.head()

In [None]:
# Target: the fare, selected by column name rather than by position so that a change in
# column order cannot silently pick a different column.
y = pd.to_numeric(data_train["Price"], errors='coerce')
# Prices that could not be converted became NaN; those rows are removed from the target...
y = y.dropna()
# ...and the same rows are removed from X, so X and y keep the same length and row order
# (estimators reject inputs with differing numbers of samples).
X = X.loc[y.index]
y.head()

In [None]:
# Correlation between the independent attributes and the dependent attribute (Price)

# Price is coerced to numeric; entries that cannot be converted become NaN and those rows are removed
numeric_price = pd.to_numeric(data_train['Price'], errors='coerce')
data_train['Price'] = numeric_price
data_train.dropna(subset=['Price'], inplace=True)

# Annotated heatmap of the pairwise correlations (red = negative, green = positive)
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(data_train.corr(), annot=True, cmap="RdYlGn", ax=ax)

plt.show()

In [None]:
# Important feature using ExtraTreesRegressor

from sklearn.ensemble import ExtraTreesRegressor
# Fit an extra-trees ensemble to obtain feature importances.
# random_state pins the randomized tree construction so the importances are reproducible
# across runs (42 matches the seed used for the train/test split and the randomized search).
selection = ExtraTreesRegressor(random_state=42)
selection.fit(X, y)

In [None]:
# Raw importance scores, one per feature, in the order of X's columns.
print(selection.feature_importances_)

In [None]:
# Horizontal bar chart of the 20 most important features

feat_importances = pd.Series(selection.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(20)

plt.figure(figsize=(12, 8))
top_features.plot(kind='barh')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest with default hyper-parameters.
# random_state makes the fitted model, and therefore every metric reported below,
# reproducible across runs (42 matches the seed used for the split and the randomized search).
reg_rf = RandomForestRegressor(random_state=42)
reg_rf.fit(X_train, y_train)

In [None]:
# Baseline predictions for the held-out rows.
y_pred = reg_rf.predict(X_test)

In [None]:
# Score on the rows the model was fitted on, so this is not a held-out estimate.
reg_rf.score(X_train, y_train)

In [None]:
# Distribution of the baseline residuals (actual minus predicted fare).
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling is its replacement.
sns.histplot(y_test - y_pred, kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()

In [None]:
# Actual vs predicted fares for the baseline model
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()

In [None]:
from sklearn import metrics

In [None]:

# Error metrics of the baseline model on the held-out rows
baseline_mae = metrics.mean_absolute_error(y_test, y_pred)
baseline_mse = metrics.mean_squared_error(y_test, y_pred)

print('MAE:', baseline_mae)
print('MSE:', baseline_mse)
print('RMSE:', np.sqrt(baseline_mse))

In [None]:
# Normalised RMSE: RMSE / (max(DV) - min(DV)), where DV is the dependent variable y.
# The RMSE is computed from the current predictions instead of being pasted in as a
# literal, so the ratio stays correct whenever the data or the model changes.
baseline_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

baseline_rmse / (y.max() - y.min())

In [None]:
# R^2 of the baseline predictions on the held-out rows.
metrics.r2_score(y_test, y_pred)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized Search CV: candidate values for each hyper-parameter

# Number of trees in the forest: 12 evenly spaced values from 100 to 1200
n_estimators = np.linspace(start=100, stop=1200, num=12).astype(int).tolist()
# Number of features considered at every split
# ('auto' is deprecated, so only 'sqrt' is offered)
max_features = ['sqrt']
# Maximum depth of a tree: 6 evenly spaced values from 5 to 30
max_depth = np.linspace(5, 30, num=6).astype(int).tolist()
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space handed to RandomizedSearchCV: parameter name -> list of candidate values

random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Random search of parameters, using 5 fold cross validation,
# sampling 10 parameter combinations (n_iter = 10), scored by negative mean squared error
rf_random = RandomizedSearchCV(estimator = reg_rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 10, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
# Run the randomized search on the training split.
rf_random.fit(X_train,y_train)

In [None]:
# Best parameter combination found by the search.
rf_random.best_params_

In [None]:
# Predictions of the search object for the held-out rows.
prediction = rf_random.predict(X_test)

In [None]:
# Distribution of the residuals after the randomized search (actual minus predicted fare).
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling is its replacement.
plt.figure(figsize = (8,8))
sns.histplot(y_test - prediction, kde=True, stat="density", kde_kws=dict(cut=3))
plt.show()

In [None]:
# Actual vs predicted fares after the randomized search
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_test, prediction, alpha=0.5)
ax.set_xlabel("y_test")
ax.set_ylabel("y_pred")
plt.show()

In [None]:
# Error metrics on the held-out rows after the randomized search
tuned_mae = metrics.mean_absolute_error(y_test, prediction)
tuned_mse = metrics.mean_squared_error(y_test, prediction)

print('MAE:', tuned_mae)
print('MSE:', tuned_mse)
print('RMSE:', np.sqrt(tuned_mse))

In [None]:
import pickle

# Persist the fitted forest to Drive, read it back, and score the reloaded copy.
# NOTE(review): the object saved is reg_rf (the baseline forest), not the estimator
# selected by rf_random; confirm that this is the model meant to be shipped.
save_path = '/content/drive/MyDrive/Colab Notebooks/flight_rf.pkl'

# 1. Write the pickle; the context manager closes the file as soon as the dump finishes
with open(save_path, 'wb') as pickle_out:
    pickle.dump(reg_rf, pickle_out)

print("Model saved successfully!")

# 2. Read the model back from the same path
with open(save_path, 'rb') as pickle_in:
    forest = pickle.load(pickle_in)

# 3. Score the reloaded model on the held-out rows
y_prediction = forest.predict(X_test)
reloaded_r2 = metrics.r2_score(y_test, y_prediction)
print(f"R2 Score: {reloaded_r2}")