In [1]:
# Importing packages
from datetime import datetime
import re
import pandas as pd

In [2]:
# Load the processed dataset from its parquet file into a DataFrame
parquet_path = '../data/processed/full_file.parquet'
df = pd.read_parquet(parquet_path)

In [3]:
# Check the columns: a bare `df` as the cell's last expression makes the
# notebook render the frame (column headers plus a preview of its rows).
df

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration_minutes,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,depart_time,segmentsAirlineCode,segmentsCabinCode
0,2022-04-16,2022-04-17,EWR,ATL,142,False,False,True,190.59,PT0H0M,17:17,NK,coach
1,2022-04-16,2022-04-17,EWR,ATL,150,False,False,True,198.6,762.0,20:40,B6,coach
2,2022-04-16,2022-04-17,EWR,ATL,149,True,False,True,203.6,762.0,16:19,UA,coach
3,2022-04-16,2022-04-17,EWR,ATL,502,False,False,False,207.6,2105.0,15:31,AA,coach
4,2022-04-16,2022-04-17,EWR,ATL,150,False,False,True,223.6,762.0,17:59,UA,coach
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13519994,2022-05-19,2022-06-03,DTW,SFO,600,False,False,False,621.6,2652.0,12:35,DL,coach
13519995,2022-05-19,2022-06-03,DTW,SFO,418,False,False,False,637.6,2087.0,07:05,AA,coach
13519996,2022-05-19,2022-06-03,DTW,SFO,946,False,False,False,667.19,3051.0,22:50,DL,coach
13519997,2022-05-19,2022-06-03,DTW,SFO,625,False,False,False,681.61,2606.0,07:15,AS,coach


In [51]:
# Creating features for modeling
# Each listed source column is integer-encoded with pd.factorize and stored
# under the paired "...Num" name. A source column that is absent from df is
# skipped, so no encoded column is created for it.

encoded_columns = [
    ('startingAirport', 'startingAirportNum'),
    ('destinationAirport', 'destinationAirportNum'),
    ('segmentsCabinCode', 'segmentsCabinCodeNum'),
    ('segmentsAirlineCode', 'AirlineCodeNum'),
    ('searchDate', 'searchDateNum'),
    ('flightDate', 'flightDateNum'),
    ('depart_time', 'depart_timeNum'),
]

for source_col, encoded_col in encoded_columns:
    if source_col not in df.columns:
        continue
    # factorize returns (codes, uniques); only the integer codes are kept
    df[encoded_col], _ = pd.factorize(df[source_col])

In [57]:
# Data cleaning
# Distance cells holding the "PT0H0M" placeholder are overwritten with "0" so
# that the column can be cast to float below.
# NOTE(review): this makes placeholder rows look like zero-distance trips; the
# frame preview shows one on a route whose other rows report 762.0 -- confirm
# that 0 is the intended fill value.
placeholder_rows = df['totalTravelDistance'] == "PT0H0M"
df.loc[placeholder_rows, 'totalTravelDistance'] = "0"

# Cast both numeric columns to float, in place on df
for numeric_col in ('totalTravelDistance', 'totalFare'):
    df[numeric_col] = df[numeric_col].astype('float')

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration_minutes,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,depart_time,segmentsAirlineCode,segmentsCabinCode,startingAirportNum,destinationAirportNum,segmentsCabinCodeNum


In [77]:
# Gradient boosting fare model: split the data, fit, and score on held-out rows

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Predictor columns: the factorized "...Num" codes plus two numeric columns.
# The order here is the column order the fitted model sees.
features = [
    'startingAirportNum', 'destinationAirportNum', 'segmentsCabinCodeNum',
    'searchDateNum', 'flightDateNum', 'depart_timeNum', 'AirlineCodeNum',
    'travelDuration_minutes', 'totalTravelDistance',
]
target = 'totalFare'  # column the regressor is trained to predict

# Hold out 20% of the rows for evaluation; random_state pins the shuffle
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gradient boosting regressor with 100 estimators (adjust n_estimators as needed)
model = GradientBoostingRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predict on the held-out rows and compute both metrics against the true fares
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the metrics rounded to two decimals
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Squared Error (MSE): 19042.97
R-squared (R2): 0.56


In [78]:
# Persist the trained model to disk with joblib
from joblib import dump

# The model is written compressed (compress=3): the raw, uncompressed file was
# 153 MB, and the Heroku app was failing because of that size.
model_path = '../models/model_gb_boost_srusti.joblib'
dump(model, model_path, compress=3)

['../models/model_gb_boost_srusti.joblib']

4
