In [1]:
# Importing packages
from datetime import datetime
import re
import pandas as pd

In [49]:
# Read the processed dataset from its parquet file into the working frame
df = pd.read_parquet(path='../data/processed/full_file.parquet')

In [50]:
# Inspect the column labels of the loaded frame (shown as this cell's output)
df.columns

Index(['searchDate', 'flightDate', 'startingAirport', 'destinationAirport',
       'travelDuration_minutes', 'isBasicEconomy', 'isRefundable', 'isNonStop',
       'totalFare', 'totalTravelDistance', 'depart_time',
       'segmentsAirlineCode', 'segmentsCabinCode'],
      dtype='object')

In [51]:
# Creating features for modeling
#
# Each source column is integer-encoded with pd.factorize and the codes are
# stored in a new column. Keys are the source columns, values are the names of
# the encoded columns. A source column that is absent from df is skipped.
encoded_column_names = {
    'startingAirport': 'startingAirportNum',
    'destinationAirport': 'destinationAirportNum',
    'segmentsCabinCode': 'segmentsCabinCodeNum',
    'segmentsAirlineCode': 'AirlineCodeNum',
    'searchDate': 'searchDateNum',
    'flightDate': 'flightDateNum',
    'depart_time': 'depart_timeNum',
}

for source_col, encoded_col in encoded_column_names.items():
    if source_col not in df.columns:
        continue
    # pd.factorize returns (codes, uniques); only the codes are kept
    df[encoded_col], _ = pd.factorize(df[source_col])

In [57]:
# Data cleaning
# Distance entries equal to the string "PT0H0M" are overwritten with "0"
# before the column is cast to float.
is_placeholder_distance = df['totalTravelDistance'] == "PT0H0M"
df.loc[is_placeholder_distance, 'totalTravelDistance'] = "0"

# Cast the distance and fare columns to float, in that order
for numeric_col in ('totalTravelDistance', 'totalFare'):
    df[numeric_col] = df[numeric_col].astype('float')

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration_minutes,isBasicEconomy,isRefundable,isNonStop,totalFare,totalTravelDistance,depart_time,segmentsAirlineCode,segmentsCabinCode,startingAirportNum,destinationAirportNum,segmentsCabinCodeNum


In [77]:
# Gradient boosting regression model for totalFare

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Column to predict and the predictor columns used to predict it
target = 'totalFare'
features = [
    'startingAirportNum', 'destinationAirportNum', 'segmentsCabinCodeNum',
    'searchDateNum', 'flightDateNum', 'depart_timeNum', 'AirlineCodeNum',
    'travelDuration_minutes', 'totalTravelDistance',
]

# Feature matrix / target vector, then an 80/20 train/test split with a fixed seed
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the regressor (n_estimators can be adjusted as needed) and fit it on
# the training split; fit() returns the fitted estimator itself.
model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the performance metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Squared Error (MSE): 19042.97
R-squared (R2): 0.56


In [78]:
# Persist the fitted model to disk
from joblib import dump

# Written with compress=3: the uncompressed file was 153 MB, which made the
# Heroku app fail.
dump(model, filename='../models/model_gb_boost_srusti.joblib', compress=3)

['../models/model_gb_boost_srusti.joblib']

In [101]:
# EDA for streamlit app creation
#
# One small lookup frame per categorical column: the distinct values found in
# df next to the integer code pd.factorize assigns to each of them.

# Starting airports
df_temp1 = pd.DataFrame({'startingAirport': df['startingAirport'].unique()})
df_temp1['startingAirportNum'], _ = pd.factorize(df_temp1['startingAirport'])

# Destination airports
df_temp2 = pd.DataFrame({'destinationAirport': df['destinationAirport'].unique()})
df_temp2['destinationAirportNum'], _ = pd.factorize(df_temp2['destinationAirport'])

# Cabin codes
df_temp3 = pd.DataFrame({'segmentsCabinCode': df['segmentsCabinCode'].unique()})
df_temp3['segmentsCabinCodeNum'], _ = pd.factorize(df_temp3['segmentsCabinCode'])

In [102]:
# Display the startingAirport -> startingAirportNum lookup table
df_temp1

Unnamed: 0,startingAirport,startingAirportNum
0,EWR,0
1,JFK,1
2,ORD,2
3,ATL,3
4,BOS,4
5,LAX,5
6,SFO,6
7,CLT,7
8,OAK,8
9,LGA,9


In [114]:
# Display the destinationAirport -> destinationAirportNum lookup table
df_temp2

Unnamed: 0,destinationAirport,destinationAirportNum
0,ATL,0
1,BOS,1
2,CLT,2
3,DEN,3
4,DFW,4
5,DTW,5
6,IAD,6
7,LAX,7
8,MIA,8
9,OAK,9


In [112]:
# Display the segmentsCabinCode -> segmentsCabinCodeNum lookup table
df_temp3

Unnamed: 0,segmentsCabinCode,segmentsCabinCodeNum
0,coach,0
1,first,1
2,business,2
3,premium coach,3
