In [1]:
# Importing packages
from datetime import datetime
import re
import pandas as pd
import numpy as np

In [2]:
# Read the processed flight dataset into the working frame
PROCESSED_DATA_PATH = '../data/processed/full_file.parquet'
df = pd.read_parquet(PROCESSED_DATA_PATH)

In [3]:
# Lookup table from the raw '||'-joined cabin string of an itinerary to a numeric code.
# Codes used below:
#   1 = business only, 2 = coach only, 3 = first only, 4 = premium coach only,
#   5 = the listed mixed-cabin combinations, 0 = any string not listed here.
conditions = {
    "business": 1,
    "business||business": 1,
    "coach": 2,
    "coach||coach": 2,
    "coach||coach||coach": 2,
    "first": 3,
    "first||first": 3,
    "first||first||first": 3,
    "business||business||coach": 5,
    "business||coach": 5,
    "business||coach||business": 5,
    "business||coach||coach": 5,
    "business||first": 5,
    "business||first||first": 5,
    "coach||business": 5,
    "coach||business||business": 5,
    "coach||business||coach": 5,
    "coach||business||first": 5,
    "coach||coach||business": 5,
    "coach||coach||business||coach": 5,
    # NOTE(review): an all-coach itinerary coded as 5 (mixed) is inconsistent with the
    # 1-3 segment all-coach entries above, which use 2. Left unchanged because any code
    # that encodes inputs for the saved model must use the same table - confirm and fix
    # both places together.
    "coach||coach||coach||coach": 5,
    "coach||coach||coach||first": 5,
    "coach||coach||coach||premium coach": 5,
    "coach||coach||first": 5,
    "coach||coach||first||coach": 5,
    "coach||coach||first||first": 5,
    "coach||coach||premium coach": 5,
    "coach||coach||premium coach||coach": 5,
    "coach||coach||premium coach||premium coach": 5,
    "coach||first": 5,
    "coach||first||coach": 5,
    "coach||first||first": 5,
    "coach||premium coach": 5,
    "coach||premium coach||coach": 5,
    "coach||premium coach||premium coach": 5,
    "first||business": 5,
    "first||coach": 5,
    "first||coach||business": 5,
    "first||coach||coach": 5,
    "first||coach||coach||coach": 5,
    "first||coach||first": 5,
    "first||first||coach": 5,
    "first||first||coach||coach": 5,
    "premium coach||business||coach": 5,
    "premium coach||coach": 5,
    "premium coach||coach||coach": 5,
    "premium coach||coach||coach||coach": 5,
    "premium coach||first": 5,
    "premium coach||premium coach||coach": 5,
    "premium coach": 4,
    "premium coach||premium coach": 4,
    "premium coach||premium coach||premium coach": 4
}

# One dictionary lookup per row instead of one full-length boolean mask per key.
# Unlisted or missing cabin strings come back as NaN and are coded 0, matching the
# previous default. The object cast keeps the lookup element-wise even if the column
# was loaded as a categorical.
df['segmentsCabinCodeNum'] = (
    df['segmentsCabinCode'].astype(object).map(conditions).fillna(0).astype(int)
)

In [9]:
# Creating features for modeling
from datetime import datetime
import re
import pandas as pd
from dateutil.parser import parse
from dateutil import parser


# Encode 'searchDate' and 'flightDate' as Unix timestamps (whole seconds since the epoch).
# Subtracting the epoch and floor-dividing by one second does not depend on the
# resolution of the datetime64 column (ns/us/ms/s) or on the platform's default
# integer width, both of which `.astype(int) // 10**9` silently assumed.
# Naive values are interpreted as UTC, which yields the same epoch seconds as before.
_EPOCH_UTC = pd.Timestamp('1970-01-01', tz='UTC')
_ONE_SECOND = pd.Timedelta(seconds=1)
for date_col in ('searchDate', 'flightDate'):
    df[date_col + 'Num'] = (pd.to_datetime(df[date_col], utc=True) - _EPOCH_UTC) // _ONE_SECOND

# Label-encode each airport column that is present in the frame.
# pd.factorize assigns integer codes in order of first appearance, independently per
# column; the code -> label table is discarded here, so the encoding is only
# reproducible from the same data in the same row order.
for airport_col in (
    'startingAirport',
    'destinationAirport',
    'segmentsArrivalAirportCode',
    'segmentsDepartureAirportCode',
):
    if airport_col in df.columns:
        df[airport_col + 'Num'] = pd.factorize(df[airport_col])[0]
    
# Matches a duration fragment such as '1H30M', '5H', '45M' or '1DT2H30M'.
# The lookahead demands at least one <digits><D|H|M> component so that the otherwise
# all-optional pattern cannot succeed on an empty match.
_DURATION_RE = re.compile(r'(?=\d+[DHM])(?:(\d+)D)?T?(?:(\d+)H)?(?:(\d+)M)?')


def convert_duration(duration_str):
    """Convert a duration string such as 'PT2H30M' to decimal hours in 'x.xx' format.

    Days, hours and minutes are each optional, so 'PT5H', 'PT45M' and 'P1DT2H30M'
    are accepted in addition to the full 'PT<h>H<m>M' form.

    Args:
        duration_str: Text containing the duration. Non-string values are treated
            as unparseable.

    Returns:
        The total number of hours formatted with two decimals, as a string
        (e.g. '2.50'), or None when no duration component is found.
    """
    if not isinstance(duration_str, str):
        return None

    match = _DURATION_RE.search(duration_str)
    if match is None:
        return None

    # Components absent from the text come back as None and count as zero.
    days, hours, minutes = (int(part) if part else 0 for part in match.groups())
    total_hours = days * 24 + hours + minutes / 60.0
    return '{:.2f}'.format(total_hours)

# Derive decimal-hour durations from 'travelDuration_minutes' when that column exists.
# The string cast has to sit inside the guard: touching the column outside it raises
# KeyError on a frame without the column, which defeats the membership check.
if 'travelDuration_minutes' in df.columns:
    df['travelDuration_minutes'] = df['travelDuration_minutes'].astype(str)
    df['travelDurationNum'] = df['travelDuration_minutes'].apply(convert_duration)
    
    
##########################################################################################

# Parse the first departure timestamp of each row and derive calendar features from it.
# The raw value may hold several '||'-separated timestamps; only the first one is used.
raw_departure = df['segmentsDepartureTimeRaw']
first_departure_text = (
    raw_departure.astype(str)
    .str.split(r'\|\|')
    .str[0]
    # astype(str) turns missing values into the literal text 'nan'/'None'; put the
    # nulls back so the checks below skip them instead of handing 'nan' to dateutil.
    .where(raw_departure.notna())
)

# Despite its name, this column ends up holding parsed datetime objects (None when missing).
df['segmentsDepartureTimeRawStr'] = first_departure_text.apply(
    lambda x: parser.parse(x) if pd.notna(x) else None
)

# Every derived feature is null-guarded the same way, so a missing departure time
# produces a missing feature rather than an AttributeError.
departure_dt = df['segmentsDepartureTimeRawStr']
df['day_of_week'] = departure_dt.apply(lambda x: x.strftime('%A') if pd.notna(x) else None)
df['departure_month'] = departure_dt.apply(lambda x: x.month if pd.notna(x) else None)
# Minutes since midnight bucketed into 2-hour bins (0-11).
df['departure_time_bin'] = departure_dt.apply(
    lambda x: (x.hour * 60 + x.minute) // 120 if pd.notna(x) else None
)
    
##########################################################################################

# Weekday names in calendar order; a name's 1-based position is its numeric code
# (Monday=1 ... Sunday=7)
WEEKDAY_NAMES = (
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
)
day_to_number = {name: position for position, name in enumerate(WEEKDAY_NAMES, start=1)}

# Translate the weekday name column into its numeric code
df['departure_day_of_week'] = df['day_of_week'].map(day_to_number)
 

In [10]:
# Data cleaning: the distance column contains the placeholder text "PT0H0M";
# rewrite it as "0" so the column can be cast to a numeric type
is_placeholder_distance = df['totalTravelDistance'] == "PT0H0M"
df.loc[is_placeholder_distance, 'totalTravelDistance'] = "0"

# Cast the distance and the fare to floats
for numeric_col in ('totalTravelDistance', 'totalFare'):
    df[numeric_col] = df[numeric_col].astype('float')

In [13]:
# Show the engineered columns that feed the model
preview_columns = [
    'startingAirportNum',
    'destinationAirportNum',
    'flightDateNum',
    'departure_time_bin',
    'departure_day_of_week',
    'departure_month',
    'segmentsCabinCodeNum',
    'totalTravelDistance',
]
df[preview_columns]

Unnamed: 0,startingAirportNum,destinationAirportNum,flightDateNum,departure_time_bin,departure_day_of_week,departure_month,segmentsCabinCodeNum,totalTravelDistance
0,0,0,1650153600,8,7,4,2,0.0
1,0,0,1650153600,10,7,4,2,762.0
2,0,0,1650153600,8,7,4,2,762.0
3,0,0,1650153600,7,7,4,2,2105.0
4,0,0,1650153600,8,7,4,2,762.0
...,...,...,...,...,...,...,...,...
13519994,15,12,1654214400,6,5,6,2,2652.0
13519995,15,12,1654214400,3,5,6,2,2087.0
13519996,15,12,1654214400,11,5,6,2,3051.0
13519997,15,12,1654214400,3,5,6,2,2606.0


In [14]:
# Gradient boosting fare model: train on 80% of the rows, score on the remaining 20%

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Column to predict and the engineered columns used as predictors
target = 'totalFare'
features = [
    'startingAirportNum', 'destinationAirportNum', 'flightDateNum',
    'departure_time_bin', 'departure_day_of_week', 'departure_month',
    'segmentsCabinCodeNum', 'totalTravelDistance',
]

# Hold out a fifth of the data for evaluation; the fixed seed keeps the split repeatable
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train a 100-tree gradient boosting regressor (fit() returns the estimator itself)
model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out fares and score them
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the evaluation metrics
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Squared Error (MSE): 19217.47
R-squared (R2): 0.55


In [16]:
# Persist the trained model to disk
from joblib import dump

# Write with compression: the uncompressed file was 153 MB, which made the heroku app fail.
MODEL_OUTPUT_PATH = '../models/model_gb_boost.joblib'
dump(model, MODEL_OUTPUT_PATH, compress=3)

['../models/model_gb_boost.joblib']

4
