# Airfare Price Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data Exploration

In [3]:
data = pd.read_excel("ML Live Flight Fare Resourses16963295320.xlsx")

In [4]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


# Data Cleaning and Manipulating

#### Since columns Route and Additional info are not affecting to flight price prediction so it is bettor to remove

In [5]:
data.drop(["Route","Additional_Info"],axis=1,inplace=True)

In [6]:
data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897
1,Air India,1/05/2019,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302


In [7]:
data["Airline"].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [8]:
data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Price              0
dtype: int64

In [9]:
data[data["Total_Stops"].isnull()==True]

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price
9039,Air India,6/05/2019,Delhi,Cochin,09:45,09:25 07 May,23h 40m,,7480


In [10]:
data.dropna(inplace=True)

In [11]:
data.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        0
Price              0
dtype: int64

In [12]:
pd.to_datetime(data["Date_of_Journey"]).dt.year.unique()

array([2019])

In [13]:
data["journey_month"]=pd.to_datetime(data["Date_of_Journey"]).dt.month

data["journey_date"]=pd.to_datetime(data["Date_of_Journey"]).dt.day

data.drop("Date_of_Journey",axis=1,inplace=True)

In [14]:
data.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price,journey_month,journey_date
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897,3,24
1,Air India,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662,5,1
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882,6,9
3,IndiGo,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218,5,12
4,IndiGo,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302,3,1


In [15]:
data["Dep_hour"]=pd.to_datetime(data["Dep_Time"]).dt.hour

data["Dep_minutes"]=pd.to_datetime(data["Dep_Time"]).dt.minute

In [16]:
data.drop("Dep_Time",axis=1,inplace=True)

In [17]:
data.head()

Unnamed: 0,Airline,Source,Destination,Arrival_Time,Duration,Total_Stops,Price,journey_month,journey_date,Dep_hour,Dep_minutes
0,IndiGo,Banglore,New Delhi,01:10 22 Mar,2h 50m,non-stop,3897,3,24,22,20
1,Air India,Kolkata,Banglore,13:15,7h 25m,2 stops,7662,5,1,5,50
2,Jet Airways,Delhi,Cochin,04:25 10 Jun,19h,2 stops,13882,6,9,9,25
3,IndiGo,Kolkata,Banglore,23:30,5h 25m,1 stop,6218,5,12,18,5
4,IndiGo,Banglore,New Delhi,21:35,4h 45m,1 stop,13302,3,1,16,50


In [18]:
data["Arr_hour"]=pd.to_datetime(data["Arrival_Time"]).dt.hour

data["Arr_minutes"]=pd.to_datetime(data["Arrival_Time"]).dt.minute

In [19]:
data.drop("Arrival_Time",axis=1,inplace=True)

In [20]:
data["dur_hours"]=data["Duration"].str.split("h").str[0]

In [21]:
data["dur_minutes"]=data["Duration"].str.split("h").str[1].str.split("m").str[0].str.strip()

In [22]:
data.drop("Duration",axis=1,inplace=True)

In [23]:
data.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,journey_month,journey_date,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes,dur_hours,dur_minutes
0,IndiGo,Banglore,New Delhi,non-stop,3897,3,24,22,20,1,10,2,50.0
1,Air India,Kolkata,Banglore,2 stops,7662,5,1,5,50,13,15,7,25.0
2,Jet Airways,Delhi,Cochin,2 stops,13882,6,9,9,25,4,25,19,
3,IndiGo,Kolkata,Banglore,1 stop,6218,5,12,18,5,23,30,5,25.0
4,IndiGo,Banglore,New Delhi,1 stop,13302,3,1,16,50,21,35,4,45.0


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10682 entries, 0 to 10682
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Airline        10682 non-null  object
 1   Source         10682 non-null  object
 2   Destination    10682 non-null  object
 3   Total_Stops    10682 non-null  object
 4   Price          10682 non-null  int64 
 5   journey_month  10682 non-null  int32 
 6   journey_date   10682 non-null  int32 
 7   Dep_hour       10682 non-null  int32 
 8   Dep_minutes    10682 non-null  int32 
 9   Arr_hour       10682 non-null  int32 
 10  Arr_minutes    10682 non-null  int32 
 11  dur_hours      10682 non-null  object
 12  dur_minutes    10681 non-null  object
dtypes: int32(6), int64(1), object(6)
memory usage: 918.0+ KB


In [25]:
data["dur_hours"].unique()

array(['2', '7', '19', '5', '4', '15', '21', '25', '13', '12', '26', '22',
       '23', '20', '10', '6', '11', '8', '16', '3', '27', '1', '14', '9',
       '18', '17', '24', '30', '28', '29', '37', '34', '38', '35', '36',
       '47', '33', '32', '31', '42', '39', '5m', '41', '40'], dtype=object)

In [26]:
data["dur_minutes"].unique()

array(['50', '25', '', '45', '30', '5', '15', '35', '10', '20', '55',
       '40', nan], dtype=object)

In [27]:
data["dur_minutes"][data["dur_minutes"]==""]=0

In [28]:
data["dur_minutes"].replace(np.nan,0,inplace=True)

In [29]:
data["dur_hours"].replace("[m]","",regex=True,inplace=True)

In [30]:
data[["dur_minutes","dur_hours"]] = data[["dur_minutes","dur_hours"]].astype(int)

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10682 entries, 0 to 10682
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Airline        10682 non-null  object
 1   Source         10682 non-null  object
 2   Destination    10682 non-null  object
 3   Total_Stops    10682 non-null  object
 4   Price          10682 non-null  int64 
 5   journey_month  10682 non-null  int32 
 6   journey_date   10682 non-null  int32 
 7   Dep_hour       10682 non-null  int32 
 8   Dep_minutes    10682 non-null  int32 
 9   Arr_hour       10682 non-null  int32 
 10  Arr_minutes    10682 non-null  int32 
 11  dur_hours      10682 non-null  int32 
 12  dur_minutes    10682 non-null  int32 
dtypes: int32(8), int64(1), object(4)
memory usage: 834.5+ KB


In [32]:
data.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,journey_month,journey_date,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes,dur_hours,dur_minutes
0,IndiGo,Banglore,New Delhi,non-stop,3897,3,24,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2 stops,7662,5,1,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2 stops,13882,6,9,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1 stop,6218,5,12,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1 stop,13302,3,1,16,50,21,35,4,45


In [33]:
data["Total_Stops"].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [34]:
data["No_of_stops"] = data["Total_Stops"].replace(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],[0,2,1,3,4])

In [35]:
data.drop("Total_Stops",axis=1,inplace=True)

In [36]:
data.head(5)

Unnamed: 0,Airline,Source,Destination,Price,journey_month,journey_date,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes,dur_hours,dur_minutes,No_of_stops
0,IndiGo,Banglore,New Delhi,3897,3,24,22,20,1,10,2,50,0
1,Air India,Kolkata,Banglore,7662,5,1,5,50,13,15,7,25,2
2,Jet Airways,Delhi,Cochin,13882,6,9,9,25,4,25,19,0,2
3,IndiGo,Kolkata,Banglore,6218,5,12,18,5,23,30,5,25,1
4,IndiGo,Banglore,New Delhi,13302,3,1,16,50,21,35,4,45,1


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10682 entries, 0 to 10682
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Airline        10682 non-null  object
 1   Source         10682 non-null  object
 2   Destination    10682 non-null  object
 3   Price          10682 non-null  int64 
 4   journey_month  10682 non-null  int32 
 5   journey_date   10682 non-null  int32 
 6   Dep_hour       10682 non-null  int32 
 7   Dep_minutes    10682 non-null  int32 
 8   Arr_hour       10682 non-null  int32 
 9   Arr_minutes    10682 non-null  int32 
 10  dur_hours      10682 non-null  int32 
 11  dur_minutes    10682 non-null  int32 
 12  No_of_stops    10682 non-null  int64 
dtypes: int32(8), int64(2), object(3)
memory usage: 834.5+ KB


In [38]:
data.head()

Unnamed: 0,Airline,Source,Destination,Price,journey_month,journey_date,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes,dur_hours,dur_minutes,No_of_stops
0,IndiGo,Banglore,New Delhi,3897,3,24,22,20,1,10,2,50,0
1,Air India,Kolkata,Banglore,7662,5,1,5,50,13,15,7,25,2
2,Jet Airways,Delhi,Cochin,13882,6,9,9,25,4,25,19,0,2
3,IndiGo,Kolkata,Banglore,6218,5,12,18,5,23,30,5,25,1
4,IndiGo,Banglore,New Delhi,13302,3,1,16,50,21,35,4,45,1


# Feature Engineering

In [39]:
data["Airline"].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [40]:
data["Source"].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [41]:
data["Destination"].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [42]:
from sklearn.preprocessing import LabelEncoder

In [43]:
LE=LabelEncoder()

In [44]:
data["Airline"]=LE.fit_transform(data["Airline"])

In [45]:
LE1=LabelEncoder()

In [46]:
data["Source"]=LE1.fit_transform(data["Source"])

In [47]:
LE2=LabelEncoder()

In [48]:
data["Destination"]=LE2.fit_transform(data["Destination"])

In [49]:
data.head()

Unnamed: 0,Airline,Source,Destination,Price,journey_month,journey_date,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes,dur_hours,dur_minutes,No_of_stops
0,3,0,5,3897,3,24,22,20,1,10,2,50,0
1,1,3,0,7662,5,1,5,50,13,15,7,25,2
2,4,2,1,13882,6,9,9,25,4,25,19,0,2
3,3,3,0,6218,5,12,18,5,23,30,5,25,1
4,3,0,5,13302,3,1,16,50,21,35,4,45,1


# Model Selection

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
x=data.drop("Price",axis=1)

In [52]:
y=data["Price"]

In [119]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [120]:
x_train.shape

(8545, 12)

In [121]:
x_test.shape

(2137, 12)

In [122]:
data.head()

Unnamed: 0,Airline,Source,Destination,Price,journey_month,journey_date,Dep_hour,Dep_minutes,Arr_hour,Arr_minutes,dur_hours,dur_minutes,No_of_stops
0,3,0,5,3897,3,24,22,20,1,10,2,50,0
1,1,3,0,7662,5,1,5,50,13,15,7,25,2
2,4,2,1,13882,6,9,9,25,4,25,19,0,2
3,3,3,0,6218,5,12,18,5,23,30,5,25,1
4,3,0,5,13302,3,1,16,50,21,35,4,45,1


In [123]:
from sklearn.preprocessing import MinMaxScaler

In [124]:
scaler = MinMaxScaler()

In [125]:
from sklearn.linear_model import LinearRegression

In [126]:
model=LinearRegression()

In [127]:
model.fit(scaler.fit_transform(x_train),y_train)

In [128]:
model.score(scaler.fit_transform(x_train),y_train)

0.4359904903404066

In [129]:
model.score(scaler.fit_transform(x_test),y_test)

0.3353849679104801

In [130]:
from sklearn.linear_model import Lasso,Ridge

In [131]:
reg1=Lasso()
reg2=Ridge()

In [132]:
reg1.fit(x_train,y_train)
reg2.fit(x_train,y_train)

In [133]:
print(reg1.score(x_train,y_train))
print(reg2.score(x_train,y_train))

0.43599014850191653
0.4359904427502641


In [134]:
print(reg1.score(x_test,y_test))
print(reg2.score(x_test,y_test))

0.4381280610143886
0.43814725529943266


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor

In [136]:
list=[DecisionTreeRegressor,SVR,RandomForestRegressor,AdaBoostRegressor]

In [137]:
for i in list:
    model_new=i()
    model_new.fit(scaler.fit_transform(x_train),y_train)
    print(model_new.score(scaler.fit_transform(x_train),y_train),"\t",model_new.score(scaler.fit_transform(x_test),y_test),"\t",i)

0.9692484150527355 	 0.7333314158271533 	 <class 'sklearn.tree._classes.DecisionTreeRegressor'>
0.007646626183045013 	 0.010285620584348787 	 <class 'sklearn.svm._classes.SVR'>
0.9534263120504168 	 0.8421356425709641 	 <class 'sklearn.ensemble._forest.RandomForestRegressor'>
0.3572814292264195 	 0.3277982116306083 	 <class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>


### RandomForestRegressor achived the highest accuracy, making it the most suitable model for our predictive task

In [72]:
Final_model = RandomForestRegressor(n_estimators = 100)

In [73]:
Final_model.fit(scaler.fit_transform(x_train),y_train)

In [74]:
Final_model.score(scaler.fit_transform(x_train),y_train)

0.9535509510004445

In [75]:
Final_model.score(scaler.fit_transform(x_test),y_test)

0.8390599255602325

In [76]:
y_pred = Final_model.predict(scaler.fit_transform(x_test))

# Model Evaluation

In [77]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score


In [78]:
mean_absolute_error(y_test, y_pred)

1182.2075698000463

In [79]:
np.sqrt(mean_squared_error(y_test, y_pred))

3470198.9498850573

In [80]:
r2_score(y_test, y_pred)

0.8390599255602325

### Since R2 Score is in the range 50 to 80 it is valuable so this machine learning model is acceptable