In [60]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import tree

In [6]:
original_df = pd.read_excel(r'C:\Users\elill\OneDrive\Desktop\flight\Data_Train.xlsx')

In [7]:
original_df.Airline.unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [8]:
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [9]:
cleaned_df = original_df

In [10]:
#data cleaning

#cleaned_df['Date_of_Journey'] = pd.to_datetime(original_df['Date_of_Journey'], format="%d/%m/%Y")
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [11]:
#feature eng

#extract month and year and day of the week etc from doj
cleaned_df.head(25)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU → BLR,09:00,11:25,2h 25m,non-stop,No info,3873
6,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15h 30m,1 stop,In-flight meal not included,11087
7,Jet Airways,01/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21h 5m,1 stop,No info,22270
8,Jet Airways,12/03/2019,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25h 30m,1 stop,In-flight meal not included,11087
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7h 50m,1 stop,No info,8625


In [12]:
cleaned_df['Date_of_Journey'] = pd.to_datetime(cleaned_df['Date_of_Journey'], format= '%d/%m/%Y')

In [13]:
#data cleaning
cleaned_df = original_df
#dropping
cleaned_df = cleaned_df[cleaned_df['Duration']!='5m']
cleaned_df = cleaned_df.dropna()
cleaned_df.info()

cleaned_df[cleaned_df['Total_Stops'].isnull()]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10681 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Airline          10681 non-null  object        
 1   Date_of_Journey  10681 non-null  datetime64[ns]
 2   Source           10681 non-null  object        
 3   Destination      10681 non-null  object        
 4   Route            10681 non-null  object        
 5   Dep_Time         10681 non-null  object        
 6   Arrival_Time     10681 non-null  object        
 7   Duration         10681 non-null  object        
 8   Total_Stops      10681 non-null  object        
 9   Additional_Info  10681 non-null  object        
 10  Price            10681 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(9)
memory usage: 1001.3+ KB


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price


In [14]:
cleaned = cleaned_df['Duration'].str.split('h', expand=True)
cleaned.columns = ['hour','min']
cleaned['min'] = cleaned['min'].str.replace('m','')


cleaned['hour'] = pd.to_numeric(cleaned['hour'])
cleaned['min']=pd.to_numeric(cleaned['min'])/60
cleaned['min']= cleaned['min'].fillna(0)
cleaned['Duration'] = cleaned['hour']+cleaned['min']
cleaned_df['Duration']= cleaned['Duration']
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10681 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Airline          10681 non-null  object        
 1   Date_of_Journey  10681 non-null  datetime64[ns]
 2   Source           10681 non-null  object        
 3   Destination      10681 non-null  object        
 4   Route            10681 non-null  object        
 5   Dep_Time         10681 non-null  object        
 6   Arrival_Time     10681 non-null  object        
 7   Duration         10681 non-null  float64       
 8   Total_Stops      10681 non-null  object        
 9   Additional_Info  10681 non-null  object        
 10  Price            10681 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 1001.3+ KB


In [15]:
cleaned_df.Total_Stops.unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

0     0
1     2
2     2
3     1
4     1
5     0
6     1
7     1
8     1
9     1
10    1
11    0
12    0
13    1
14    0
15    2
16    1
17    1
18    2
19    1
20    1
21    1
22    0
23    0
24    1
Name: Total_Stops, dtype: int32

In [17]:
cleaned_df.Dep_Time.head(25)

0     22:20
1     05:50
2     09:25
3     18:05
4     16:50
5     09:00
6     18:55
7     08:00
8     08:55
9     11:25
10    09:45
11    20:20
12    11:40
13    21:10
14    17:15
15    16:40
16    08:45
17    14:00
18    20:15
19    16:00
20    14:10
21    22:00
22    04:00
23    18:55
24    18:55
Name: Dep_Time, dtype: object

In [18]:
#feature eng

#create premium flag
#seperate class information as a seperate column remove class information from airline
#extract month and year and day from doj
# onehot encoding to airline source and destination
#bucket departure and arrival times
#early morning 0-6
#morning 6-11
#noon 12-15
#afternoon 15-18
#evening 18-21
#night 21-24
#convert total_stops to numbers
#drop route
#onehot encoding to airline, source, destination, and additional info

In [19]:
def change_into_datetime(Date_of_Journey):
    cleaned_df[Date_of_Journey]=pd.to_datetime(cleaned_df[Date_of_Journey])

In [20]:
for i in ['Date_of_Journey']:
    change_into_datetime(i)

In [21]:
cleaned_df['journey_day']=cleaned_df['Date_of_Journey'].dt.day
cleaned_df['journey_month']=cleaned_df['Date_of_Journey'].dt.month

In [22]:
cleaned_df.head(10)


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2.833333,0,No info,3897,24,3
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7.416667,2,No info,7662,1,5
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19.0,2,No info,13882,9,6
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5.416667,1,No info,6218,12,5
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4.75,1,No info,13302,1,3
5,SpiceJet,2019-06-24,Kolkata,Banglore,CCU → BLR,09:00,11:25,2.416667,0,No info,3873,24,6
6,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15.5,1,In-flight meal not included,11087,12,3
7,Jet Airways,2019-03-01,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21.083333,1,No info,22270,1,3
8,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25.5,1,In-flight meal not included,11087,12,3
9,Multiple carriers,2019-05-27,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7.833333,1,No info,8625,27,5


In [23]:
#dict={'non-stop':0, '2 stops':2, '1 stop':1, '3 stops':3, '4 stops':4}
#cleaned_df['Total_Stops']=cleaned_df['Total_Stops'].map(dict)

In [24]:
cleaned_df.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2.833333,0,No info,3897,24,3
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7.416667,2,No info,7662,1,5
2,Jet Airways,2019-06-09,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19.0,2,No info,13882,9,6
3,IndiGo,2019-05-12,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5.416667,1,No info,6218,12,5
4,IndiGo,2019-03-01,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4.75,1,No info,13302,1,3
5,SpiceJet,2019-06-24,Kolkata,Banglore,CCU → BLR,09:00,11:25,2.416667,0,No info,3873,24,6
6,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15.5,1,In-flight meal not included,11087,12,3
7,Jet Airways,2019-03-01,Banglore,New Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21.083333,1,No info,22270,1,3
8,Jet Airways,2019-03-12,Banglore,New Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25.5,1,In-flight meal not included,11087,12,3
9,Multiple carriers,2019-05-27,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7.833333,1,No info,8625,27,5


In [25]:
cleaned_df = cleaned_df[['Airline','Source','Destination','Dep_Time','Arrival_Time','Duration','Total_Stops','Additional_Info','Price','journey_day','journey_month']]

In [26]:
cleaned_df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2.833333,0,No info,3897,24,3
1,Air India,Kolkata,Banglore,05:50,13:15,7.416667,2,No info,7662,1,5
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19.0,2,No info,13882,9,6
3,IndiGo,Kolkata,Banglore,18:05,23:30,5.416667,1,No info,6218,12,5
4,IndiGo,Banglore,New Delhi,16:50,21:35,4.75,1,No info,13302,1,3


In [27]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10681 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          10681 non-null  object 
 1   Source           10681 non-null  object 
 2   Destination      10681 non-null  object 
 3   Dep_Time         10681 non-null  object 
 4   Arrival_Time     10681 non-null  object 
 5   Duration         10681 non-null  float64
 6   Total_Stops      10681 non-null  int32  
 7   Additional_Info  10681 non-null  object 
 8   Price            10681 non-null  int64  
 9   journey_day      10681 non-null  int64  
 10  journey_month    10681 non-null  int64  
dtypes: float64(1), int32(1), int64(3), object(6)
memory usage: 959.6+ KB


In [28]:
cleaned_df.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [29]:
cleaned_df['Additional_Info'] = cleaned_df['Additional_Info'].str.replace("No Info","No info")
cleaned_df.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [30]:
cleaned_df.Airline.unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [31]:
#business class =1 and premium class =2
dict={'Business class':1, 'Vistara Premium economy':2, 'Jet Airways Business':1, 'Multiple carriers Premium economy':2}
cleaned_df['Class1']=cleaned_df['Airline'].map(dict).fillna(0)
cleaned_df['Class2']=cleaned_df['Additional_Info'].map(dict).fillna(0)



cleaned_df['class'] = cleaned_df['Class1'] + cleaned_df['Class2']

In [32]:
cleaned_df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month,Class1,Class2,class
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2.833333,0,No info,3897,24,3,0.0,0.0,0.0
1,Air India,Kolkata,Banglore,05:50,13:15,7.416667,2,No info,7662,1,5,0.0,0.0,0.0
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19.0,2,No info,13882,9,6,0.0,0.0,0.0
3,IndiGo,Kolkata,Banglore,18:05,23:30,5.416667,1,No info,6218,12,5,0.0,0.0,0.0
4,IndiGo,Banglore,New Delhi,16:50,21:35,4.75,1,No info,13302,1,3,0.0,0.0,0.0


In [33]:
#cleaned_df.groupby('class').count()

df_tmp = cleaned_df[cleaned_df['class']>0]

df_tmp.head(50)

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month,Class1,Class2,class
426,Vistara Premium economy,Banglore,New Delhi,16:00,18:35,2.583333,0,No info,11793,3,3,2.0,0.0,2.0
657,Jet Airways Business,Banglore,New Delhi,05:45,10:45,5.0,1,No info,52229,1,3,1.0,0.0,1.0
717,Multiple carriers Premium economy,Delhi,Cochin,07:30,19:15,11.75,1,No info,10161,21,3,2.0,0.0,2.0
1799,Multiple carriers Premium economy,Delhi,Cochin,07:30,21:00,13.5,1,No info,9845,21,3,2.0,0.0,2.0
2924,Jet Airways Business,Banglore,New Delhi,05:45,11:25,5.666667,1,Business class,79512,1,3,1.0,1.0,2.0
3032,Jet Airways,Banglore,New Delhi,05:50,12:20,6.5,1,Business class,28097,1,3,0.0,1.0,1.0
4627,Multiple carriers Premium economy,Delhi,Cochin,08:55,19:15,10.333333,1,No info,10161,21,3,2.0,0.0,2.0
4718,Multiple carriers Premium economy,Delhi,Cochin,06:00,21:00,15.0,1,No info,9845,21,3,2.0,0.0,2.0
5090,Multiple carriers Premium economy,Delhi,Cochin,06:00,13:20,7.333333,1,No info,11269,21,3,2.0,0.0,2.0
5372,Jet Airways Business,Banglore,New Delhi,05:45,12:25,6.666667,1,Business class,62427,1,3,1.0,1.0,2.0


In [34]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10681 entries, 0 to 10682
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          10681 non-null  object 
 1   Source           10681 non-null  object 
 2   Destination      10681 non-null  object 
 3   Dep_Time         10681 non-null  object 
 4   Arrival_Time     10681 non-null  object 
 5   Duration         10681 non-null  float64
 6   Total_Stops      10681 non-null  int32  
 7   Additional_Info  10681 non-null  object 
 8   Price            10681 non-null  int64  
 9   journey_day      10681 non-null  int64  
 10  journey_month    10681 non-null  int64  
 11  Class1           10681 non-null  float64
 12  Class2           10681 non-null  float64
 13  class            10681 non-null  float64
dtypes: float64(4), int32(1), int64(3), object(6)
memory usage: 1.2+ MB


In [35]:
cleaned_df['Additional_Info'] = cleaned_df['Additional_Info'].str.replace("Business class","No info")
cleaned_df.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover',
       '1 Long layover', 'Change airports', 'Red-eye flight',
       '2 Long layover'], dtype=object)

In [36]:
import numpy

cleaned_df['Class'] = np.select([cleaned_df.Class1 != cleaned_df.Class2], [cleaned_df.Class1 != cleaned_df.Class2],default =cleaned_df.Class1)

In [37]:
cleaned_df.groupby('Class').count()

Unnamed: 0_level_0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month,Class1,Class2,class
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.0,10658,10658,10658,10658,10658,10658,10658,10658,10658,10658,10658,10658,10658,10658
1.0,23,23,23,23,23,23,23,23,23,23,23,23,23,23


In [38]:
cleaned_df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,journey_day,journey_month,Class1,Class2,class,Class
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2.833333,0,No info,3897,24,3,0.0,0.0,0.0,0.0
1,Air India,Kolkata,Banglore,05:50,13:15,7.416667,2,No info,7662,1,5,0.0,0.0,0.0,0.0
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19.0,2,No info,13882,9,6,0.0,0.0,0.0,0.0
3,IndiGo,Kolkata,Banglore,18:05,23:30,5.416667,1,No info,6218,12,5,0.0,0.0,0.0,0.0
4,IndiGo,Banglore,New Delhi,16:50,21:35,4.75,1,No info,13302,1,3,0.0,0.0,0.0,0.0


In [39]:
#early morning 3-6
#morn 6-11
#noon 11-14
#after noon 14-17
#evening 17-20
#night 20-3

In [40]:
cleaned_df['Dep_hr'] = cleaned_df['Dep_Time'].str.split(':',expand=True)[0].astype(int)
cleaned_df['Arrival_hr'] = cleaned_df['Arrival_Time'].str.split(':',expand=True)[0].astype(int)

In [41]:
bins = [0,3,7,11,15,19,np.inf]
names = ['night','early morning','morning','noon','after noon','evening']

cleaned_df['dep_time_bin'] = pd.cut(cleaned_df['Dep_hr'],bins,labels=names).fillna('night')
cleaned_df['arrival_time_bin'] = pd.cut(cleaned_df['Arrival_hr'],bins,labels=names).fillna('night')


#cleaned_df[cleaned_df.arrival_time_bin.isna()].head(25)




In [42]:

#encoded_df = cleaned_df['Airline','Source','Destination','Duration','Total_Stops','Additional_Info','Price','journey_day','journey_month','Class','dep_time_bin','arrival_time_bin']

In [43]:
cleaned_df.columns

Index(['Airline', 'Source', 'Destination', 'Dep_Time', 'Arrival_Time',
       'Duration', 'Total_Stops', 'Additional_Info', 'Price', 'journey_day',
       'journey_month', 'Class1', 'Class2', 'class', 'Class', 'Dep_hr',
       'Arrival_hr', 'dep_time_bin', 'arrival_time_bin'],
      dtype='object')

In [44]:
cat_cols = ['Airline', 'Source', 'Destination', 'Additional_Info', 'dep_time_bin', 'arrival_time_bin']

encoder = OrdinalEncoder().fit_transform(cleaned_df[cat_cols])

encoder = pd.DataFrame(encoder,columns=cat_cols)

encoder.head()

num_cols = ['Duration','Total_Stops','Class','Price','journey_day','journey_month']

encoded_df = pd.concat([encoder, cleaned_df[num_cols]],axis=1)

encoded_df.info()

df_target = encoded_df['Price']


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10683 entries, 0 to 10682
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Airline           10681 non-null  float64
 1   Source            10681 non-null  float64
 2   Destination       10681 non-null  float64
 3   Additional_Info   10681 non-null  float64
 4   dep_time_bin      10681 non-null  float64
 5   arrival_time_bin  10681 non-null  float64
 6   Duration          10681 non-null  float64
 7   Total_Stops       10681 non-null  float64
 8   Class             10681 non-null  float64
 9   Price             10681 non-null  float64
 10  journey_day       10681 non-null  float64
 11  journey_month     10681 non-null  float64
dtypes: float64(12)
memory usage: 1.1 MB


In [47]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7478 entries, 9481 to 3582
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Airline           7476 non-null   float64
 1   Source            7476 non-null   float64
 2   Destination       7476 non-null   float64
 3   Additional_Info   7476 non-null   float64
 4   dep_time_bin      7476 non-null   float64
 5   arrival_time_bin  7476 non-null   float64
 6   Duration          7477 non-null   float64
 7   Total_Stops       7477 non-null   float64
 8   Class             7477 non-null   float64
 9   Price             7477 non-null   float64
 10  journey_day       7477 non-null   float64
 11  journey_month     7477 non-null   float64
dtypes: float64(12)
memory usage: 759.5 KB


In [172]:
feature_list = ['Airline', 'Source', 'Destination',
                'arrival_time_bin','Duration', 'Total_Stops',
                'Class','journey_day','journey_month']

In [173]:
encoded_df = encoded_df.fillna(0)

In [174]:
X = encoded_df[feature_list]
Y= encoded_df['Price']
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state = 123)

x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8546 entries, 5955 to 3582
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Airline           8546 non-null   float64
 1   Source            8546 non-null   float64
 2   Destination       8546 non-null   float64
 3   arrival_time_bin  8546 non-null   float64
 4   Duration          8546 non-null   float64
 5   Total_Stops       8546 non-null   float64
 6   Class             8546 non-null   float64
 7   journey_day       8546 non-null   float64
 8   journey_month     8546 non-null   float64
dtypes: float64(9)
memory usage: 667.7 KB


In [175]:
model = LinearRegression().fit(x_train,y_train)
y_pred = model.predict(x_test)
accuracy = model.score(x_test,y_test)
print("Accuracy is",accuracy*100,'%')

Accuracy is 41.12709980926418 %


In [176]:
r2_score(y_test,y_pred)

0.4112709980926418

In [177]:
#feature_list = ['Airline', 'Source', 'Destination', #'Additional_Info',
                #'dep_time_bin','arrival_time_bin','Duration','Total_Stops',
                #'Class',#'journey_day','journey_month']

model.coef_

array([   88.60399342,  -144.21457784,    65.19398174,  -148.5955855 ,
          69.4243049 ,  3585.56593824, 12991.0527067 ,   -82.95446601,
        -497.63339218])

In [148]:
model.intercept_

7679.868990776806