In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [3]:
flight = pd.read_csv('Data_Train.csv')
flight.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
df_test = pd.read_excel('Test_set.xlsx')
df_test.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [5]:
flight.shape

(10683, 11)

In [6]:
flight.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10683 non-null  object
 1   Date_of_Journey  10683 non-null  object
 2   Source           10683 non-null  object
 3   Destination      10683 non-null  object
 4   Route            10682 non-null  object
 5   Dep_Time         10683 non-null  object
 6   Arrival_Time     10683 non-null  object
 7   Duration         10683 non-null  object
 8   Total_Stops      10682 non-null  object
 9   Additional_Info  10683 non-null  object
 10  Price            10683 non-null  int64 
dtypes: int64(1), object(10)
memory usage: 918.2+ KB


In [7]:
flight.Airline.unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [8]:
flight.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [9]:
flight.Source.unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

# Cleaning

In [10]:
flight['month'] = pd.to_datetime(flight.Date_of_Journey, format='%d/%m/%Y').dt.month

In [12]:
df_test['month'] = pd.to_datetime(df_test.Date_of_Journey, format='%d/%m/%Y').dt.month

In [13]:
flight.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,6
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,3


In [14]:
def arr(x):
    return x.split()[0]

In [15]:
flight['arrival'] = flight.Arrival_Time.map(arr)

In [16]:
df_test['arrival'] = df_test.Arrival_Time.map(arr)

In [17]:
pd.to_datetime(flight.arrival, format='%H:%M')

0       1900-01-01 01:10:00
1       1900-01-01 13:15:00
2       1900-01-01 04:25:00
3       1900-01-01 23:30:00
4       1900-01-01 21:35:00
                ...        
10678   1900-01-01 22:25:00
10679   1900-01-01 23:20:00
10680   1900-01-01 11:20:00
10681   1900-01-01 14:10:00
10682   1900-01-01 19:15:00
Name: arrival, Length: 10683, dtype: datetime64[ns]

In [18]:
def arri(x):
    lst = x.split(':')
    y = float(lst[0])
    if y >= 22 or y < 4:
        return 'Night'
    elif y >= 4 and y < 10:
        return 'Morning'
    elif y >= 10 and y < 16:
        return 'Afternoon'
    elif y >= 16 and y < 22:
        return 'Evening'

In [19]:
flight['arrival'] = flight.arrival.map(arri)

In [20]:
df_test['arrival'] = df_test.arrival.map(arri)

In [21]:
flight.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,arrival
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,Night
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,Afternoon
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,6,Morning
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5,Night
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,3,Evening


In [22]:
flight.Total_Stops.fillna('non-stop', inplace=True)

In [23]:
df_test.Total_Stops.fillna('non-stop', inplace=True)

In [24]:
def stops(x):
    if x == 'non-stop':
        return int(0)
    else:
        return int(x.split()[0])

In [25]:
flight['stops'] = flight.Total_Stops.map(stops)

In [26]:
df_test['stops'] = df_test.Total_Stops.map(stops)

In [27]:
flight.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,arrival,stops
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,Night,0
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,Afternoon,2
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,6,Morning,2
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5,Night,1
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,3,Evening,1


In [28]:
import re

def dur(x):
    lst = re.findall(r'\d+', x)
    return float('.'.join(lst))

In [29]:
flight['duration'] = flight.Duration.map(dur)

In [30]:
df_test['duration'] = df_test.Duration.map(dur)

In [31]:
flight.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,arrival,stops,duration
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,Night,0,2.5
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,Afternoon,2,7.25
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,6,Morning,2,19.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5,Night,1,5.25
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,3,Evening,1,4.45


In [32]:
flight.Additional_Info.unique()

array(['No info', 'In-flight meal not included',
       'No check-in baggage included', '1 Short layover', 'No Info',
       '1 Long layover', 'Change airports', 'Business class',
       'Red-eye flight', '2 Long layover'], dtype=object)

In [33]:
def bag(x):
    if x == 'No check-in baggage included':
        return 0
    else:
        return 1

flight['check_in_baggage'] = flight.Additional_Info.map(bag)

In [34]:
def meal(x):
    if x == 'In-flight meal not included':
        return 0
    else:
        return 1
    
flight['in_flight_meal'] = flight.Additional_Info.map(meal)

In [35]:
df_test['check_in_baggage'] = df_test.Additional_Info.map(bag)
df_test['in_df_test_meal'] = df_test.Additional_Info.map(meal)

In [36]:
flight.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,month,arrival,stops,duration,check_in_baggage,in_flight_meal
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,3,Night,0,2.5,1,1
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662,5,Afternoon,2,7.25,1,1
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,6,Morning,2,19.0,1,1
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218,5,Night,1,5.25,1,1
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302,3,Evening,1,4.45,1,1


In [37]:
flight.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route',
       'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info', 'Price', 'month', 'arrival', 'stops', 'duration',
       'check_in_baggage', 'in_flight_meal'],
      dtype='object')

In [38]:
flight.drop(['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info'], axis=1, inplace=True)

In [39]:
df_test.drop(['Date_of_Journey', 'Route', 'Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
       'Additional_Info'], axis=1, inplace=True)

In [40]:
flight.Destination.unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [41]:
flight

Unnamed: 0,Airline,Source,Destination,Price,month,arrival,stops,duration,check_in_baggage,in_flight_meal
0,IndiGo,Banglore,New Delhi,3897,3,Night,0,2.50,1,1
1,Air India,Kolkata,Banglore,7662,5,Afternoon,2,7.25,1,1
2,Jet Airways,Delhi,Cochin,13882,6,Morning,2,19.00,1,1
3,IndiGo,Kolkata,Banglore,6218,5,Night,1,5.25,1,1
4,IndiGo,Banglore,New Delhi,13302,3,Evening,1,4.45,1,1
...,...,...,...,...,...,...,...,...,...,...
10678,Air Asia,Kolkata,Banglore,4107,4,Night,0,2.30,1,1
10679,Air India,Kolkata,Banglore,4145,4,Night,0,2.35,1,1
10680,Jet Airways,Banglore,Delhi,7229,4,Afternoon,0,3.00,1,1
10681,Vistara,Banglore,New Delhi,12648,3,Afternoon,0,2.40,1,1


In [42]:
cat_cols = flight.select_dtypes(include='object')
cat_cols.head()

Unnamed: 0,Airline,Source,Destination,arrival
0,IndiGo,Banglore,New Delhi,Night
1,Air India,Kolkata,Banglore,Afternoon
2,Jet Airways,Delhi,Cochin,Morning
3,IndiGo,Kolkata,Banglore,Night
4,IndiGo,Banglore,New Delhi,Evening


In [43]:
test_cat_cols = df_test.select_dtypes(include='object')

In [44]:
from sklearn.preprocessing import OrdinalEncoder

In [45]:
oe = OrdinalEncoder()
cat = pd.DataFrame(oe.fit_transform(cat_cols), columns=cat_cols.columns)

In [46]:
test_cat = pd.DataFrame(oe.transform(test_cat_cols), columns=test_cat_cols.columns)

In [47]:
num_cols = flight.select_dtypes(include=np.number)
num_cols.head()

Unnamed: 0,Price,month,stops,duration,check_in_baggage,in_flight_meal
0,3897,3,0,2.5,1,1
1,7662,5,2,7.25,1,1
2,13882,6,2,19.0,1,1
3,6218,5,1,5.25,1,1
4,13302,3,1,4.45,1,1


In [48]:
test_num_cols = df_test.select_dtypes(include=np.number)

In [49]:
df1 = pd.concat([num_cols, cat], axis=1)
df1.head()

Unnamed: 0,Price,month,stops,duration,check_in_baggage,in_flight_meal,Airline,Source,Destination,arrival
0,3897,3,0,2.5,1,1,3.0,0.0,5.0,3.0
1,7662,5,2,7.25,1,1,1.0,3.0,0.0,0.0
2,13882,6,2,19.0,1,1,4.0,2.0,1.0,2.0
3,6218,5,1,5.25,1,1,3.0,3.0,0.0,3.0
4,13302,3,1,4.45,1,1,3.0,0.0,5.0,1.0


In [50]:
df1_test = pd.concat([test_num_cols, test_cat], axis=1)
df1_test.head()

Unnamed: 0,month,stops,duration,check_in_baggage,in_df_test_meal,Airline,Source,Destination,arrival
0,6,1,10.55,1,1,4.0,2.0,1.0,2.0
1,5,1,4.0,1,1,3.0,3.0,0.0,0.0
2,5,1,23.45,1,0,4.0,2.0,1.0,1.0
3,5,1,13.0,1,1,6.0,2.0,1.0,1.0
4,6,0,2.5,1,1,0.0,0.0,2.0,3.0


In [None]:
# sns.pairplot()

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [52]:
X = df1.drop("Price", axis=1)
y = df1['Price']

In [53]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
xtrain.shape, xtest.shape

((8546, 9), (2137, 9))

In [55]:
lr = LinearRegression()

lr.fit(xtrain, ytrain)

ypred = lr.predict(xtest)

In [56]:
print("R2 Score:", r2_score(ytest, ypred))
print("RMSE", np.sqrt(mean_squared_error(ypred, ytest)))

R2 Score: 0.40846558308490066
RMSE 3538.2501777513785


In [57]:
from statsmodels.formula.api import ols

In [58]:
flight.columns

Index(['Airline', 'Source', 'Destination', 'Price', 'month', 'arrival',
       'stops', 'duration', 'check_in_baggage', 'in_flight_meal'],
      dtype='object')

In [59]:
lr_model = ols("Price ~ Airline + Source + Destination + month + arrival + stops + duration + check_in_baggage + in_flight_meal", data=df1).fit()
lr_model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.421
Model:,OLS,Adj. R-squared:,0.421
Method:,Least Squares,F-statistic:,862.4
Date:,"Mon, 25 Apr 2022",Prob (F-statistic):,0.0
Time:,21:21:12,Log-Likelihood:,-102360.0
No. Observations:,10683,AIC:,204700.0
Df Residuals:,10673,BIC:,204800.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2924.9326,335.799,8.710,0.000,2266.704,3583.161
Airline,249.9107,15.372,16.257,0.000,219.779,280.043
Source,-211.5131,35.932,-5.886,0.000,-281.946,-141.080
Destination,169.9060,31.310,5.427,0.000,108.533,231.279
month,-410.0685,31.594,-12.979,0.000,-471.999,-348.139
arrival,-84.9563,32.786,-2.591,0.010,-149.223,-20.689
stops,3650.1328,76.679,47.603,0.000,3499.829,3800.437
duration,78.9925,6.063,13.030,0.000,67.109,90.876
check_in_baggage,3208.8491,212.689,15.087,0.000,2791.940,3625.759

0,1,2,3
Omnibus:,8061.618,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,509833.481
Skew:,3.062,Prob(JB):,0.0
Kurtosis:,36.284,Cond. No.,162.0


In [60]:
from sklearn.tree import DecisionTreeRegressor

In [61]:
dtree = DecisionTreeRegressor(max_depth=8, random_state=42)

dtree.fit(xtrain, ytrain)

pred_test_tree= dtree.predict(xtest)
pred_train_tree= dtree.predict(xtrain)
print("Test RMSE:", np.sqrt(mean_squared_error(ytest,pred_test_tree)))
print("Test R2 Score", r2_score(ytest, pred_test_tree))
print("Train R2 Score", r2_score(ytrain, pred_train_tree))

Test RMSE: 2520.5975850146415
Test R2 Score 0.699800492551041
Train R2 Score 0.7855681838531235


In [96]:
# tuned_params = [{'criterion': ['mse', "mae"],
#                  'max_depth': range(2, 10),
#                  'min_samples_split': range(2,10)}]

# dtree = DecisionTreeRegressor()
# grid = GridSearchCV(estimator=dtree, param_grid=tuned_params, cv=5)
# grid.fit(xtrain, ytrain)
# grid.best_params_

{'criterion': 'mse', 'max_depth': 9, 'min_samples_split': 6}

In [98]:
# tuned_params = [{'max_leaf_nodes': range(5,15),
#                  'max_depth': range(2, 10),
#                  'min_samples_split': range(2,10)}]

# dtree = DecisionTreeRegressor()
# grid = GridSearchCV(estimator=dtree, param_grid=tuned_params, cv=5)
# grid.fit(xtrain, ytrain)
# grid.best_params_

{'max_depth': 6, 'max_leaf_nodes': 14, 'min_samples_split': 2}

In [62]:
dtree = DecisionTreeRegressor(criterion='mse', max_leaf_nodes=14, max_depth=9, min_samples_split=6, random_state=42)

dtree.fit(xtrain, ytrain)

pred_test_tree= dtree.predict(xtest)
pred_train_tree= dtree.predict(xtrain)
print("Test RMSE:", np.sqrt(mean_squared_error(ytest,pred_test_tree)))
print("Test R2 Score", r2_score(ytest, pred_test_tree))
print("Train R2 Score", r2_score(ytrain, pred_train_tree))

Test RMSE: 2793.9276503536744
Test R2 Score 0.6311640410462769
Train R2 Score 0.683685688784055


In [63]:
from sklearn.ensemble import RandomForestRegressor

In [64]:
model_rf = RandomForestRegressor(n_estimators=100, max_depth=10, oob_score=True, random_state=42)
model_rf.fit(xtrain, ytrain) 

pred_test_rf = model_rf.predict(xtest)
pred_train_rf = model_rf.predict(xtrain)
print("Test RMSE:", np.sqrt(mean_squared_error(ytest,pred_test_rf)))
print("Test R2 Score", r2_score(ytest, pred_test_rf))
print("Train R2 Score", r2_score(ytrain, pred_train_rf))

Test RMSE: 2434.1890334234663
Test R2 Score 0.7200299681614344
Train R2 Score 0.8286449025230715


In [105]:
# tuned_params = [{'n_estimators':[50, 100, 150, 200, 300],
#                  'max_leaf_nodes': range(5,15),
#                  'max_depth': range(2, 10),
#                  'min_samples_split': range(2,10)}]

# rf_model = RandomForestRegressor()
# grid = GridSearchCV(estimator=rf_model, param_grid=tuned_params, cv=5)
# grid.fit(xtrain, ytrain)
# grid.best_params_

KeyboardInterrupt: 

In [65]:
from xgboost import XGBRegressor

In [66]:
model_xgb = XGBRegressor(n_estimators=100, max_depth=10, oob_score=True, random_state=42)
model_xgb.fit(xtrain, ytrain) 

pred_test_xgb = model_xgb.predict(xtest)
pred_train_xgb = model_xgb.predict(xtrain)
print("Test RMSE:", np.sqrt(mean_squared_error(ytest,pred_test_xgb)))
print("Test R2 Score", r2_score(ytest, pred_test_xgb))
print("Train R2 Score", r2_score(ytrain, pred_train_xgb))

Parameters: { "oob_score" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Test RMSE: 2612.4777349158317
Test R2 Score 0.6775160242898224
Train R2 Score 0.8902571538709669


In [110]:
# tuned_params = [{'n_estimators': [100, 120, 150],
#                 'learning_rate': [0.1, 0.01, 0.001, 0.15, 0.015], 
#                 'gamma': [2, 3, 4, 5, 6],
#                 'max_depth': [2, 3, 4, 5, 6]}]

# xgb_model = XGBRegressor()
# grid = GridSearchCV(estimator=xgb_model, param_grid=tuned_params, cv=5)
# grid.fit(xtrain, ytrain)
# grid.best_params_

{'gamma': 2, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 150}

In [67]:
model_xgb = XGBRegressor(n_estimators=150, max_depth=6, learning_rate=0.1, gamma=2, oob_score=True, random_state=42)
model_xgb.fit(xtrain, ytrain) 

pred_test_xgb = model_xgb.predict(xtest)
pred_train_xgb = model_xgb.predict(xtrain)
print("Test RMSE:", np.sqrt(mean_squared_error(ytest,pred_test_xgb)))
print("Test R2 Score", r2_score(ytest, pred_test_xgb))
print("Train R2 Score", r2_score(ytrain, pred_train_xgb))

Parameters: { "oob_score" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Test RMSE: 2311.019686824723
Test R2 Score 0.7476459762390304
Train R2 Score 0.836380013825538


In [74]:
1 - np.sqrt(np.square(np.log10(pred_test_xgb +1) - np.log10(ytest +1)).mean())

0.910798239205572

In [68]:
from sklearn.metrics import mean_squared_log_error

In [71]:
ypred_test = model_xgb.predict(df1_test)
ypred_test_df = pd.DataFrame(ypred_test, columns=['Price'])

In [73]:
ypred_test_df.to_csv('ypred_test.csv')