In [18]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import TargetEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [19]:
# Load the raw flight-fare training data; the path is relative to the
# notebook's working directory.
df = pd.read_csv("Data_Train.csv")
# Preview the first rows to see the raw text formats of each column.
df.head()


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR ? DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU ? IXR ? BBI ? BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL ? LKO ? BOM ? COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU ? NAG ? BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR ? NAG ? DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [20]:
# Break the "HH:MM" departure time into numeric hour and minute features,
# then discard the raw text column.
dep_clock = pd.to_datetime(df["Dep_Time"], format="%H:%M")
df = df.assign(
    Dep_Hour=dep_clock.dt.hour,
    Dep_Min=dep_clock.dt.minute,
).drop(columns="Dep_Time")


In [21]:
# Arrival_Time is either "HH:MM" or "HH:MM DD Mon" when the arrival carries a
# date suffix (e.g. "01:10 22 Mar"); only the leading clock token is kept.
# The vectorised .str accessor replaces the per-row lambda and lets a missing
# value propagate as NaN/NaT instead of raising AttributeError.
arrival_clock = df["Arrival_Time"].str.split(" ", n=1).str[0]
df["Arrival_Time"] = pd.to_datetime(arrival_clock, format="%H:%M")

# Numeric hour/minute features derived from the parsed clock time.
df["Arrival_Hour"] = df["Arrival_Time"].dt.hour
df["Arrival_Min"] = df["Arrival_Time"].dt.minute

# The raw column is no longer needed once the features exist.
df = df.drop(columns="Arrival_Time")


In [22]:

# Parse the day-first journey date, keep day-of-month and month as numeric
# features, and remove the raw date column.
journey_date = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df = df.assign(
    Journey_Day=journey_date.dt.day,
    Journey_Month=journey_date.dt.month,
).drop(columns="Date_of_Journey")


In [23]:

def convert_duration(duration):
    """Convert a duration label such as "2h 50m" into total minutes.

    Parameters
    ----------
    duration : str
        Text made of an optional hour part ("<int>h") followed by an optional
        minute part ("<int>m"), e.g. "2h 50m", "19h" or "5m".  Whitespace
        between or around the parts is optional, so "2h50m" is accepted too.

    Returns
    -------
    int
        hours * 60 + minutes; a part that is absent counts as 0, so an empty
        string yields 0.

    Raises
    ------
    ValueError
        If a numeric part is not a valid integer, or if unrecognised text is
        left over after the hour/minute parts (e.g. "soon").
    """
    total_minutes = 0
    remainder = duration.strip()
    # Consume the hour part first; whatever follows the "h" is parsed next.
    if "h" in remainder:
        hours_text, remainder = remainder.split("h", 1)
        total_minutes += int(hours_text) * 60
    # int() tolerates surrounding whitespace, so " 50" and "50" both work.
    if "m" in remainder:
        minutes_text, remainder = remainder.split("m", 1)
        total_minutes += int(minutes_text)
    # Anything still unparsed means the label was malformed; fail loudly
    # rather than silently reporting a wrong (possibly zero) duration.
    if remainder.strip():
        raise ValueError(f"Unrecognised duration: {duration!r}")
    return total_minutes

# Replace each textual duration with its length in minutes.
df["Duration"] = df["Duration"].map(convert_duration)


In [24]:
# Encoder used for the text columns; created with default settings, so each
# distinct category in a column is replaced by a numeric code.
encoder = OrdinalEncoder()

In [25]:
# Text columns that are replaced by ordinal codes in one fit/transform pass.
# NOTE(review): the codes follow the encoder's own category ordering, so
# Total_Stops is presumably not ordered by number of stops — confirm whether
# that matters for the models trained on it.
ordinal_cols = ["Airline", "Source", "Destination", "Route", "Additional_Info", "Total_Stops"]
df[ordinal_cols] = encoder.fit_transform(df[ordinal_cols])

In [26]:
# Discard every row that still contains a missing value.
df = df.dropna()

In [27]:

# Separate the target (Price) from the feature columns.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [28]:

# Baseline model: a seeded decision tree with no depth or split constraints.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Hold-out metrics for the baseline, printed as "<label> <value>".
dt_metrics = {
    "Decision Tree R2:": r2_score(y_test, y_pred_dt),
    "MAE:": mean_absolute_error(y_test, y_pred_dt),
    "RMSE:": np.sqrt(mean_squared_error(y_test, y_pred_dt)),
}
for metric_label, metric_value in dt_metrics.items():
    print(metric_label, metric_value)


Decision Tree R2: 0.8005455942295916
MAE: 697.927546404617
RMSE: 2073.800166680427


In [29]:
# Check column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8545 entries, 10005 to 7270
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          8545 non-null   float64
 1   Source           8545 non-null   float64
 2   Destination      8545 non-null   float64
 3   Route            8545 non-null   float64
 4   Duration         8545 non-null   int64  
 5   Total_Stops      8545 non-null   float64
 6   Additional_Info  8545 non-null   float64
 7   Dep_Hour         8545 non-null   int32  
 8   Dep_Min          8545 non-null   int32  
 9   Arrival_Hour     8545 non-null   int32  
 10  Arrival_Min      8545 non-null   int32  
 11  Journey_Day      8545 non-null   int32  
 12  Journey_Month    8545 non-null   int32  
dtypes: float64(6), int32(6), int64(1)
memory usage: 734.3 KB


In [30]:
# Random forest with otherwise default hyper-parameters.
# random_state pins the forest's internal randomness so the scores printed
# below are reproducible from run to run (same seed as the decision tree and
# the train/test split); n_jobs=-1 builds the trees on all available cores
# and only affects how the work is scheduled.
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Hold-out metrics, same trio as reported for the decision tree.
print("Random Forest R2:", r2_score(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))


Random Forest R2: 0.8648405435913182
MAE: 634.055096753348
RMSE: 1707.1367290955595


In [31]:
# Exhaustive 5-fold search over tree-shape hyper-parameters for a seeded
# decision tree, ranked by R^2 and run on all available cores.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    dict(
        criterion=["squared_error", "absolute_error"],
        max_depth=[None, 5, 10, 15, 20],
        min_samples_leaf=[1, 3, 5, 7, 9],
        min_samples_split=[2, 4, 6, 8, 10],
    ),
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [32]:
# Re-check the training feature dtypes before setting up column-wise preprocessing.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8545 entries, 10005 to 7270
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          8545 non-null   float64
 1   Source           8545 non-null   float64
 2   Destination      8545 non-null   float64
 3   Route            8545 non-null   float64
 4   Duration         8545 non-null   int64  
 5   Total_Stops      8545 non-null   float64
 6   Additional_Info  8545 non-null   float64
 7   Dep_Hour         8545 non-null   int32  
 8   Dep_Min          8545 non-null   int32  
 9   Arrival_Hour     8545 non-null   int32  
 10  Arrival_Min      8545 non-null   int32  
 11  Journey_Day      8545 non-null   int32  
 12  Journey_Month    8545 non-null   int32  
dtypes: float64(6), int32(6), int64(1)
memory usage: 734.3 KB


In [33]:
# Display the encoded training features (long frames are shown abbreviated).
X_train

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min,Journey_Day,Journey_Month
10005,6.0,2.0,1.0,104.0,645,0.0,8.0,8,30,19,15,27,5
3684,4.0,2.0,1.0,104.0,1505,0.0,5.0,11,30,12,35,9,5
1034,8.0,2.0,1.0,121.0,380,0.0,8.0,15,45,22,5,24,4
3909,6.0,2.0,1.0,104.0,765,0.0,8.0,12,50,1,35,21,3
3088,1.0,2.0,1.0,110.0,1560,1.0,8.0,17,15,19,15,24,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,4.0,2.0,1.0,104.0,1165,0.0,8.0,9,0,4,25,27,3
5191,4.0,3.0,0.0,66.0,400,0.0,5.0,14,5,20,45,9,5
5390,6.0,2.0,1.0,104.0,760,0.0,8.0,12,50,1,30,15,5
860,3.0,0.0,5.0,18.0,165,4.0,8.0,0,40,3,25,3,3


In [34]:
# Partition the feature names by dtype for column-wise preprocessing.
# Every column of X_train is already numeric at this point (see the info()
# output above), so cat_cols comes out empty.
num_cols = X_train.select_dtypes(include=np.number).columns
cat_cols = X_train.select_dtypes(exclude=np.number).columns

In [35]:
# Preprocessing transformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
# Column-wise preprocessing: robust-scale the numeric columns, one-hot encode
# the non-numeric ones (unknown categories are ignored at transform time),
# and drop any column that belongs to neither group.
preprocessor = ColumnTransformer(
    [
        ("Scaler", RobustScaler(), num_cols),
        ("Encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols),
    ],
    remainder="drop",
)

In [36]:
# Preprocessing followed by a regularised random forest.
# The absolute-error criterion is considerably more expensive to train than
# the default squared error, so n_jobs=-1 spreads the 200 trees across all
# available cores (matching grid_search); with random_state fixed this only
# affects how the work is scheduled.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        (
            "model",
            RandomForestRegressor(
                criterion="absolute_error",
                n_estimators=200,
                max_depth=15,
                min_samples_leaf=5,
                min_samples_split=10,
                random_state=42,
                n_jobs=-1,
            ),
        ),
    ]
)

In [None]:
# Fit the preprocessing steps and the forest on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the training split (the pipeline delegates
# scoring to its final regressor) and display the value.
train_score=pipeline.score(X_train,y_train)
train_score

In [None]:
# Same score on the held-out split; a large gap to train_score would point
# to overfitting.
test_score=pipeline.score(X_test,y_test)
test_score

In [37]:
# grid_search.fit(X_train,y_train)

In [38]:
# grid_search.best_params_

In [39]:
# grid_search.best_estimator_

In [40]:
# rfr = RandomForestRegressor()

In [41]:
# rfr.score(X_train,y_train)

In [42]:
# rfr.score(X_test,y_test)