In [14]:
#Load data

import pandas as pd

df = pd.read_csv("Clean_Dataset.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [15]:
#Feature selection

features = [
    "airline",
    "source_city",
    "destination_city",
    "stops",
    "class",
    "duration",
    "days_left"
]
target = "price"

X = df[features].copy()
y = df[target]


In [16]:
#Encode categorical features

from sklearn.preprocessing import LabelEncoder

encoders = {}
cat_cols = ["airline", "source_city", "destination_city", "stops", "class"]

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le


In [17]:
#Time-aware train split

train_mask = df["days_left"] > 10

X_train = X[train_mask]
y_train = y[train_mask]

X_test = X[~train_mask]
y_test = y[~train_mask]


In [18]:


!pip install lightgbm



In [19]:
#Train LightGBM model

import lightgbm as lgb

model = lgb.LGBMRegressor(
    objective="regression_l1",
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 318
[LightGBM] [Info] Number of data points in the train set: 250786, number of used features: 7
[LightGBM] [Info] Start training from score 6439.000000


LGBMRegressor(colsample_bytree=0.8, learning_rate=0.05, n_estimators=600,
              objective='regression_l1', random_state=42, subsample=0.8)

In [20]:
#Evaluate

from sklearn.metrics import mean_absolute_error

preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)

print("MAE:", mae)


MAE: 4092.3699140314266


In [21]:
#Buy/wait logic

def buy_wait(curr, fut, threshold=0.05):
    if fut < curr * (1 - threshold):
        return "WAIT"
    elif fut > curr * (1 + threshold):
        return "BUY"
    return "STABLE"

sample = X_test.iloc[0].copy()
price_now = model.predict(pd.DataFrame([sample]))[0]

sample["days_left"] -= 7
price_future = model.predict(pd.DataFrame([sample]))[0]

print(price_now, price_future, buy_wait(price_now, price_future))


5978.755005856992 5978.755005856992 STABLE


In [22]:
#save model encoders

import pickle

with open("lightgbm_price_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("feature_encoders.pkl", "wb") as f:
    pickle.dump(encoders, f)

print("Saved successfully")


Saved successfully
