In [58]:
import numpy as np
import pandas as pd

# Data Loading

In [97]:
try:
  train_df = pd.read_csv("/content/train.csv")
  test_df = pd.read_csv("/content/test_8gqdJqH.csv")
  transaction_df = pd.read_csv("/content/transactions.csv")

except FileNotFoundError:
  print("Please make sure train.csv, test.csv, and transactions.csv are in the same directory.")

print("Data loaded sucessfully")
print(train_df.shape)
print(test_df.shape)
print(transaction_df.shape)

Data loaded sucessfully
(67200, 4)
(5900, 4)
(2266100, 11)


In [98]:
# train_df
train_df.head()

Unnamed: 0,doj,srcid,destid,final_seatcount
0,2023-03-01,45,46,2838.0
1,2023-03-01,46,45,2298.0
2,2023-03-01,45,47,2720.0
3,2023-03-01,47,45,2580.0
4,2023-03-01,46,9,4185.0


In [99]:
# test_df
test_df.head()

Unnamed: 0,route_key,doj,srcid,destid
0,2025-02-11_46_45,2025-02-11,46,45
1,2025-01-20_17_23,2025-01-20,17,23
2,2025-01-08_02_14,2025-01-08,2,14
3,2025-01-08_08_47,2025-01-08,8,47
4,2025-01-08_09_46,2025-01-08,9,46


In [100]:
# transaction_df
transaction_df.head()

Unnamed: 0,doj,doi,srcid,destid,srcid_region,destid_region,srcid_tier,destid_tier,cumsum_seatcount,cumsum_searchcount,dbd
0,2023-03-01,2023-01-30,45,46,Karnataka,Tamil Nadu,Tier 1,Tier 1,8.0,76.0,30
1,2023-03-01,2023-01-30,46,45,Tamil Nadu,Karnataka,Tier 1,Tier 1,8.0,70.0,30
2,2023-03-01,2023-01-30,45,47,Karnataka,Andhra Pradesh,Tier 1,Tier 1,4.0,142.0,30
3,2023-03-01,2023-01-30,47,45,Andhra Pradesh,Karnataka,Tier 1,Tier 1,0.0,68.0,30
4,2023-03-01,2023-01-30,46,9,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,9.0,162.0,30


# Data Exploration and Preprocessing

In [101]:
# train_df
train_df.info()
print("Date of Journey (doj) is an object.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67200 entries, 0 to 67199
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   doj              67200 non-null  object 
 1   srcid            67200 non-null  int64  
 2   destid           67200 non-null  int64  
 3   final_seatcount  67200 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 2.1+ MB
Date of Journey (doj) is an object.


In [102]:
# converting "doj" in train_df to "DateTime"
train_df["doj"] = pd.to_datetime(train_df["doj"])

In [103]:
# test_df
test_df.info()
print("Date of Journey (doj) is an object.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5900 entries, 0 to 5899
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   route_key  5900 non-null   object
 1   doj        5900 non-null   object
 2   srcid      5900 non-null   int64 
 3   destid     5900 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 184.5+ KB
Date of Journey (doj) is an object.


In [104]:
# converting "doj" in test_df to "DateTime"
test_df["doj"] = pd.to_datetime(test_df["doj"])

In [105]:
# transaction_df
transaction_df.info()
print(transaction_df.isna().sum())
print("Date of Journey (doj) and Date of Issue (doi) are both an object.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2266100 entries, 0 to 2266099
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   doj                 object 
 1   doi                 object 
 2   srcid               int64  
 3   destid              int64  
 4   srcid_region        object 
 5   destid_region       object 
 6   srcid_tier          object 
 7   destid_tier         object 
 8   cumsum_seatcount    float64
 9   cumsum_searchcount  float64
 10  dbd                 int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 190.2+ MB
doj                   0
doi                   0
srcid                 0
destid                0
srcid_region          0
destid_region         0
srcid_tier            0
destid_tier           0
cumsum_seatcount      0
cumsum_searchcount    0
dbd                   0
dtype: int64
Date of Journey (doj) and Date of Issue (doi) are both an object.


In [106]:
# converting "doj" and "doi" in transaction_df to "DateTime"
transaction_df["doj"] = pd.to_datetime(transaction_df["doj"])
transaction_df["doi"] = pd.to_datetime(transaction_df["doi"])

In [107]:
# filtering the bookings at dbd=15 or earlier
transactions_at_15 = transaction_df[transaction_df["dbd"] >= 15].sort_values("dbd", ascending= True)
transactions_at_15

Unnamed: 0,doj,doi,srcid,destid,srcid_region,destid_region,srcid_tier,destid_tier,cumsum_seatcount,cumsum_searchcount,dbd
1709604,2024-09-02,2024-08-18,46,9,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,30.0,1095.0,15
156509,2023-04-20,2023-04-05,9,45,Tamil Nadu,Karnataka,Tier2,Tier 1,27.0,609.0,15
156510,2023-04-20,2023-04-05,47,5,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier2,12.0,1500.0,15
156511,2023-04-20,2023-04-05,5,47,Andhra Pradesh,Andhra Pradesh,Tier2,Tier 1,6.0,246.0,15
156512,2023-04-20,2023-04-05,46,44,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,36.0,1232.0,15
...,...,...,...,...,...,...,...,...,...,...,...
3,2023-03-01,2023-01-30,47,45,Andhra Pradesh,Karnataka,Tier 1,Tier 1,0.0,68.0,30
2,2023-03-01,2023-01-30,45,47,Karnataka,Andhra Pradesh,Tier 1,Tier 1,4.0,142.0,30
17,2023-03-01,2023-01-30,2,40,Maharashtra and Goa,Maharashtra and Goa,Tier 1,Tier 1,0.0,152.0,30
16,2023-03-01,2023-01-30,44,46,Tamil Nadu,Tamil Nadu,Tier2,Tier 1,0.0,32.0,30


In [108]:
# keeping the first entry for each journey
info_at_15_days = transactions_at_15.drop_duplicates(subset= ["doj", "srcid", "destid"], keep= "first")
info_at_15_days

Unnamed: 0,doj,doi,srcid,destid,srcid_region,destid_region,srcid_tier,destid_tier,cumsum_seatcount,cumsum_searchcount,dbd
1709604,2024-09-02,2024-08-18,46,9,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,30.0,1095.0,15
156509,2023-04-20,2023-04-05,9,45,Tamil Nadu,Karnataka,Tier2,Tier 1,27.0,609.0,15
156510,2023-04-20,2023-04-05,47,5,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier2,12.0,1500.0,15
156511,2023-04-20,2023-04-05,5,47,Andhra Pradesh,Andhra Pradesh,Tier2,Tier 1,6.0,246.0,15
156512,2023-04-20,2023-04-05,46,44,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,36.0,1232.0,15
...,...,...,...,...,...,...,...,...,...,...,...
1260127,2024-04-10,2024-03-26,47,43,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier 1,4.0,1116.0,15
1260128,2024-04-10,2024-03-26,13,23,East 1,East 1,Tier2,Tier 1,16.0,312.0,15
1260129,2024-04-10,2024-03-26,36,7,Delhi,Rest of North,Tier2,Tier2,12.0,1496.0,15
1260130,2024-04-10,2024-03-26,43,47,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier 1,12.0,856.0,15


In [109]:
# renaming columns to be more specific
info_at_15_days = info_at_15_days.rename(columns= {
    "cumsum_seatcount": "seats_at_15_days",
    "cumsum_searchcount": "searches_at_15_days",
    "dbd": "dbd_at_prediction"
})

info_at_15_days

Unnamed: 0,doj,doi,srcid,destid,srcid_region,destid_region,srcid_tier,destid_tier,seats_at_15_days,searches_at_15_days,dbd_at_prediction
1709604,2024-09-02,2024-08-18,46,9,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,30.0,1095.0,15
156509,2023-04-20,2023-04-05,9,45,Tamil Nadu,Karnataka,Tier2,Tier 1,27.0,609.0,15
156510,2023-04-20,2023-04-05,47,5,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier2,12.0,1500.0,15
156511,2023-04-20,2023-04-05,5,47,Andhra Pradesh,Andhra Pradesh,Tier2,Tier 1,6.0,246.0,15
156512,2023-04-20,2023-04-05,46,44,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,36.0,1232.0,15
...,...,...,...,...,...,...,...,...,...,...,...
1260127,2024-04-10,2024-03-26,47,43,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier 1,4.0,1116.0,15
1260128,2024-04-10,2024-03-26,13,23,East 1,East 1,Tier2,Tier 1,16.0,312.0,15
1260129,2024-04-10,2024-03-26,36,7,Delhi,Rest of North,Tier2,Tier2,12.0,1496.0,15
1260130,2024-04-10,2024-03-26,43,47,Andhra Pradesh,Andhra Pradesh,Tier 1,Tier 1,12.0,856.0,15


In [110]:
print("Shape of information at 15 days:", info_at_15_days.shape)

Shape of information at 15 days: (73100, 11)


In [111]:
# merging "info_at_15_days" with "test_df" and "train_df"
train_merged = pd.merge(train_df, info_at_15_days, on= ["doj", "srcid", "destid"], how= "left")
test_merged = pd.merge(test_df, info_at_15_days, on= ["doj", "srcid", "destid"], how= "left")

In [112]:
print("Train Merged Shape:", train_merged.shape)
print("Test Merged Shape:", test_merged.shape)

Train Merged Shape: (67200, 12)
Test Merged Shape: (5900, 12)


In [113]:
test_merged.head()

Unnamed: 0,route_key,doj,srcid,destid,doi,srcid_region,destid_region,srcid_tier,destid_tier,seats_at_15_days,searches_at_15_days,dbd_at_prediction
0,2025-02-11_46_45,2025-02-11,46,45,2025-01-27,Tamil Nadu,Karnataka,Tier 1,Tier 1,38.0,1082.0,15
1,2025-01-20_17_23,2025-01-20,17,23,2025-01-05,East 1,East 1,Tier2,Tier 1,0.0,1175.0,15
2,2025-01-08_02_14,2025-01-08,2,14,2024-12-24,Maharashtra and Goa,Maharashtra and Goa,Tier 1,Tier2,0.0,370.0,15
3,2025-01-08_08_47,2025-01-08,8,47,2024-12-24,Andhra Pradesh,Andhra Pradesh,Tier2,Tier 1,0.0,120.0,15
4,2025-01-08_09_46,2025-01-08,9,46,2024-12-24,Tamil Nadu,Tamil Nadu,Tier2,Tier 1,39.0,1230.0,15


In [114]:
train_merged.head()

Unnamed: 0,doj,srcid,destid,final_seatcount,doi,srcid_region,destid_region,srcid_tier,destid_tier,seats_at_15_days,searches_at_15_days,dbd_at_prediction
0,2023-03-01,45,46,2838.0,2023-02-14,Karnataka,Tamil Nadu,Tier 1,Tier 1,16.0,480.0,15
1,2023-03-01,46,45,2298.0,2023-02-14,Tamil Nadu,Karnataka,Tier 1,Tier 1,34.0,352.0,15
2,2023-03-01,45,47,2720.0,2023-02-14,Karnataka,Andhra Pradesh,Tier 1,Tier 1,36.0,892.0,15
3,2023-03-01,47,45,2580.0,2023-02-14,Andhra Pradesh,Karnataka,Tier 1,Tier 1,18.0,1130.0,15
4,2023-03-01,46,9,4185.0,2023-02-14,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,48.0,1023.0,15


# Feature Engineering

In [115]:
def feature_engineer(df):

  df_copy = df.copy()

  # EXTERNAL DATA
  holiday_dates = [
    # 2023 Holidays
    "2023-01-01",
    "2023-03-08",
    "2023-04-22",
    "2023-06-29",
    "2023-11-12",

    # 2024 Holidays
    "2024-01-01",
    "2024-03-25",
    "2024-04-10",
    "2024-06-17",
    "2024-11-01",

    # 2025 Holidays
    "2025-01-01",
    "2025-03-14",
    "2025-03-31",
    "2025-06-07",
    "2025-10-20",
]

  holiday_dates = pd.to_datetime(holiday_dates)

  # DATE BASED FEATURE
  df_copy["doj_dayofweek"] = df_copy["doj"].dt.dayofweek
  df_copy["doj_dayofyear"] = df_copy["doj"].dt.dayofyear
  df_copy['doj_weekofyear'] = df_copy['doj'].dt.isocalendar().week.astype(int)
  df_copy["doj_month"] = df_copy["doj"].dt.month
  df_copy["doj_year"] = df_copy["doj"].dt.year
  df_copy["is_weekend"] = (df_copy["doj_dayofweek"] >= 5).astype(int)
  df_copy["is_holiday"] = df_copy["doj"].isin(holiday_dates).astype(int)
  df_copy["is_holiday_near"] = (df_copy["doj"].isin(holiday_dates + pd.Timedelta(days= 1)) | df_copy["doj"].isin(holiday_dates - pd.Timedelta(days= 1))).astype(int)

  # ROUTE BASED FEATURE
  # to identify the route of journey
  df_copy["route"] = df_copy["srcid"].astype(str) + "_" + df_copy["destid"].astype(str)
  # to identify if the journey was inter or intra region/tier
  df_copy["is_inter_region"] = (df_copy["srcid_region"] != df_copy["destid_region"]).astype(int)
  df_copy["is_inter_tier"] = (df_copy["srcid_tier"] != df_copy["destid_tier"]).astype(int)

  # SEARCH BASED FEATURE
  # to identify the strength of interaction
  df_copy["search_to_book_ratio"] = df_copy["seats_at_15_days"] / (df_copy["searches_at_15_days"] + 1)

  # filling Nan values created by join
  columns = ["seats_at_15_days", "searches_at_15_days", "search_to_book_ratio"]
  for col in columns:
    if col in df_copy.columns:
      df_copy[col] = df_copy[col].fillna(0)

  return df_copy

In [116]:
test_featured = feature_engineer(test_merged)
train_featured = feature_engineer(train_merged)

In [117]:
test_featured.head()

Unnamed: 0,route_key,doj,srcid,destid,doi,srcid_region,destid_region,srcid_tier,destid_tier,seats_at_15_days,...,doj_weekofyear,doj_month,doj_year,is_weekend,is_holiday,is_holiday_near,route,is_inter_region,is_inter_tier,search_to_book_ratio
0,2025-02-11_46_45,2025-02-11,46,45,2025-01-27,Tamil Nadu,Karnataka,Tier 1,Tier 1,38.0,...,7,2,2025,0,0,0,46_45,1,0,0.035088
1,2025-01-20_17_23,2025-01-20,17,23,2025-01-05,East 1,East 1,Tier2,Tier 1,0.0,...,4,1,2025,0,0,0,17_23,0,1,0.0
2,2025-01-08_02_14,2025-01-08,2,14,2024-12-24,Maharashtra and Goa,Maharashtra and Goa,Tier 1,Tier2,0.0,...,2,1,2025,0,0,0,2_14,0,1,0.0
3,2025-01-08_08_47,2025-01-08,8,47,2024-12-24,Andhra Pradesh,Andhra Pradesh,Tier2,Tier 1,0.0,...,2,1,2025,0,0,0,8_47,0,1,0.0
4,2025-01-08_09_46,2025-01-08,9,46,2024-12-24,Tamil Nadu,Tamil Nadu,Tier2,Tier 1,39.0,...,2,1,2025,0,0,0,9_46,0,1,0.031682


In [118]:
train_featured.head()

Unnamed: 0,doj,srcid,destid,final_seatcount,doi,srcid_region,destid_region,srcid_tier,destid_tier,seats_at_15_days,...,doj_weekofyear,doj_month,doj_year,is_weekend,is_holiday,is_holiday_near,route,is_inter_region,is_inter_tier,search_to_book_ratio
0,2023-03-01,45,46,2838.0,2023-02-14,Karnataka,Tamil Nadu,Tier 1,Tier 1,16.0,...,9,3,2023,0,0,0,45_46,1,0,0.033264
1,2023-03-01,46,45,2298.0,2023-02-14,Tamil Nadu,Karnataka,Tier 1,Tier 1,34.0,...,9,3,2023,0,0,0,46_45,1,0,0.096317
2,2023-03-01,45,47,2720.0,2023-02-14,Karnataka,Andhra Pradesh,Tier 1,Tier 1,36.0,...,9,3,2023,0,0,0,45_47,1,0,0.040314
3,2023-03-01,47,45,2580.0,2023-02-14,Andhra Pradesh,Karnataka,Tier 1,Tier 1,18.0,...,9,3,2023,0,0,0,47_45,1,0,0.015915
4,2023-03-01,46,9,4185.0,2023-02-14,Tamil Nadu,Tamil Nadu,Tier 1,Tier2,48.0,...,9,3,2023,0,0,0,46_9,0,1,0.046875


In [119]:
# one-hot encoding the categorical features in "test_featured" and "test_featured"
categorical_cols = ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']

test_final = pd.get_dummies(test_featured, columns= categorical_cols, dummy_na= False)
train_final = pd.get_dummies(train_featured, columns= categorical_cols, dummy_na= False)

In [120]:
test_final.head()

Unnamed: 0,route_key,doj,srcid,destid,doi,seats_at_15_days,searches_at_15_days,dbd_at_prediction,doj_dayofweek,doj_dayofyear,...,destid_region_Rest of North,destid_region_Tamil Nadu,srcid_tier_Tier 1,srcid_tier_Tier 3,srcid_tier_Tier 4,srcid_tier_Tier2,destid_tier_Tier 1,destid_tier_Tier 3,destid_tier_Tier 4,destid_tier_Tier2
0,2025-02-11_46_45,2025-02-11,46,45,2025-01-27,38.0,1082.0,15,1,42,...,False,False,True,False,False,False,True,False,False,False
1,2025-01-20_17_23,2025-01-20,17,23,2025-01-05,0.0,1175.0,15,0,20,...,False,False,False,False,False,True,True,False,False,False
2,2025-01-08_02_14,2025-01-08,2,14,2024-12-24,0.0,370.0,15,2,8,...,False,False,True,False,False,False,False,False,False,True
3,2025-01-08_08_47,2025-01-08,8,47,2024-12-24,0.0,120.0,15,2,8,...,False,False,False,False,False,True,True,False,False,False
4,2025-01-08_09_46,2025-01-08,9,46,2024-12-24,39.0,1230.0,15,2,8,...,False,True,False,False,False,True,True,False,False,False


In [121]:
train_final.head()

Unnamed: 0,doj,srcid,destid,final_seatcount,doi,seats_at_15_days,searches_at_15_days,dbd_at_prediction,doj_dayofweek,doj_dayofyear,...,destid_region_Rest of North,destid_region_Tamil Nadu,srcid_tier_Tier 1,srcid_tier_Tier 3,srcid_tier_Tier 4,srcid_tier_Tier2,destid_tier_Tier 1,destid_tier_Tier 3,destid_tier_Tier 4,destid_tier_Tier2
0,2023-03-01,45,46,2838.0,2023-02-14,16.0,480.0,15,2,60,...,False,True,True,False,False,False,True,False,False,False
1,2023-03-01,46,45,2298.0,2023-02-14,34.0,352.0,15,2,60,...,False,False,True,False,False,False,True,False,False,False
2,2023-03-01,45,47,2720.0,2023-02-14,36.0,892.0,15,2,60,...,False,False,True,False,False,False,True,False,False,False
3,2023-03-01,47,45,2580.0,2023-02-14,18.0,1130.0,15,2,60,...,False,False,True,False,False,False,True,False,False,False
4,2023-03-01,46,9,4185.0,2023-02-14,48.0,1023.0,15,2,60,...,False,True,True,False,False,False,False,False,False,True


In [133]:
print("Train Final Shape:", train_final.shape)
print("Test Final Shape:", test_final.shape)

Train Final Shape: (67200, 51)
Test Final Shape: (5900, 55)


# Model Training and Evaluation

In [134]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

print("--Sorting data by date--")
train_sorted = train_final.sort_values(by='doj').reset_index(drop=True)


print("--Creating the expanding average feature--")

# creating helper columns for the calculation
train_sorted['cumsum_seats'] = train_sorted.groupby('route')['final_seatcount'].cumsum()
train_sorted['cumcount_journeys'] = train_sorted.groupby('route').cumcount() + 1

# calculating the expanding average using the .shift(1) trick to avoid leakage
train_sorted["expanding_route_avg"] = (train_sorted["cumsum_seats"].shift(1)) / (train_sorted["cumcount_journeys"].shift(1))

# filling empty values for the first time a route appears
global_average = train_sorted["final_seatcount"].mean()
train_sorted["expanding_route_avg"].fillna(global_average, inplace=True)
print("--Feature created--")


# for test data, we're use the final average from the entire training history
final_route_averages = train_sorted.groupby("route")["final_seatcount"].mean()
test_final["expanding_route_avg"] = test_final["route"].map(final_route_averages)
test_final["expanding_route_avg"].fillna(global_average, inplace=True)

columns_to_drop = ["doj", "doi", "final_seatcount", "route_key", "route", "cumsum_seats", "cumcount_journeys"]

features = [col for col in train_sorted.columns if col not in columns_to_drop]

X = train_sorted[features]
y = train_sorted['final_seatcount']


# time-based training and validation split
print("--Splitting data into training and validation sets--")

# we will keep first 80% is for training, the last 20% is for validation.
split_point = int(len(train_sorted) * 0.8)

# Training set
X_train = X.iloc[:split_point]
y_train = y.iloc[:split_point]

# Validation set
X_val = X.iloc[split_point:]
y_val = y.iloc[split_point:]


# model
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)

print("--Training the model--")
model.fit(X_train, y_train)
print("--Training complete--")



predictions = model.predict(X_val)

# Root Mean Squared Error (RMSE) to see the average error
rmse = np.sqrt(mean_squared_error(y_val, predictions))
print(f"The model's average error on the validation set (RMSE) is: {rmse:.2f} seats")

--Sorting data by date--
--Creating the expanding average feature--
Feature created

Splitting data into training and validation sets...
Training the model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_sorted["expanding_route_avg"].fillna(global_average, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_final["expanding_route_avg"].fillna(global_average, inplace=True)


Training complete!

The model's average error on the validation set (RMSE) is: 605.59 seats


In [135]:
print("Making final predictions for the submission file.")

final_predictions = rfr_model.predict(X_test_val)

final_predictions = np.round(final_predictions).astype(int)
final_predictions[final_predictions < 0] = 0

submission_df = pd.DataFrame({
    "route_key": test_df["route_key"],
    "final_seatcount": final_predictions
})

submission_df.to_csv("submission.csv", index=False)

Making final predictions for the submission file.
