# Library Imports

In [1]:
import random
import numpy as np # linear algebra
random.seed(42) #For Reproducibility
np.random.seed(42)

In [None]:
# Included matplotlib.pyplot and seaborn for visualization (commented out below). However, in this notebook I only did feature extraction, preprocessing, and modeling of the dataset.

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import matplotlib.pyplot as plt # data visualization
# import seaborn as sns # statistical data visualization
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import os
import statistics
import warnings
warnings.filterwarnings('ignore')

# Data Imports and Exploration

Note that this import path is for my folder setup. Please change the path to wherever your Train, Test and Graph datasets are stored.

In [3]:
DATA_PATH = '/kaggle/input/yango-accra-mobility-dataset' # change data path to your drive

# Read the four competition CSV files from DATA_PATH, in this order:
# training rows, test rows, the sample submission and the road-graph table
train, test, samplesubmission, graph_df = [
    pd.read_csv(os.path.join(DATA_PATH, file_name))
    for file_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv', 'Graph.csv')
]

# Preview train dataset
train.head()

Unnamed: 0,ID,persistent_id,day,prediction_type,count_norm_00_0_,count_norm_00_1_,count_norm_00_2_,count_norm_00_3_,count_norm_01_0_,count_norm_01_1_,...,speed_avg_21_3_,speed_avg_22_0_,speed_avg_22_1_,speed_avg_22_2_,speed_avg_22_3_,speed_avg_23_0_,speed_avg_23_1_,speed_avg_23_2_,speed_avg_23_3_,target
0,5265971867368357888_X_lbo_weekday_X_morning_ru...,5265971867368357888,lbo_weekday,morning_rush_hour,0.000119,8.9e-05,8.9e-05,6.9e-05,2.9e-05,5.9e-05,...,,,,,,,,,,11.214111
1,17637398975726110720_X_other_holiday_X_morning...,17637398975726110720,other_holiday,morning_rush_hour,0.000554,0.000408,0.000417,0.000471,0.000427,0.000435,...,,,,,,,,,,18.486919
2,8568741015432481792_X_other_weekday_X_evening_...,8568741015432481792,other_weekday,evening_rush_hour,,,,,,,...,13.457778,11.028648,11.424647,14.729181,13.025704,,,,,9.772556
3,1464534661918074112_X_first_holiday_X_evening_...,1464534661918074112,first_holiday,evening_rush_hour,,,,,,,...,15.166433,13.408104,15.358508,16.056136,16.868352,,,,,14.378477
4,4278874665015895552_X_last_weekday_X_morning_r...,4278874665015895552,last_weekday,morning_rush_hour,2.1e-05,2e-05,,2.9e-05,,1e-05,...,,,,,,,,,,11.317734


### Let's create a function that looks up additional features from the Graph.csv file

In [4]:
def look_up_from_graph(train,graph,lookup_columns):
  """Left-join selected columns of the graph table onto ``train``.

  Rows are matched on the ``persistent_id`` column. Every row of ``train`` is
  kept; rows whose ``persistent_id`` has no match in ``graph`` get NaN in the
  looked-up columns. Neither input frame is modified.

  Parameters:
  train (DataFrame): Frame to enrich; must contain ``persistent_id``.
  graph (DataFrame): Lookup table; must contain ``persistent_id`` and every
      name in ``lookup_columns``.
  lookup_columns (list): Graph columns to bring over. ``persistent_id`` is
      added automatically when it is missing from the list.

  Returns:
  DataFrame: A new frame holding the columns of ``train`` followed by the
  looked-up graph columns.
  """
  key = "persistent_id"

  # The join key has to be part of the graph slice. Putting it first does not
  # change the order of the looked-up columns in the merged result.
  columns = [key] + [col for col in lookup_columns if col != key]

  # DataFrame.merge returns a new frame and leaves both inputs untouched, so
  # no defensive copies of `train` or `graph` are needed.
  # NOTE(review): if `graph` holds several rows per persistent_id, a left merge
  # repeats the matching `train` rows -- confirm the key is unique in graph.
  return train.merge(graph[columns], on=key, how="left")

"""
Save columns to lookup from the graph dataset, I found that these did not work well even though they seem very useful -- they lead to overfitting quite easily didn't have time to investigate why this happened.
The best model should include at least a subset of these
"""
# Graph columns to attach to every row; "persistent_id" is the key that look_up_from_graph joins on.
preliminary_columns = ["persistent_id","length", "speed_limit", "segments","category","is_residential", "traffic_side"]

# NOTE: this rebinds `train`, so re-running the cell merges the graph columns a second time.
# NOTE(review): the recorded train.head() output shows the looked-up columns as NaN for the
# first rows -- confirm that persistent_id values/dtype match between Train.csv and Graph.csv.
train = look_up_from_graph(train,graph_df,preliminary_columns)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30306 entries, 0 to 30305
Columns: 171 entries, ID to traffic_side
dtypes: float64(167), object(3), uint64(1)
memory usage: 39.5+ MB


In [5]:
train.head()

Unnamed: 0,ID,persistent_id,day,prediction_type,count_norm_00_0_,count_norm_00_1_,count_norm_00_2_,count_norm_00_3_,count_norm_01_0_,count_norm_01_1_,...,speed_avg_23_1_,speed_avg_23_2_,speed_avg_23_3_,target,length,speed_limit,segments,category,is_residential,traffic_side
0,5265971867368357888_X_lbo_weekday_X_morning_ru...,5265971867368357888,lbo_weekday,morning_rush_hour,0.000119,8.9e-05,8.9e-05,6.9e-05,2.9e-05,5.9e-05,...,,,,11.214111,,,,,,
1,17637398975726110720_X_other_holiday_X_morning...,17637398975726110720,other_holiday,morning_rush_hour,0.000554,0.000408,0.000417,0.000471,0.000427,0.000435,...,,,,18.486919,,,,,,
2,8568741015432481792_X_other_weekday_X_evening_...,8568741015432481792,other_weekday,evening_rush_hour,,,,,,,...,,,,9.772556,,,,,,
3,1464534661918074112_X_first_holiday_X_evening_...,1464534661918074112,first_holiday,evening_rush_hour,,,,,,,...,,,,14.378477,,,,,,
4,4278874665015895552_X_last_weekday_X_morning_r...,4278874665015895552,last_weekday,morning_rush_hour,2.1e-05,2e-05,,2.9e-05,,1e-05,...,,,,11.317734,,,,,,


## Preprocessing

### Let's create a function that picks all our categorical features, we will need this for LightGBM

In [6]:
def change_object_to_cat(df):
  """Return a copy of ``df`` with its object-dtype columns cast to ``category``.

  The input frame is left untouched.

  Returns:
  tuple: (converted DataFrame, list of the column names that were converted).
  """
  # Names of the columns whose dtype compares equal to "object" (typically strings)
  object_columns = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == "object"]

  # astype with a per-column mapping builds a new frame, so the caller's frame is not modified
  converted = df.astype({name: "category" for name in object_columns})

  return converted, object_columns


In [7]:
# Features: every column of `train` except the row ID, the persistent_id join key and the target.
# Target: the `target` column.
X =  train.drop(['ID', 'persistent_id', 'target'], axis = 1)
y = train.target

In [8]:
# Preview the first three rows of the feature matrix X
X.head(3)

Unnamed: 0,day,prediction_type,count_norm_00_0_,count_norm_00_1_,count_norm_00_2_,count_norm_00_3_,count_norm_01_0_,count_norm_01_1_,count_norm_01_2_,count_norm_01_3_,...,speed_avg_23_0_,speed_avg_23_1_,speed_avg_23_2_,speed_avg_23_3_,length,speed_limit,segments,category,is_residential,traffic_side
0,lbo_weekday,morning_rush_hour,0.000119,8.9e-05,8.9e-05,6.9e-05,2.9e-05,5.9e-05,9e-05,3e-05,...,,,,,,,,,,
1,other_holiday,morning_rush_hour,0.000554,0.000408,0.000417,0.000471,0.000427,0.000435,0.000281,0.000317,...,,,,,,,,,,
2,other_weekday,evening_rush_hour,,,,,,,,,...,,,,,,,,,,


In [9]:
# Convert object type columns to categorical and retrieve the list of categorical columns
X, cat_list = change_object_to_cat(X)

# Print DataFrame information (shows column data types and other details)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30306 entries, 0 to 30305
Columns: 168 entries, day to traffic_side
dtypes: category(2), float64(166)
memory usage: 38.4 MB


In [10]:
cat_list

['day', 'prediction_type']

## Training (With LightGBM and CatBoost)

In [11]:
# Hold out 20% of the rows (fixed random_state for reproducibility). The cross-validation
# cells below run on X_train / y_train only; the global X_test / y_test are not referenced
# again in the visible cells.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Let's define a function that trains and returns the model, will be useful for cross validation

In [12]:
## LightGBM

In [13]:
def lgbm_trainer(X_train, y_train, X_test, y_test, params, num_round, categorical):
    """
    Fit a LightGBM booster on (X_train, y_train) while monitoring (X_test, y_test).

    Both the training data and the validation data are evaluated during training.
    Training stops early once the validation score has not improved for 17 rounds,
    and progress is logged every 100 rounds.
    Returns the trained booster.
    """

    # Wrap both splits as LightGBM datasets; they share the training frame's
    # column names and the same list of categorical features
    fit_set = lgb.Dataset(
        X_train,
        y_train,
        feature_name=X_train.columns.tolist(),
        categorical_feature=categorical,
        free_raw_data=False,
    )
    holdout_set = lgb.Dataset(
        X_test,
        y_test,
        feature_name=X_train.columns.tolist(),
        categorical_feature=categorical,
        free_raw_data=False,
    )

    # Per-round metrics are recorded into this dict by the record_evaluation callback
    history = {}
    training_callbacks = [
        lgb.early_stopping(stopping_rounds=17),  # stop after 17 rounds without improvement
        lgb.log_evaluation(100),  # log every 100 rounds
        lgb.record_evaluation(history),  # keep the evaluation history
    ]

    return lgb.train(
        params,
        fit_set,
        num_round,
        valid_sets=[fit_set, holdout_set],
        callbacks=training_callbacks,
    )


In [14]:
# We start by defining default parameters and setting the objective metric
param = {"verbose": -100}  # Set verbosity to -100 to suppress detailed output
param['metric'] = 'rmse'   # Set the evaluation metric to RMSE (Root Mean Squared Error)

def cv_train_lgbm(X_train, y_train, params, num_rounds, category, n_splits=14, random_state=48):
    """
    Train one LightGBM model per fold of a shuffled K-fold split and report the mean validation RMSE.

    Parameters:
    X_train (DataFrame): The feature matrix for training (indexed positionally with .iloc).
    y_train (Series): The target values for training (indexed positionally with .iloc).
    params (dict): The parameters for training the LightGBM model.
    num_rounds (int): The maximum number of boosting iterations (rounds) per fold.
    category (list): Categorical feature names or indices, forwarded to lgbm_trainer.
    n_splits (int): Number of cross-validation folds (default 14).
    random_state (int): Seed for the shuffled fold assignment (default 48).

    Returns:
    tuple: The mean RMSE over the folds and the list of trained models (one per fold).
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    fold_rmses = []   # validation RMSE of each fold
    fold_models = []  # trained model of each fold

    for trn_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

        # Fit on the training part of the fold, early-stopping on the held-out part
        model = lgbm_trainer(X_tr, y_tr, X_val, y_val, params, num_rounds, category)
        fold_models.append(model)

        # Score the held-out part with the best iteration found by early stopping
        y_hat = model.predict(X_val, num_iteration=model.best_iteration)

        # RMSE = sqrt(MSE). The `squared=False` shortcut of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in later releases, so take the
        # square root explicitly to stay compatible with old and new versions.
        fold_rmses.append(np.sqrt(mean_squared_error(y_val, y_hat)))

    mean_rmse = statistics.mean(fold_rmses)
    print("Mean RMSE: {}".format(mean_rmse))

    return mean_rmse, fold_models


# Run the cross-validation function with the training data, parameters, and category list
lgbm_rmse, lgbm_models = cv_train_lgbm(X_train, y_train, param, 1000, cat_list)

# Print the final average RMSE
print(lgbm_rmse)


Training until validation scores don't improve for 17 rounds
[100]	training's rmse: 1.37102	valid_1's rmse: 1.60122
[200]	training's rmse: 1.18768	valid_1's rmse: 1.54985
[300]	training's rmse: 1.06045	valid_1's rmse: 1.52802
Early stopping, best iteration is:
[302]	training's rmse: 1.05925	valid_1's rmse: 1.52777
Training until validation scores don't improve for 17 rounds
[100]	training's rmse: 1.37023	valid_1's rmse: 1.5704
[200]	training's rmse: 1.19024	valid_1's rmse: 1.52823
[300]	training's rmse: 1.06044	valid_1's rmse: 1.50647
[400]	training's rmse: 0.965589	valid_1's rmse: 1.48891
Early stopping, best iteration is:
[477]	training's rmse: 0.901422	valid_1's rmse: 1.47492
Training until validation scores don't improve for 17 rounds
[100]	training's rmse: 1.36641	valid_1's rmse: 1.66811
[200]	training's rmse: 1.17929	valid_1's rmse: 1.63107
[300]	training's rmse: 1.05447	valid_1's rmse: 1.60698
[400]	training's rmse: 0.959537	valid_1's rmse: 1.59688
Early stopping, best iteration

In [15]:
# Mean RMSE: 1.5841566980352786

In [16]:
#CatBoost

In [17]:
def catboost_trainer(X_train, y_train, X_val, y_val, num_rounds, categorical):
    """
    Fit a CatBoost regressor on (X_train, y_train), using (X_val, y_val) as the evaluation set.

    The model optimizes RMSE with a fixed random_state of 48, stops once the
    evaluation score has not improved for 17 iterations, and logs progress every
    100 iterations.
    Returns the trained CatBoost model.
    """

    # Pool is CatBoost's data container; both pools declare the same categorical features
    train_pool = Pool(X_train, y_train, cat_features=categorical)
    validation_pool = Pool(X_val, y_val, cat_features=categorical)

    # Model settings: at most `num_rounds` boosting iterations, RMSE loss, fixed seed
    regressor_settings = {
        "iterations": num_rounds,
        "cat_features": categorical,
        "loss_function": 'RMSE',
        "random_state": 48,
    }
    regressor = CatBoostRegressor(**regressor_settings)

    # Train on the training pool while monitoring the validation pool
    regressor.fit(
        train_pool,
        eval_set=validation_pool,
        early_stopping_rounds=17,
        verbose=100,
    )

    return regressor


In [18]:
# NOTE: this repeats the LightGBM-style settings dict from the LightGBM cell. It is passed to
# cv_train_cat below, but that function never reads its `params` argument, so these values
# do not influence the CatBoost models.
param = {"verbose": -100}  # LightGBM-style verbosity setting
param['metric'] = 'rmse'  # LightGBM-style metric name (RMSE, Root Mean Squared Error)

def cv_train_cat(X_train, y_train, params, num_rounds, cat_list, n_splits=14, random_state=48):
    """
    Train one CatBoost model per fold of a shuffled K-fold split and report the mean validation RMSE.

    Parameters:
    X_train (DataFrame): The feature matrix for training (indexed positionally with .iloc).
    y_train (Series): The target values for training (indexed positionally with .iloc).
    params (dict): Not used. Kept so the signature mirrors cv_train_lgbm;
        catboost_trainer builds the model from its own fixed settings.
    num_rounds (int): The maximum number of boosting iterations (rounds) per fold.
    cat_list (list): A list of categorical feature names or indices.
    n_splits (int): Number of cross-validation folds (default 14).
    random_state (int): Seed for the shuffled fold assignment (default 48).

    Returns:
    tuple: The mean RMSE over the folds and the list of trained models (one per fold).
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    fold_rmses = []   # validation RMSE of each fold
    fold_models = []  # trained model of each fold

    for trn_idx, val_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

        # Fit on the training part of the fold, early-stopping on the held-out part
        model = catboost_trainer(X_tr, y_tr, X_val, y_val, num_rounds, cat_list)
        fold_models.append(model)

        # Score the held-out part of the fold
        y_hat = model.predict(X_val)

        # RMSE = sqrt(MSE). The `squared=False` shortcut of mean_squared_error was
        # deprecated in scikit-learn 1.4 and removed in later releases, so take the
        # square root explicitly to stay compatible with old and new versions.
        fold_rmses.append(np.sqrt(mean_squared_error(y_val, y_hat)))

    mean_rmse = statistics.mean(fold_rmses)
    print("Mean RMSE: {}".format(mean_rmse))
    return mean_rmse, fold_models

# X_train, y_train, and cat_list are already defined
cat_rmse, cat_models = cv_train_cat(X_train, y_train, param, 1000, cat_list)
print(cat_rmse)  # Print the final mean RMSE


Learning rate set to 0.083028
0:	learn: 4.4527965	test: 4.3296759	best: 4.3296759 (0)	total: 96.1ms	remaining: 1m 36s
100:	learn: 1.7615857	test: 1.7706251	best: 1.7706251 (100)	total: 3.37s	remaining: 30s
200:	learn: 1.6128713	test: 1.6657918	best: 1.6657918 (200)	total: 6.68s	remaining: 26.5s
300:	learn: 1.5223220	test: 1.6145569	best: 1.6145428 (299)	total: 9.93s	remaining: 23.1s
400:	learn: 1.4500745	test: 1.5816070	best: 1.5815207 (399)	total: 13.2s	remaining: 19.7s
500:	learn: 1.3935976	test: 1.5569435	best: 1.5569435 (500)	total: 16.5s	remaining: 16.5s
600:	learn: 1.3442354	test: 1.5385640	best: 1.5381388 (598)	total: 19.8s	remaining: 13.2s
700:	learn: 1.3020417	test: 1.5231324	best: 1.5231324 (700)	total: 23.1s	remaining: 9.84s
800:	learn: 1.2642206	test: 1.5160315	best: 1.5160315 (800)	total: 26.4s	remaining: 6.56s
Stopped by overfitting detector  (17 iterations wait)

bestTest = 1.514562392
bestIteration = 806

Shrink model to first 807 iterations.
Learning rate set to 0.0830

In [19]:
#Mean RMSE: 1.5989421099589922

## Create ensemble model and Predict on Test Set

In [20]:
# Reload the raw test set from the competition data folder
DATA_PATH = '/kaggle/input/yango-accra-mobility-dataset'
test = pd.read_csv(os.path.join(DATA_PATH, 'Test.csv'))

# Give the test rows the same treatment as the training features:
# graph lookup, same columns in the same order, object columns cast to category
test_df = look_up_from_graph(test.copy(), graph_df, preliminary_columns)
test_df = test_df[X_train.columns.tolist()]
test_df, _ = change_object_to_cat(test_df)
pred_df = test_df.copy()

# --- Prediction on the Test Set ---
# One prediction column per LightGBM fold model; use the early-stopped
# iteration when the booster reports one
for fold, booster in enumerate(lgbm_models):
    lgbm_column = "pred_lgbm_{}".format(fold)
    best_iteration = getattr(booster, 'best_iteration', None)
    if best_iteration:
        pred_df[lgbm_column] = booster.predict(test_df, num_iteration=best_iteration)
    else:
        pred_df[lgbm_column] = booster.predict(test_df)

# One prediction column per CatBoost fold model
for fold, regressor in enumerate(cat_models):
    pred_df["pred_catboost_{}".format(fold)] = regressor.predict(test_df)

# --- Ensemble the Predictions (Mean and Median) ---
# Collect the per-model predictions of both libraries side by side
lgbm_preds = pred_df.filter(like='pred_lgbm').values
catboost_preds = pred_df.filter(like='pred_catboost').values
combined_preds = pd.concat([pd.DataFrame(lgbm_preds), pd.DataFrame(catboost_preds)], axis=1)

# Row-wise mean and median over all fold models, appended to a copy of pred_df
sub_df = pred_df.assign(
    mean_pred=combined_preds.mean(axis=1),
    median_pred=combined_preds.median(axis=1),
)

In [21]:
# --- Create a Submission File ---
# ID and target are paired by row index, so both frames must have the same number of rows.
# A graph lookup that repeats rows would otherwise produce a misaligned file without any error.
assert len(sub_df) == len(test), "prediction rows do not match the rows of Test.csv"

sub_file = pd.DataFrame({'ID': test.ID, 'target': sub_df.mean_pred})  # Choose mean or median here
sub_file.to_csv('lgbm_catboost_ensemble_mean_submission14.csv', index=False)
sub_file.head()

Unnamed: 0,ID,target
0,12170463379909937152_X_last_weekday_X_morning_...,14.294417
1,12462211467129128960_X_lbo_weekday_X_evening_r...,8.897416
2,17199689856168204288_X_last_weekday_X_evening_...,16.01305
3,12559239482074976256_X_second_weekday_X_evenin...,12.293271
4,3456135199470793216_X_other_holiday_X_evening_...,13.436907


# Thank You!