In [None]:
!pip install pycaret[full]

In [977]:
import numpy as np
import pandas as pd

from pycaret.regression import *
from pycaret.utils import *
from sklearn.metrics import mean_squared_error
import datetime

In [978]:
FEATURE_COLUMNS = ["cluster_start", "cluster_end", "humidity", "visibility", "windspeedKmph", "tempC", "is_rush_hour", "is_work_hour", "is_night_time", "is_late_night_time"]
CATEGORICAL_COLUMNS = ["weatherStatus"]
USELESS_COLUMNS = ['node_start', 'node_finish', 'lon_start', 'lon_end', 'lat_end', 'lat_start', 'average_speed']

In [979]:
dataset = pd.read_csv('datasets/big-table-train.csv')
dataset = dataset.drop(columns=[*USELESS_COLUMNS, 'completed_time'])
dataset

Unnamed: 0,Id,running_time,route_distance_km,delta_time,distance,speed,cluster_start,cluster_end,is_work_hour,is_night_time,is_late_night_time,is_rush_hour,humidity,visibility,windspeedKmph,tempC,weatherStatus
0,-4773019581999572651,2022-01-24 18:30:21,3.740,862.0,31.771489,30.0,5,83,1,0,0,1,87,10,18,-4,Light snow
1,-4773019581999572651,2022-01-24 18:30:21,3.740,862.0,3.673054,24.0,1,1,1,0,0,1,87,10,18,-4,Light snow
2,-4773019581999572651,2022-01-24 18:30:21,3.740,862.0,15.550612,27.0,1,1,1,0,0,1,87,10,18,-4,Light snow
3,-4773019581999572651,2022-01-24 18:30:21,3.740,862.0,50.034390,29.0,73,73,1,0,0,1,87,10,18,-4,Light snow
4,-4773019581999572651,2022-01-24 18:30:21,3.740,862.0,13.453126,30.0,73,73,1,0,0,1,87,10,18,-4,Light snow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401306,-6353816735923374488,2022-01-24 00:35:03,6.452,552.0,3.657596,30.0,1,1,0,0,1,0,85,10,23,-5,Clear
401307,-6353816735923374488,2022-01-24 00:35:03,6.452,552.0,114.405667,39.0,1,1,0,0,1,0,85,10,23,-5,Clear
401308,-6353816735923374488,2022-01-24 00:35:03,6.452,552.0,67.195118,47.0,1,1,0,0,1,0,85,10,23,-5,Clear
401309,-6353816735923374488,2022-01-24 00:35:03,6.452,552.0,124.796385,40.0,1,1,0,0,1,0,85,10,23,-5,Clear


In [980]:
dataset_agg = dataset.groupby(['Id', 'running_time', 'route_distance_km', 'delta_time']) \
    .agg({'distance': ['min', 'max', 'mean', 'std'], 'speed': ['min', 'max', 'mean', 'std']}).reset_index()
if len(CATEGORICAL_COLUMNS) > 0:
    dataset_agg[CATEGORICAL_COLUMNS] = dataset[CATEGORICAL_COLUMNS]
if len(FEATURE_COLUMNS) > 0:
    dataset_agg[FEATURE_COLUMNS] = dataset[FEATURE_COLUMNS]
dataset_agg = dataset_agg.drop(columns=['Id'])
dataset_agg['running_time'] = (pd.to_datetime(dataset_agg['running_time']) - datetime.datetime(1970,1,1)).dt.total_seconds()
dataset_agg

Unnamed: 0_level_0,running_time,route_distance_km,delta_time,distance,distance,distance,distance,speed,speed,speed,...,cluster_start,cluster_end,humidity,visibility,windspeedKmph,tempC,is_rush_hour,is_work_hour,is_night_time,is_late_night_time
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,min,max,mean,std,min,max,mean,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.643067e+09,3.179,469.0,2.494184,200.876922,36.177945,34.938977,11.0,43.0,25.987952,...,5,83,87,10,18,-4,1,1,0,0
1,1.643058e+09,6.137,688.0,2.664808,301.851107,51.178833,50.153951,10.0,50.0,29.945455,...,1,1,87,10,18,-4,1,1,0,0
2,1.643064e+09,6.312,683.0,1.261206,201.267382,35.889595,37.219656,7.0,57.0,33.336538,...,1,1,87,10,18,-4,1,1,0,0
3,1.643063e+09,6.379,885.0,1.777369,231.663338,39.080716,42.778210,15.0,52.0,35.449612,...,73,73,87,10,18,-4,1,1,0,0
4,1.643045e+09,2.551,612.0,1.647032,224.340608,31.138847,43.097163,9.0,83.0,24.269231,...,73,73,87,10,18,-4,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1.643065e+09,5.520,753.0,1.306968,327.558417,35.052090,47.395853,11.0,58.0,33.754658,...,64,64,87,10,18,-4,1,1,0,0
4996,1.643057e+09,3.146,462.0,1.688882,259.418013,30.392428,37.476618,18.0,41.0,30.095960,...,89,89,87,10,18,-4,1,1,0,0
4997,1.643002e+09,4.038,607.0,4.213290,184.102066,49.404830,40.030175,0.0,68.0,38.631579,...,37,37,87,10,18,-4,1,1,0,0
4998,1.643056e+09,1.641,223.0,1.306756,89.077514,25.116555,19.720309,13.0,32.0,24.155556,...,89,89,87,10,18,-4,1,1,0,0


In [981]:
dataset_agg = pd.DataFrame(dataset_agg.values, columns=['running_time', 'route_distance_km', 'delta_time', 'min_distance', 'max_distance', 'mean_distance', 'std_distance', 'min_speed', 'max_speed', 'mean_speed', 'std_speed', *CATEGORICAL_COLUMNS, *FEATURE_COLUMNS])
dataset_agg['delta_time'] = np.log1p(dataset_agg['delta_time'].astype('float'))
dataset_agg

Unnamed: 0,running_time,route_distance_km,delta_time,min_distance,max_distance,mean_distance,std_distance,min_speed,max_speed,mean_speed,...,cluster_start,cluster_end,humidity,visibility,windspeedKmph,tempC,is_rush_hour,is_work_hour,is_night_time,is_late_night_time
0,1643066575.0,3.179,6.152733,2.494184,200.876922,36.177945,34.938977,11.0,43.0,25.987952,...,5,83,87,10,18,-4,1,1,0,0
1,1643057710.0,6.137,6.535241,2.664808,301.851107,51.178833,50.153951,10.0,50.0,29.945455,...,1,1,87,10,18,-4,1,1,0,0
2,1643063917.0,6.312,6.527958,1.261206,201.267382,35.889595,37.219656,7.0,57.0,33.336538,...,1,1,87,10,18,-4,1,1,0,0
3,1643062825.0,6.379,6.786717,1.777369,231.663338,39.080716,42.77821,15.0,52.0,35.449612,...,73,73,87,10,18,-4,1,1,0,0
4,1643045148.0,2.551,6.418365,1.647032,224.340608,31.138847,43.097163,9.0,83.0,24.269231,...,73,73,87,10,18,-4,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1643065085.0,5.52,6.625392,1.306968,327.558417,35.05209,47.395853,11.0,58.0,33.754658,...,64,64,87,10,18,-4,1,1,0,0
4996,1643056621.0,3.146,6.137727,1.688882,259.418013,30.392428,37.476618,18.0,41.0,30.09596,...,89,89,87,10,18,-4,1,1,0,0
4997,1643002083.0,4.038,6.410175,4.21329,184.102066,49.40483,40.030175,0.0,68.0,38.631579,...,37,37,87,10,18,-4,1,1,0,0
4998,1643055578.0,1.641,5.411646,1.306756,89.077514,25.116555,19.720309,13.0,32.0,24.155556,...,89,89,87,10,18,-4,1,1,0,0


In [982]:
data = dataset_agg.sample(frac=0.9, random_state=786)
data_unseen = dataset_agg.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (4500, 22)
Unseen Data For Predictions: (500, 22)


In [983]:
s = setup(data = data, target = 'delta_time', categorical_features=CATEGORICAL_COLUMNS, session_id=123, use_gpu=False, normalize = True)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,delta_time
2,Target type,Regression
3,Original data shape,"(4500, 22)"
4,Transformed data shape,"(4500, 22)"
5,Transformed train set shape,"(3150, 22)"
6,Transformed test set shape,"(1350, 22)"
7,Categorical features,1
8,Preprocess,True
9,Imputation type,simple


In [984]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.1773,0.057,0.2381,0.6372,0.0334,0.0285,0.227
et,Extra Trees Regressor,0.1801,0.0579,0.24,0.6312,0.0337,0.029,0.26
rf,Random Forest Regressor,0.1792,0.058,0.2401,0.6308,0.0337,0.0288,0.443
lightgbm,Light Gradient Boosting Machine,0.1793,0.0583,0.2407,0.6288,0.0338,0.0289,0.102
br,Bayesian Ridge,0.2056,0.0735,0.2706,0.5315,0.0377,0.033,0.061
lr,Linear Regression,0.2056,0.0735,0.2707,0.5314,0.0377,0.033,0.067
ridge,Ridge Regression,0.2056,0.0735,0.2707,0.5314,0.0377,0.033,0.065
lar,Least Angle Regression,0.2056,0.0735,0.2707,0.5314,0.0377,0.033,0.062
huber,Huber Regressor,0.2044,0.074,0.2716,0.5279,0.0379,0.0329,0.065
ada,AdaBoost Regressor,0.2226,0.0765,0.2761,0.5118,0.0383,0.0354,0.136


In [985]:
best_model = create_model('gbr')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1733,0.0571,0.2389,0.6251,0.0333,0.0276
1,0.1909,0.0666,0.2581,0.5722,0.0362,0.0307
2,0.1914,0.0706,0.2657,0.5623,0.0371,0.0306
3,0.1651,0.0487,0.2206,0.6935,0.031,0.0267
4,0.1718,0.0514,0.2268,0.6634,0.0319,0.0278
5,0.1864,0.0649,0.2547,0.621,0.036,0.0303
6,0.1788,0.0507,0.2251,0.6562,0.0315,0.0288
7,0.174,0.0518,0.2277,0.655,0.0318,0.0279
8,0.1838,0.0623,0.2496,0.6444,0.0357,0.03
9,0.1581,0.0457,0.2137,0.6791,0.0296,0.0251


In [None]:
tuned_best_model = tune_model(best_model, n_iter=10)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
plot_model(tuned_best_model, 'feature')

In [563]:
interpret_model(tuned_best_model)

ModuleNotFoundError: 
'shap' is a soft dependency and not included in the pycaret installation. Please run: `pip install shap` to install.
Alternately, you can install this by running `pip install pycaret[analysis]`

In [None]:
# validate on unseen data
unseen_predictions = predict_model(tuned_best_model, data=data_unseen)
unseen_predictions

In [None]:
(mean_squared_error(np.expm1(unseen_predictions['delta_time']), np.expm1(unseen_predictions['prediction_label']), squared=False))

### Model predictions on test data

In [None]:
test_dataset = pd.read_csv("datasets/big-table-test.csv")
test_dataset = test_dataset.drop(columns=USELESS_COLUMNS)

test_dataset

In [None]:
prediction_agg = test_dataset.groupby(['Id', 'running_time', 'route_distance_km']) \
    .agg({'distance': ['min', 'max', 'mean', 'std'], 'speed': ['min', 'max', 'mean', 'std']}).reset_index()
if len(CATEGORICAL_COLUMNS) > 0:
    prediction_agg[CATEGORICAL_COLUMNS] = test_dataset[CATEGORICAL_COLUMNS]
if len(FEATURE_COLUMNS) > 0:
    prediction_agg[FEATURE_COLUMNS] = test_dataset[FEATURE_COLUMNS]
prediction_agg = prediction_agg.drop(columns=['Id'])
prediction_agg['running_time'] = (pd.to_datetime(prediction_agg['running_time']) - datetime.datetime(1970,1,1)).dt.total_seconds()
prediction_agg['delta_time'] = 0
prediction_agg

In [None]:
prediction_agg = pd.DataFrame(prediction_agg.values, columns=['running_time', 'route_distance_km', 'min_distance', 'max_distance', 'mean_distance', 'std_distance', 'min_speed', 'max_speed', 'mean_speed', 'std_speed', *CATEGORICAL_COLUMNS, *FEATURE_COLUMNS, 'delta_time'])

In [None]:
prediction_agg

In [None]:
data_prediction = prediction_agg
data_prediction.reset_index(drop=True, inplace=True)

In [None]:
data_prediction.shape

In [None]:
submission_df = pd.read_csv("datasets/submission.csv")
submission_df

In [None]:
submission_predictions = predict_model(tuned_best_model, data=data_prediction)
submission_predictions

In [None]:
result_df = submission_predictions["prediction_label"].reset_index()
result_df.shape

In [None]:
submission_df["Predicted"] = np.expm1(result_df["prediction_label"])

In [None]:
submission_df.to_csv("datasets/submission.csv", index=False)

## Submission phase

Guide: https://www.kaggle.com/code/derrickmwiti/how-to-make-submissions-using-kaggle-s-api/notebook

In [None]:
!pip install kaggle

In [None]:
import os
os.environ["KAGGLE_USERNAME"]="beardimon"
os.environ["KAGGLE_KEY"]="22adc3776f244a54f4732bf8dfb077eb"

In [None]:
!kaggle competitions submit -c int20h-2023-hackathon  -f ./datasets/submission.csv -m "Baseline"

In [None]:
!kaggle competitions leaderboard -c int20h-2023-hackathon --show

In [None]:
!kaggle competitions submissions -c int20h-2023-hackathon