In [1]:
import joblib
import lightgbm as lgb
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

SEED = 42

warnings.filterwarnings('ignore')

In [2]:
df = joblib.load('joblib_files/data_processed.plk')
df.head()

Unnamed: 0,hum_quantized,dteday,temp_quantized,hum_binarized,quarter,casual,atemp,windspeed,instant,week_of_year,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weathersit_2,weathersit_3,weathersit_4
0,0.0,2011-01-01,0.0,0.0,1,3,0.2879,0.4925,1,52,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,2011-01-01,0.0,0.0,1,8,0.2727,0.4925,2,52,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,2011-01-01,0.0,0.0,1,5,0.2727,0.4925,3,52,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,2011-01-01,0.0,0.0,1,3,0.2879,0.4925,4,52,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,2011-01-01,0.0,0.0,1,0,0.2879,0.4925,5,52,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [85]:

# Dropping columns that are not required for modeling casual + registered = cnt
X = df.drop(columns=['cnt', 'dteday', 'casual', 'registered'])

# Target variable
y = df['cnt']
# Splitting the data into training (60%), validation (20%), and testing (20%) sets
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

X_train.shape, X_val.shape, X_test.shape


((10427, 69), (3476, 69), (3476, 69))

In [89]:
joblib.dump(y_test, 'joblib_files/y_test.plk')

['joblib_files/y_test.plk']

In [86]:
import plotly.graph_objects as go
import plotly.express as px
import plots as p


In [7]:
def predict_metric(model_p, feature=False):
    if not feature:
        # [5] Predict
        pred_train = model_p.predict(X_train) # predict!
        pred_val = model_p.predict(X_val) # predict!
        pred_test = model_p.predict(X_test) # predict!
        
        # [6] Compute metric
        mae_train = mean_absolute_error(y_train, pred_train)
        mae_val = mean_absolute_error(y_val, pred_val)
        mae_test = mean_absolute_error(y_test, pred_test)
        
        mse_train = mean_squared_error(y_train, pred_train)
        mse_val = mean_squared_error(y_val, pred_val)
        mse_test = mean_squared_error(y_test, pred_test)
        
        r2_train = r2_score(y_train, pred_train)
        r2_val = r2_score(y_val, pred_val)
        r2_test = r2_score(y_test, pred_test)
    else:
        # [5] Predict
        pred_train = model_p.predict(X_train[feature]) # predict!
        pred_val = model_p.predict(X_val[feature]) # predict!
        pred_test = model_p.predict(X_test[feature]) # predict!
        
        # [6] Compute metric
        mae_train = mean_absolute_error(y_train, pred_train)
        mae_val = mean_absolute_error(y_val, pred_val)
        mae_test = mean_absolute_error(y_test, pred_test)
        
        mse_train = mean_squared_error(y_train, pred_train)
        mse_val = mean_squared_error(y_val, pred_val)
        mse_test = mean_squared_error(y_test, pred_test)
        
        r2_train = r2_score(y_train, pred_train)
        r2_val = r2_score(y_val, pred_val)
        r2_test = r2_score(y_test, pred_test)
    print(f'MAE: train {mae_train:.4f}, validation {mae_val:.4f}, test {mae_test:.4f}')
    print(f'MSE: train {mse_train:.4f}, validation {mse_val:.4f}, test {mse_test:.4f}')
    print(f'R2: train {r2_train:.4f}, validation {r2_val:.4f}, test {r2_test:.4f}')
    return pred_test

# Linear Regression

In [8]:
linear_model = LinearRegression()

linear_model.fit(X_train, y_train)

In [9]:
pred_test_linear = predict_metric(linear_model)

MAE: train 8.2300, validation 7.8869, test 8.0224
MSE: train 428.9310, validation 409.0490, test 409.4724
R2: train 0.1718, validation 0.1824, test 0.1597


The results are quite poor, let's try with other models

In [10]:
joblib.dump(pred_test_linear, 'joblib_files/pred_linear_reg.plk')

['joblib_files/pred_linear_reg.plk']

In [87]:
p.plot_predictions(500, y_test, 'Linear Regression', pred_test_linear)

# Random Forest

In [96]:
reduced_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [100, 50],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

In [97]:
reduced_grid_search = GridSearchCV(RandomForestRegressor(random_state=SEED), reduced_param_grid, 
                                   cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)

In [98]:
reduced_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [99]:
joblib.dump(reduced_grid_search, 'grid_search_RF.plk')

['grid_search_RF.plk']

In [100]:
grid_rf = joblib.load('joblib_files/grid_search_RF.plk')

In [101]:
if not grid_rf:
    best_model = reduced_grid_search.best_params_
else:
    best_model = grid_rf.best_params_
best_model

{'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 200}

In [42]:
if not grid_rf:
    print(reduced_grid_search.best_score_)
else:
    print(grid_rf.best_score_)

-116.77683226250875


In [43]:
model_rf = RandomForestRegressor(max_depth=best_model['max_depth'],
                              min_samples_leaf=best_model['min_samples_leaf'],
                              min_samples_split=best_model['min_samples_split'],
                              n_estimators=best_model['n_estimators'],
                              # max_features=best_model['max_features'],
                              random_state=SEED)
# [4] Train model
model_rf.fit(X_train, y_train)

In [44]:
pred_test_rf = predict_metric(model_rf)

MAE: train 0.9880, validation 1.7506, test 1.6587
MSE: train 36.4599, validation 124.4141, test 112.3184
R2: train 0.9296, validation 0.7513, test 0.7695


In [45]:
joblib.dump(pred_test_rf, 'joblib_files/pred_random_forest.plk')

['joblib_files/pred_random_forest.plk']

### ATTEMPT 1:

```
reduced_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
```

| Metric | Train   | Validation | Test    |
|--------|---------|------------|---------|
| MAE    | 0.9872  | 1.7541     | 1.6608  |
| MSE    | 36.3590 | 125.3670   | 112.5398|
| R2     | 0.9298  | 0.7494     | 0.7690  |

* MAE -> the error is too low in train but a bit bigger in validation and test suggesting overfitting
* MSE -> The overfitting is more clear
* R2 -> It performs worse in unseen data

### ATTEMPT 2:

```
reduced_param_grid = {
    'n_estimators': [300, 500],
    'max_depth': [4,5,6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [2, 4, 6],
    'max_features': ['sqrt', 'log2'],
}
```

| Metric | Train    | Validation | Test    |
|--------|----------|------------|---------|
| MAE    | 4.0496   | 4.1308     | 3.9414  |
| MSE    | 260.1930 | 271.2116   | 273.0711|
| R2     | 0.4976   | 0.4579     | 0.4396  |


* We have reduced the overfitting. However, the results are worse.

### ATTEMPT 3:

```
reduced_param_grid = {
    'n_estimators': [500],
    'max_depth': [10, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2'],
}
```

| Metric | Train    | Validation | Test    |
|--------|----------|------------|---------|
| MAE    | 1.7801   | 2.9746     | 2.7659  |
| MSE    | 59.2387  | 167.5455   | 174.0075|
| R2     | 0.8856   | 0.6651     | 0.6429  |


* The more we increase max_depth, the more overfitting we have.
    

In [49]:
plot_predictions(500, 'Random Forest Regressor', pred_test_rf)

# XGBoost

In [51]:
n_estimators_values = [200, 500]
learning_rate_values = [0.02,0.05]
max_depth_values = [12, 18]
min_child_weight_values = [15]
subsample_values = [0.8]
colsample_bytree_values = [0.2, 0.4]
# gamma_values = [0, 0.1]
# lambda_values = [1, 10]
# alpha_values = [1, 10]

params_grid = {
    'n_estimators': n_estimators_values,
    'learning_rate': learning_rate_values,
    'max_depth': max_depth_values,
    'min_child_weight': min_child_weight_values,
    'subsample': subsample_values,
    'colsample_bytree': colsample_bytree_values,
    # 'gamma': gamma_values,
    # 'lambda': lambda_values,
    # 'alpha': alpha_values
}


In [52]:
# Model initialization
model = XGBRegressor(
    early_stopping_rounds=20,
    eval_metric="rmse",
    device='cuda',
    tree_method='gpu_hist',
    random_state=SEED
)

# Grid search with cross-validation
grid_search_xgb = GridSearchCV(model, params_grid, 
                               cv=5, scoring='neg_mean_squared_error', verbose=2)

X_train['week_of_year'] = X_train['week_of_year'].astype('int32')
X_val['week_of_year'] = X_val['week_of_year'].astype('int32')
X_test['week_of_year'] = X_test['week_of_year'].astype('int32')

# Fitting the model
grid_search_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[0]	validation_0-rmse:22.34110
[1]	validation_0-rmse:22.31922
[2]	validation_0-rmse:22.27583
[3]	validation_0-rmse:22.19364
[4]	validation_0-rmse:22.15546
[5]	validation_0-rmse:22.12915
[6]	validation_0-rmse:22.10079
[7]	validation_0-rmse:22.07563
[8]	validation_0-rmse:22.02936
[9]	validation_0-rmse:22.02620
[10]	validation_0-rmse:22.01868
[11]	validation_0-rmse:21.99430
[12]	validation_0-rmse:21.92919
[13]	validation_0-rmse:21.86858
[14]	validation_0-rmse:21.84105
[15]	validation_0-rmse:21.79330
[16]	validation_0-rmse:21.75273
[17]	validation_0-rmse:21.68056
[18]	validation_0-rmse:21.66680
[19]	validation_0-rmse:21.64766
[20]	validation_0-rmse:21.56860
[21]	validation_0-rmse:21.54009
[22]	validation_0-rmse:21.39913
[23]	validation_0-rmse:21.39286
[24]	validation_0-rmse:21.26245
[25]	validation_0-rmse:21.24235
[26]	validation_0-rmse:21.17897
[27]	validation_0-rmse:21.16996
[28]	validation_0-rmse:21.15675
[29]	validation_0-rms

In [53]:
joblib.dump(grid_search_xgb, 'grid_search_XGB.plk')

['grid_search_XGB.plk']

In [55]:
grid_xgb = joblib.load('joblib_files/grid_search_XGB.plk')

In [56]:
if not grid_xgb:
    best_params_xgb = grid_search_xgb.best_params_
else:
    best_params_xgb = grid_xgb.best_params_
best_params_xgb

{'colsample_bytree': 0.4,
 'learning_rate': 0.05,
 'max_depth': 12,
 'min_child_weight': 15,
 'n_estimators': 500,
 'subsample': 0.8}

In [57]:
model_xgb = XGBRegressor(
    early_stopping_rounds=20,
    eval_metric="rmse",
    tree_method='gpu_hist',
    random_state=SEED,
    colsample_bytree=best_params_xgb['colsample_bytree'],
    learning_rate=best_params_xgb['learning_rate'],
    max_depth=best_params_xgb['max_depth'],
    min_child_weight=best_params_xgb['min_child_weight'],
    n_estimators=best_params_xgb['n_estimators'],
    subsample=best_params_xgb['subsample'],  # Corrected 'subsamble' typo to 'subsample'
    # gamma=best_params_xgb['gamma'],
    # reg_lambda=best_params_xgb['lambda'],
    # reg_alpha=best_params_xgb['alpha']
)


model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-rmse:21.96132
[1]	validation_0-rmse:21.72921
[2]	validation_0-rmse:21.48736
[3]	validation_0-rmse:21.26410
[4]	validation_0-rmse:21.12246
[5]	validation_0-rmse:21.04074
[6]	validation_0-rmse:20.74946
[7]	validation_0-rmse:20.67893
[8]	validation_0-rmse:20.57707
[9]	validation_0-rmse:20.39463
[10]	validation_0-rmse:20.31525
[11]	validation_0-rmse:20.14035
[12]	validation_0-rmse:19.80680
[13]	validation_0-rmse:19.36409
[14]	validation_0-rmse:18.94777
[15]	validation_0-rmse:18.68470
[16]	validation_0-rmse:18.61869
[17]	validation_0-rmse:18.29156
[18]	validation_0-rmse:18.27198
[19]	validation_0-rmse:18.17950
[20]	validation_0-rmse:17.82229
[21]	validation_0-rmse:17.62035
[22]	validation_0-rmse:17.41508
[23]	validation_0-rmse:17.33295
[24]	validation_0-rmse:17.19924
[25]	validation_0-rmse:17.00917
[26]	validation_0-rmse:16.93140
[27]	validation_0-rmse:16.81287
[28]	validation_0-rmse:16.78789
[29]	validation_0-rmse:16.37888
[30]	validation_0-rmse:16.30468
[31]	validation_0-

In [58]:
pred_test_xgb = predict_metric(model_xgb)

MAE: train 2.0119, validation 3.3658, test 3.1496
MSE: train 41.8780, validation 139.1574, test 138.7585
R2: train 0.9191, validation 0.7219, test 0.7152


In [59]:
joblib.dump(pred_test_xgb, 'joblib_files/pred_xgb.plk')

['joblib_files/pred_xgb.plk']

### ATTEMPT 1:

```
n_estimators_values = [200, 500]
learning_rate_values = [0.02,0.05]
max_depth_values = [12, 18]
min_child_weight_values = [15]
subsample_values = [0.8]
colsample_bytree_values = [0.2,0.4]
```

| Metric | Train   | Validation | Test    |
|--------|---------|------------|---------|
| MAE    | 2.0232  | 3.4166     | 3.1178  |
| MSE    | 42.5883 | 142.1871   | 135.3870|
| R2     | 0.9178  | 0.7158     | 0.7222  |

* The results are similar to Random Forest. Let's try to improve it.

### ATTEMPT 2 & 3:

```
n_estimators_values = [300]
learning_rate_values = [0.01]
max_depth_values = [12]
min_child_weight_values = [1, 2]
subsample_values = [0.8]
colsample_bytree_values = [0.6]
gamma_values = [0, 0.1]
lambda_values = [1, 10]
alpha_values = [1, 10]
```
```
n_estimators_values = [300]
learning_rate_values = [0.01, 0.05]
max_depth_values = [10, 12]
min_child_weight_values = [1, 2]
subsample_values = [0.8]
colsample_bytree_values = [0.4, 0.6]
gamma_values = [0, 0.1]
lambda_values = [1, 10]
alpha_values = [1, 10]
```

| Metric | Train   | Validation | Test    |
|--------|---------|------------|---------|
| MAE    | 1.3223  | 2.6211     | 2.4499  |
| MSE    | 35.1524 | 132.1773   | 135.8531|
| R2     | 0.9321  | 0.7358     | 0.7212  |

* We have slightly improved the MAE. It is clear that we have overfitting. 
* We have tried different approach as regularization Alpha and Lambda but the result is not significant better

In [50]:
pred_test_xgb = joblib.load('joblib_files/pred_xgb.plk')
plot_predictions(500, 'XGBoost Regressor', pred_test_xgb)

# Lightgbm

In [60]:
# Base parameters
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'tree_learner': 'serial',
    'seed': SEED,
    'device_type': 'gpu',
    'metric': 'rmse',
}


params_grid = {
    'learning_rate': [0.05, 0.08],
    'colsample_bytree': [0.5, 0.6],
    'subsample': [0.7, 0.8],
    'n_estimators': [2000, 3000],
    'early_stopping_rounds': [100]
}

params_grid_2 = {
    'learning_rate': [0.02],
    'colsample_bytree': [0.4, 0.5],
    'subsample': [0.8],
    'n_estimators': [300],
    'early_stopping_rounds': [30, 50],
    'lambda_l1': [1, 5],
    'lambda_l2': [1, 5],
    'max_depth': [6, 8],
    'min_child_samples': [20, 30],
    'feature_fraction': [0.6, 0.7],
    'bagging_fraction': [0.7, 0.8],
    'bagging_freq': [5, 7],
    'num_leaves': [30, 40]
}

In [61]:
# Set up LightGBM Regressor
lgb_regressor = lgb.LGBMRegressor(**lgb_params)

# Set up grid search with cross-validation
grid_search = GridSearchCV(
    lgb_regressor, 
    param_grid=params_grid, 
    scoring='neg_mean_squared_error', 
    cv=5,
    verbose=0
)

In [62]:
# Run the grid search
grid_search.fit(X_train, y_train, eval_metric='mse', eval_set=[(X_val, y_val)])

[LightGBM] [Info] Number of positive: 8341, number of negative: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 999
[LightGBM] [Info] Number of data points in the train set: 8341, number of used features: 64
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4050 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 19 dense feature groups (0.16 MB) transferred to GPU in 0.006081 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=1.000000 -> initscore=34.539576
[LightGBM] [Info] Start training from score 34.539576
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's l2: 416510	valid_0's rmse: 645.376
[LightGBM] [Info] Number of positive: 8341, number of negative: 0
[LightGBM] [Info] This is the GPU trainer!!
[LightGB

In [63]:
joblib.dump(grid_search, 'joblib_files/grid_search_LGBM.plk')

['joblib_files/grid_search_LGBM.plk']

In [64]:
grid_lightgbm = joblib.load('joblib_files/grid_search_LGBM.plk')

In [65]:
if not grid_lightgbm:
    # Get the best parameters and best score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
else:
    # Get the best parameters and best score
    best_params = grid_lightgbm.best_params_
    best_score = grid_lightgbm.best_score_

print("Best parameters found: ", best_params)
print("Best MSE score: ", best_score)

Best parameters found:  {'colsample_bytree': 0.5, 'early_stopping_rounds': 100, 'learning_rate': 0.05, 'n_estimators': 2000, 'subsample': 0.7}
Best MSE score:  -416752.62023976224


In [66]:
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [67]:
model_lgb = lgb.train(best_params,
                  train_data,
                  valid_sets=[train_data, valid_data])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002756 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1011
[LightGBM] [Info] Number of data points in the train set: 10427, number of used features: 64
[LightGBM] [Info] Start training from score 646.162655
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[464]	training's l2: 36.324	valid_1's l2: 119.332


In [68]:
joblib.dump(model_lgb, 'final_model.plk')

['final_model.plk']

In [69]:
pred_test_lbm = predict_metric(model_lgb)

MAE: train 1.8327, validation 2.9053, test 2.6601
MSE: train 36.3240, validation 119.3318, test 117.8442
R2: train 0.9299, validation 0.7615, test 0.7582


In [70]:
joblib.dump(pred_test_lbm, 'joblib_files/pred_lightgbm.plk')

['joblib_files/pred_lightgbm.plk']

### ATTEMPT 1:

```
params_grid = {
    'learning_rate': [0.05, 0.08],
    'colsample_bytree': [0.5, 0.6],
    'subsample': [0.7, 0.8],
    'n_estimators': [2000, 3000],
    'early_stopping_rounds': [100]
}
```

| Metric | Train   | Validation | Test    |
|--------|---------|------------|---------|
| MAE    | 1.7593  | 2.9279     | 2.6691  |
| MSE    | 32.4495 | 121.4059   | 113.9436|
| R2     | 0.9373  | 0.7573     | 0.7662  |


* The results are slightly better than XGboost and RandomForest.
* Lightgbm is evidently faster than the other models. It takes a few seconds comparing with XGBoost.

### ATTEMPT 2:

```
params_grid = {
    'learning_rate': [0.05, 0.08],
    'colsample_bytree': [0.5, 0.6],
    'subsample': [0.7, 0.8],
    'n_estimators': [1000],
    'early_stopping_rounds': [50],
    'lambda_l1': [0.1, 1, 10],
    'lambda_l2': [0.1,1, 10]
}
```

| Metric | Train   | Validation | Test    |
|--------|---------|------------|---------|
| MAE    | 1.8961  | 2.8865     | 2.5909  |
| MSE    | 42.7763 | 122.7128   | 114.6251|
| R2     | 0.9174  | 0.7547     | 0.7648  |



* The results have not improved with regularization


### ATTEMPT 3:

```
params_grid = {
    'learning_rate': [0.02],
    'colsample_bytree': [0.4, 0.5],
    'subsample': [0.8],
    'n_estimators': [300],
    'early_stopping_rounds': [30, 50],
    'lambda_l1': [1, 5],
    'lambda_l2': [1, 5],
    'max_depth': [6, 8],
    'min_child_samples': [20, 30],
    'feature_fraction': [0.6, 0.7],
    'bagging_fraction': [0.7, 0.8],
    'bagging_freq': [5, 7],
    'num_leaves': [30, 40]
}
```

| Metric | Train   | Validation | Test    |
|--------|---------|------------|---------|
| MAE    | 2.3217  | 2.8384     | 2.5656  |
| MSE    | 86.1163 | 134.1792   | 127.3785|
| R2     | 0.8337  | 0.7318     | 0.7386  |


* In this attempt we have included several parameters to try to avoid overfitting. However, we could not improve our model

In [71]:

plot_predictions(500, 'LightGBM Regressor', pred_test_lbm)

# Final Model

In [104]:
final_model = joblib.load('joblib_files/final_model.plk')

In [105]:
import pandas as pd

# Extract feature importance
feature_importances = final_model.feature_importance(importance_type='split')

# Create DataFrame for variable importance
var_imp = pd.DataFrame({'var': X_train.columns, 'imp': feature_importances})
var_imp.sort_values(['imp'], ascending=False, inplace=True)

In [106]:
var_imp

Unnamed: 0,var,imp
6,instant,2434
4,atemp,1262
8,temp_squared,1215
16,temp_difference,1182
7,week_of_year,1034
...,...,...
24,mnth_2,0
18,hum,0
13,hum_squared,0
2,hum_binarized,0


In [107]:
# Select top 10 variables
top_var = var_imp.nlargest(33, 'imp')['var'].tolist()
train_data = lgb.Dataset(X_train[top_var], label=y_train)
valid_data = lgb.Dataset(X_val[top_var], label=y_val, reference=train_data)
final_model = lgb.train(best_params,
                  train_data,
                  valid_sets=[train_data, valid_data])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 940
[LightGBM] [Info] Number of data points in the train set: 10427, number of used features: 33
[LightGBM] [Info] Start training from score 646.162655
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[254]	training's l2: 46.1349	valid_1's l2: 119.225


In [108]:
pred_test_final = predict_metric(final_model, top_var)

MAE: train 1.8979, validation 2.8076, test 2.5197
MSE: train 46.1349, validation 119.2248, test 111.4118
R2: train 0.9109, validation 0.7617, test 0.7714


In [113]:
joblib.dump(pred_test_final, 'joblib_files/final_model.plk')

['joblib_files/final_model.plk']

In [114]:
p.plot_predictions(500, y_test, 'Final Model', pred_test_final)

In [110]:
joblib.dump(final_model, 'joblib_files/FINAL_MODEL_IMPROVED.plk')

['joblib_files/FINAL_MODEL_IMPROVED.plk']

* We have applied feature selection and the model has slightly improved using 33 features instead of 69.

## **Conclusion**

---

### **1. Data Preprocessing** 
- The dataset underwent thorough preprocessing, addressing missing values, outliers, and encoding categorical variables. This set a strong foundation for the next steps.

### **2. Model Exploration** 
- A diverse set of models, including Random Forest, XGBoost, Linear Regression, and LightGBM, were explored. They were evaluated using metrics such as MAE, MSE, and R2.

### **3. Overfitting Challenge** 
- A common trend was overfitting, where models excelled with training data but faltered on validation/test sets. This hinted at models potentially being too intricate.

### **4. The Role of Regularization** 
- To tackle overfitting, regularization came to the rescue, especially with tree-based models. It effectively penalizes models that align too closely with training data.

### **5. The Best Performer - LightGBM** 
- Among all, LightGBM was the star. Its speed, capability to manage categories directly, and resistance to overfitting (with the right tuning) made it shine brighter than the rest.

---

### **Final Thoughts**
- LightGBM outperformed others in our tests. Thanks to careful regularization and feature selection, its efficiency was even more evident. This model's predictions will aid in better bike allocation, leading to cost savings and happier users. Plus, it ensures bikes are available as needed, cutting down on costs and amplifying user satisfaction.
