In [17]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit

#Load training and testing datasets & remove unnecessary cols
train_data = pd.read_parquet(Path("data") / "train.parquet")
test_data = pd.read_parquet(Path("data") / "test.parquet")
train_data.drop(columns=['counter_name', 'site_name','counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)
test_data.drop(columns=['counter_name', 'site_name','counter_id', 'counter_installation_date', 'counter_technical_id', 'site_id'], inplace=True)

#Load weather dataset and remove irrelevant columns
weather_data = pd.read_csv(Path("data") / "hourly-weather-data.csv")
weather_data = weather_data.drop(columns=['name', 'dew', 'precipprob', 'preciptype','uvindex','icon','stations', 'sealevelpressure', 'winddir', 'conditions', 'sealevelpressure', 'severerisk', 'solarradiation', 'solarenergy'])

#convert to datetime to merge them properly
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])

#merge them and remove one of the date cols
merged_train_data = pd.merge(train_data, weather_data, left_on='date', right_on='datetime', how='inner')
merged_test_data = pd.merge(test_data, weather_data, left_on='date', right_on='datetime', how='inner')
merged_train_data = merged_train_data.drop(columns=['datetime'])
merged_test_data = merged_test_data.drop(columns=['datetime'])

#encode the dates
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    X['date'] = pd.to_datetime(X['date'])
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    return X.drop(columns=["date"])

merged_train_data = _encode_dates(merged_train_data)
merged_test_data = _encode_dates(merged_test_data)



In [19]:
merged_test_data.head()

Unnamed: 0,bike_count,latitude,longitude,log_bike_count,temp,feelslike,humidity,precip,snow,snowdepth,windgust,windspeed,cloudcover,visibility,year,month,day,weekday,hour
0,1.0,48.846028,2.375429,0.693147,16.1,16.1,94.09,0.0,0.0,0.0,22.4,7.3,95.2,13.9,2021,8,10,1,5
1,16.0,48.846028,2.375429,2.833213,16.1,16.1,94.09,0.0,0.0,0.0,22.4,7.3,95.2,13.9,2021,8,10,1,5
2,17.0,48.83436,2.377,2.890372,16.1,16.1,94.09,0.0,0.0,0.0,22.4,7.3,95.2,13.9,2021,8,10,1,5
3,53.0,48.83436,2.377,3.988984,16.1,16.1,94.09,0.0,0.0,0.0,22.4,7.3,95.2,13.9,2021,8,10,1,5
4,10.0,48.85372,2.35702,2.397895,16.1,16.1,94.09,0.0,0.0,0.0,22.4,7.3,95.2,13.9,2021,8,10,1,5


In [20]:
merged_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455163 entries, 0 to 455162
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   bike_count      455163 non-null  float64
 1   latitude        455163 non-null  float64
 2   longitude       455163 non-null  float64
 3   log_bike_count  455163 non-null  float64
 4   temp            455163 non-null  float64
 5   feelslike       455163 non-null  float64
 6   humidity        455163 non-null  float64
 7   precip          455163 non-null  float64
 8   snow            454997 non-null  float64
 9   snowdepth       454997 non-null  float64
 10  windgust        448063 non-null  float64
 11  windspeed       455163 non-null  float64
 12  cloudcover      455163 non-null  float64
 13  visibility      454997 non-null  float64
 14  year            455163 non-null  int32  
 15  month           455163 non-null  int32  
 16  day             455163 non-null  int32  
 17  weekday   

In [22]:
# Define the target variable 'y'
y = merged_train_data['log_bike_count'].copy()

# Drop the target variable and any other non-predictor columns to define the features 'X'
X = merged_train_data.drop(['log_bike_count', 'bike_count'], axis=1)  # Assuming 'bike_count' is also not a predictor

# Now split the data into training and cross-validation sets
# Typically, a simple way to split is to use a certain percentage for training and the rest for validation
# For example, using 80% of the data for training and 20% for cross-validation

# Calculate the split index
split_index = int(len(X) * 0.7)

# Split the features and the target variable into training and cross-validation sets
X_train = X.iloc[:split_index]
X_cross_val = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_cross_val = y.iloc[split_index:]
# Your data is now split into training and cross-validation sets and is ready for model training and evaluation.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318614 entries, 0 to 318613
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   latitude    318614 non-null  float64
 1   longitude   318614 non-null  float64
 2   temp        318614 non-null  float64
 3   feelslike   318614 non-null  float64
 4   humidity    318614 non-null  float64
 5   precip      318614 non-null  float64
 6   snow        318448 non-null  float64
 7   snowdepth   318448 non-null  float64
 8   windgust    311514 non-null  float64
 9   windspeed   318614 non-null  float64
 10  cloudcover  318614 non-null  float64
 11  visibility  318448 non-null  float64
 12  year        318614 non-null  int32  
 13  month       318614 non-null  int32  
 14  day         318614 non-null  int32  
 15  weekday     318614 non-null  int32  
 16  hour        318614 non-null  int32  
dtypes: float64(12), int32(5)
memory usage: 35.2 MB


In [23]:
num_timesteps = 24

# Standardize your features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cross_val_scaled = scaler.transform(X_cross_val)

# Function to create sequences of time steps
def create_sequences(data, y, time_steps=num_timesteps):
    Xs, ys = [], []
    for i in range(len(data) - time_steps):
        v = data.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

# Reshape the training and cross-validation data
X_train_seq, y_train_seq = create_sequences(pd.DataFrame(X_train_scaled), y_train)
X_cross_val_seq, y_cross_val_seq = create_sequences(pd.DataFrame(X_cross_val_scaled), y_cross_val)


In [26]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-macosx_10_15_x86_64.whl (239.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.1/239.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting opt-einsum>=2.3.2
  Downloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1
  Downloading gast-0.5.4-py3-none-any.whl (19 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Collecting keras<2.16,>=2.15.0
  Downloading keras-2.15.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting libclang>=13.0.0
  Downloading libclang-16.0.6-py2.py3-none-macosx_10_9_x86_64.whl (24.5 MB)
[2K     [90m━━━━━━━━━━━━━

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# Define the number of time steps and features
num_timesteps = 24   # Using 24 hours of data to predict the next hour
num_features = X_train.shape[1]

# Define the LSTM model
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(num_timesteps, num_features)))  # Adjust input_shape
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(30))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Fit the model on the training data
model.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, verbose=1)

# Test the model on the cross-validation data
y_pred_nn = model.predict(X_cross_val_seq, verbose=1)


Epoch 1/50
1681/9956 [====>.........................] - ETA: 8:47 - loss: nan

KeyboardInterrupt: 

In [31]:
from sklearn.metrics import mean_squared_error
import numpy as np

y_pred_nn = model.predict(X_cross_val_seq, verbose=1)


# Calculate Mean Squared Error
mse = mean_squared_error(y_cross_val_seq, y_pred_nn)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)




ValueError: Input contains NaN.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the Linear Regression model
linear_regressor = LinearRegression()

# Fit the model to the training data
linear_regressor.fit(X_train_scaled, y_train)

# Predict on the cross-validation data
y_pred_linear = linear_regressor.predict(X_cross_val_scaled)

# Calculate the RMSE for the cross-validation set
rmse_linear = np.sqrt(mean_squared_error(y_cross_val, y_pred_linear))
print(f"Root Mean Squared Error (RMSE) for Linear Regression on Cross-Validation Set: {rmse_linear:.2f}")


In [45]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the XGBoost regressor model
xgb_regressor = XGBRegressor(objective='reg:squarederror')

# Fit the model to the training data
xgb_regressor.fit(X_train_scaled, y_train)

# Predict on the cross-validation data
y_pred_xgb = xgb_regressor.predict(X_cross_val_scaled)

# Calculate the RMSE for the XGBoost model
rmse_xgb = np.sqrt(mean_squared_error(y_cross_val, y_pred_xgb))
print(f"Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: {rmse_xgb:.2f}")

Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: 1.08


In [49]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Assuming y_pred_nn and y_pred_xgboost are the prediction arrays from your neural network and XGBoost models
# And assuming y_true is the actual values

y_pred_xgb_trimmed = y_pred_xgb[:len(y_pred_nn_reshaped)]
# Average predictions from both models
y_cross_val_trimmed = y_cross_val[:len(y_pred_nn_reshaped)]
average_pred = (y_pred_nn_reshaped + y_pred_xgb_trimmed) / 2
# Calculate RMSE
ARMSE = np.sqrt(mean_squared_error(y_cross_val_trimmed, average_pred))
print(f"Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: {ARMSE:.2f}")

Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: 1.11


In [54]:
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

# Assuming y_pred_nn and y_pred_xgb are the prediction arrays from your LSTM and XGBoost models

# Trim y_pred_xgb to match the length of y_pred_nn (predictions from LSTM)
y_pred_xgb_trimmed = y_pred_xgb[(num_timesteps ):]

# Ensure y_cross_val is also trimmed to align with the LSTM predictions
y_cross_val_trimmed = y_cross_val[(num_timesteps ):]

# Average predictions from both models
average_pred = (y_pred_nn + y_pred_xgb_trimmed) / 2

# Calculate RMSE
ARMSE = sqrt(mean_squared_error(y_cross_val_trimmed, average_pred))
print(f"Root Mean Squared Error (RMSE): {ARMSE:.2f}")


MemoryError: Unable to allocate 4.07 GiB for an array with shape (33071, 33071) and data type float32

In [34]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Define an extended parameter grid

extended_param_grid = {
    'max_depth': [6, 8, 10, 12, 14],       # Range of values for the maximum depth of the trees
    'learning_rate': [0.01, 0.02, 0.05],   # Different learning rates to explore
    'n_estimators': [500, 1000, 1500],     # Number of trees in the forest
    'subsample': [0.5, 0.7, 0.9],          # Subsample ratio of the training instances
    'colsample_bytree': [0.5, 0.7, 0.9],   # Subsample ratio of columns when constructing each tree
    'min_child_weight': [1, 3, 5]          # Minimum sum of instance weight (hessian) needed in a child
}

# Initialize XGBoost regressor
xgb = XGBRegressor()

# Set up GridSearchCV with the extended parameter grid
extended_grid_search = GridSearchCV(estimator=xgb, param_grid=extended_param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit to the data
extended_grid_search.fit(X_train_scaled, y_train)

# Best parameters and model from the extended grid search
best_params_extended = extended_grid_search.best_params_
best_model_extended = extended_grid_search.best_estimator_

Fitting 3 folds for each of 1215 candidates, totalling 3645 fits
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.5; total time=  51.0s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.7; total time=  51.1s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.5; total time=  51.7s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.5; total time=  52.2s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.7; total time=  49.5s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.7; total time=  49.9s
[CV] END colsample_bytree=0.5, learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=500, subsample=0.9; to

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the validation set
y_pred = best_model.predict(X_cross_val)

# Calculate metrics
rmse = mean_squared_error(y_cross_val, y_pred, squared=False)
mae = mean_absolute_error(y_cross_val, y_pred)
r2 = r2_score(y_cross_val, y_pred)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")
print(best_params)

RMSE: 0.9485270591439028
MAE: 0.6099876373993324
R^2: 0.5713355561776677
{'colsample_bytree': 0.7, 'learning_rate': 0.02, 'max_depth': 10, 'n_estimators': 1000, 'subsample': 0.7}
