In [33]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

import pandas as pd

# Loading the data
train_df = pd.read_parquet('train.parquet')
external_data_df = pd.read_csv('external_data.csv')

# Convert date columns to datetime
train_df['date'] = pd.to_datetime(train_df['date'])
external_data_df['date'] = pd.to_datetime(external_data_df['date'])

#external data has duplicates therefore we want to drop them
external_data_df = external_data_df.drop_duplicates(subset='date')

# Resample external_data_df to hourly, setting missing values to NaN
external_data_df.set_index('date', inplace=True)
external_data_df = external_data_df.resample('H').asfreq()

# Interpolate to fill missing hourly data
external_data_df.interpolate(method='time', inplace=True)

# Reset the index
external_data_df.reset_index(inplace=True)

# Drop the specified columns with 0 non-null values from both dataframes
cols_to_drop = [col for col in external_data_df.columns if external_data_df[col].notnull().sum() == 0]
external_data_df.drop(cols_to_drop, axis=1, inplace=True)

# Merge the dataframes
merged_df = pd.merge(train_df, external_data_df, on='date', how='left')

# Load lockdown data
#lockdown_df = pd.read_csv('lockdown_data.csv')

# Convert 'datetime' column to datetime type in lockdown_df
#lockdown_df['datetime'] = pd.to_datetime(lockdown_df['datetime'])

# Normalize the 'date' in aligned_df for accurate merging
#aligned_df['date'] = aligned_df['date'].dt.normalize()

# Merge aligned_df with lockdown_df
#merged_df = pd.merge(aligned_df, lockdown_df, left_on='date', right_on='datetime', how='left')

# Drop the additional 'datetime' column after merging, if necessary
#merged_df.drop('datetime', axis=1, inplace=True)

# Calculate the median for each column and replace NaN values in merged_df
medians = merged_df.median(numeric_only=True)
merged_df.fillna(medians, inplace=True)

# Drop counter_id and counter technical_id as they don't seem to be very useful
#merged_df.drop(['counter_id', 'counter_technical_id', 'counter_installation_date'], axis=1, inplace=True)

# List of categorical columns to encode
#categorical_columns = ['counter_name', 'site_name'] 

# Convert categorical columns to string type before hashing
#merged_df['counter_name'] = merged_df['counter_name'].astype(str)
#merged_df['site_name'] = merged_df['site_name'].astype(str)

# Drop unnecessary columns
merged_df.drop(['counter_id', 'counter_technical_id', 'counter_installation_date'], axis=1, inplace=True)

# Drop the original categorical columns
merged_df.drop(['counter_name', 'site_name'], axis=1, inplace=True)

# Encode categorical columns using feature hashing
#hasher = FeatureHasher(n_features=10)

In [25]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# Load the data
train_df = pd.read_parquet('train.parquet')
external_data_df = pd.read_csv('external_data.csv')

# Drop columns with 0 non-null values from external_data_df
cols_to_drop = [col for col in external_data_df.columns if external_data_df[col].notnull().sum() == 0]
external_data_df.drop(cols_to_drop, axis=1, inplace=True)

# Convert 'date' columns to datetime
train_df['date'] = pd.to_datetime(train_df['date'])
external_data_df['date'] = pd.to_datetime(external_data_df['date'])

# Merge the data on the 'date' column
merged_df = pd.merge(train_df, external_data_df, on='date', how='inner')

# Calculate the median for each column and replace NaN values
medians = merged_df.median(numeric_only=True)
merged_df.fillna(medians, inplace=True)

# Drop unnecessary columns
merged_df.drop(['counter_id', 'counter_technical_id', 'counter_installation_date'], axis=1, inplace=True)

# Drop the original categorical columns
merged_df.drop(['counter_name', 'site_name'], axis=1, inplace=True)

# Split the data into features and target
y = merged_df['log_bike_count']
X = merged_df.drop('log_bike_count', axis=1)

In [34]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 55 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   site_id         496827 non-null  int64         
 1   bike_count      496827 non-null  float64       
 2   date            496827 non-null  datetime64[ns]
 3   coordinates     496827 non-null  category      
 4   latitude        496827 non-null  float64       
 5   longitude       496827 non-null  float64       
 6   log_bike_count  496827 non-null  float64       
 7   numer_sta       496827 non-null  float64       
 8   pmer            496827 non-null  float64       
 9   tend            496827 non-null  float64       
 10  cod_tend        496827 non-null  float64       
 11  dd              496827 non-null  float64       
 12  ff              496827 non-null  float64       
 13  t               496827 non-null  float64       
 14  td              496827 non-null  flo

In [35]:
# Extracting columns which have datatype 'category'
categorical_columns = merged_df.select_dtypes(include=['category']).columns.tolist()

# Outputting the categorical variables
print(categorical_columns)

['coordinates']


In [36]:
merged_df[categorical_columns].head()

Unnamed: 0,coordinates
0,"48.846028,2.375429"
1,"48.846028,2.375429"
2,"48.846028,2.375429"
3,"48.846028,2.375429"
4,"48.846028,2.375429"


In [37]:
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Assuming 'aligned_df' contains the 'coordinates' column in the form of 'latitude,longitude'.

# Split 'coordinates' into 'latitude' and 'longitude'
merged_df[['latitude', 'longitude']] = merged_df['coordinates'].str.split(',', expand=True).astype(float)

#applied log to make the coordinates smaller
# Replace 1 with a smaller number if necessary to avoid taking log(0).
merged_df['log_latitude'] = np.log(merged_df['latitude'] + 1)
merged_df['log_longitude'] = np.log(merged_df['longitude'] + 1)


# Initialize the ordinal encoder
#compare with featurehasher
#encoder = OrdinalEncoder()

# Fit and transform 'site_name' and 'counter_name' using ordinal encoding
#merged_df['site_name_encoded'] = encoder.fit_transform(merged_df[['site_name']])
#merged_df['counter_name_encoded'] = encoder.fit_transform(merged_df[['counter_name']])

# Drop the original 'site_name', 'counter_name', and 'coordinates' columns
#merged_df.drop(['site_name', 'counter_name', 'coordinates'], axis=1, inplace=True)
merged_df.drop(['coordinates'], axis=1, inplace=True)


In [38]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496827 entries, 0 to 496826
Data columns (total 56 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   site_id         496827 non-null  int64         
 1   bike_count      496827 non-null  float64       
 2   date            496827 non-null  datetime64[ns]
 3   latitude        496827 non-null  float64       
 4   longitude       496827 non-null  float64       
 5   log_bike_count  496827 non-null  float64       
 6   numer_sta       496827 non-null  float64       
 7   pmer            496827 non-null  float64       
 8   tend            496827 non-null  float64       
 9   cod_tend        496827 non-null  float64       
 10  dd              496827 non-null  float64       
 11  ff              496827 non-null  float64       
 12  t               496827 non-null  float64       
 13  td              496827 non-null  float64       
 14  u               496827 non-null  flo

In [39]:
# Define the target variable 'y'
y = merged_df['log_bike_count'].copy()

# Drop the target variable and any other non-predictor columns to define the features 'X'
X = merged_df.drop(['log_bike_count', 'bike_count'], axis=1)  # Assuming 'bike_count' is also not a predictor

# Now split the data into training and cross-validation sets
# Typically, a simple way to split is to use a certain percentage for training and the rest for validation
# For example, using 80% of the data for training and 20% for cross-validation

# Calculate the split index
split_index = int(len(X) * 0.7)

# Split the features and the target variable into training and cross-validation sets
X_train = X.iloc[:split_index]
X_cross_val = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_cross_val = y.iloc[split_index:]
# Your data is now split into training and cross-validation sets and is ready for model training and evaluation.

In [40]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler

# Define the number of time steps and features
num_timesteps = 3  # for example, using 9 hours of data to predict the next count
num_features = X_train.shape[1]
# Convert datetime column to a numeric feature, e.g., hour of the day

# Reset index if your datetime data is in the index of the DataFrame
X_train = X_train.reset_index()
X_cross_val = X_cross_val.reset_index()

# Extract time-related features from the datetime column
# Replace 'index' with the actual column name of your datetime data if it's different
X_train['hour'] = X_train['date'].dt.hour
X_cross_val['hour'] = X_cross_val['date'].dt.hour

# Drop the original datetime column
X_train = X_train.drop('date', axis=1)
X_cross_val = X_cross_val.drop('date', axis=1)

# Now you can standardize your features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cross_val_scaled = scaler.transform(X_cross_val)

# Function to create sequences of time steps
def create_sequences(data, y, time_steps=num_timesteps):
    Xs, ys = [], []
    for i in range(len(data) - time_steps):
        v = data.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

# Reshape the training and cross-validation data
X_train_seq, y_train_seq = create_sequences(pd.DataFrame(X_train_scaled), y_train)
X_cross_val_seq, y_cross_val_seq = create_sequences(pd.DataFrame(X_cross_val_scaled), y_cross_val)


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense


model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(3, 55)))  # First LSTM layer with 100 neurons
model.add(LSTM(50, return_sequences=True))  # Second LSTM layer with 50 neurons, returning sequences
model.add(LSTM(30))  # Third LSTM layer with 30 neurons
model.add(Dense(1))  # Output layer


# Compile and fit the model as usual

model.compile(optimizer='adam', loss='mean_squared_error')

# Fit the model on the training data
model.fit(X_train_seq, y_train_seq, epochs=50, batch_size=32, verbose=1)

# Test the model on the cross-validation data
y_pred_nn = model.predict(X_cross_val_seq, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50

In [21]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Calculate Mean Squared Error
mse = mean_squared_error(y_cross_val_seq, y_pred_nn)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)

print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 0.9996524837671578


In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the Linear Regression model
linear_regressor = LinearRegression()

# Fit the model to the training data
linear_regressor.fit(X_train_scaled, y_train)

# Predict on the cross-validation data
y_pred_linear = linear_regressor.predict(X_cross_val_scaled)

# Calculate the RMSE for the cross-validation set
rmse_linear = np.sqrt(mean_squared_error(y_cross_val, y_pred_linear))
print(f"Root Mean Squared Error (RMSE) for Linear Regression on Cross-Validation Set: {rmse_linear:.2f}")


Root Mean Squared Error (RMSE) for Linear Regression on Cross-Validation Set: 1.23


In [23]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the XGBoost regressor model
xgb_regressor = XGBRegressor(objective='reg:squarederror')

# Fit the model to the training data
xgb_regressor.fit(X_train_scaled, y_train)

# Predict on the cross-validation data
y_pred_xgb = xgb_regressor.predict(X_cross_val_scaled)

# Calculate the RMSE for the XGBoost model
rmse_xgb = np.sqrt(mean_squared_error(y_cross_val, y_pred_xgb))
print(f"Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: {rmse_xgb:.2f}")

Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: 0.98


In [49]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Assuming y_pred_nn and y_pred_xgboost are the prediction arrays from your neural network and XGBoost models
# And assuming y_true is the actual values

y_pred_xgb_trimmed = y_pred_xgb[:len(y_pred_nn_reshaped)]
# Average predictions from both models
y_cross_val_trimmed = y_cross_val[:len(y_pred_nn_reshaped)]
average_pred = (y_pred_nn_reshaped + y_pred_xgb_trimmed) / 2
# Calculate RMSE
ARMSE = np.sqrt(mean_squared_error(y_cross_val_trimmed, average_pred))
print(f"Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: {ARMSE:.2f}")

Root Mean Squared Error (RMSE) for XGBoost on Cross-Validation Set: 1.11


In [54]:
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

# Assuming y_pred_nn and y_pred_xgb are the prediction arrays from your LSTM and XGBoost models

# Trim y_pred_xgb to match the length of y_pred_nn (predictions from LSTM)
y_pred_xgb_trimmed = y_pred_xgb[(num_timesteps ):]

# Ensure y_cross_val is also trimmed to align with the LSTM predictions
y_cross_val_trimmed = y_cross_val[(num_timesteps ):]

# Average predictions from both models
average_pred = (y_pred_nn + y_pred_xgb_trimmed) / 2

# Calculate RMSE
ARMSE = sqrt(mean_squared_error(y_cross_val_trimmed, average_pred))
print(f"Root Mean Squared Error (RMSE): {ARMSE:.2f}")


MemoryError: Unable to allocate 4.07 GiB for an array with shape (33071, 33071) and data type float32

In [21]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'max_depth': [12,10],
    'learning_rate': [0.02],
    'n_estimators': [1000],
    'subsample': [0.7],
    'colsample_bytree': [0.7]
}

# Initialize XGBoost regressor
xgb = XGBRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit to the data
grid_search.fit(X_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 2 candidates, totalling 6 fits


ValueError: 
All the 6 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\sklearn.py", line 1051, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\sklearn.py", line 534, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\sklearn.py", line 954, in _create_dmatrix
    return QuantileDMatrix(
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 1528, in __init__
    self._init(
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 1587, in _init
    it.reraise()
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 575, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 556, in _handle_exception
    return fn()
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\core.py", line 631, in input_data
    dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\data.py", line 1331, in dispatch_proxy_set_data
    _check_data_shape(data)
  File "C:\Users\Hoecine\New folder (2)\lib\site-packages\xgboost\data.py", line 57, in _check_data_shape
    raise ValueError("Please reshape the input data into 2-dimensional matrix.")
ValueError: Please reshape the input data into 2-dimensional matrix.


In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the validation set
y_pred = best_model.predict(X_cross_val)

# Calculate metrics
rmse = mean_squared_error(y_cross_val, y_pred, squared=False)
mae = mean_absolute_error(y_cross_val, y_pred)
r2 = r2_score(y_cross_val, y_pred)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")
print(best_params)

RMSE: 0.9485270591439028
MAE: 0.6099876373993324
R^2: 0.5713355561776677
{'colsample_bytree': 0.7, 'learning_rate': 0.02, 'max_depth': 10, 'n_estimators': 1000, 'subsample': 0.7}
