### Imports

In [None]:
pip install pandas numpy matplotlib seaborn scikit-learn pyarrow tensorflow

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, concatenate, Dropout

### Data Loading

In [None]:
# Read the four monthly yellow-taxi parquet files (June-September 2025) into
# pandas DataFrames, one per month, in chronological order.
trip_jun, trip_jul, trip_aug, trip_sep = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        '../data/yellow_tripdata_2025-06.parquet',
        '../data/yellow_tripdata_2025-07.parquet',
        '../data/yellow_tripdata_2025-08.parquet',
        '../data/yellow_tripdata_2025-09.parquet',
    )
)
# Show the schema and memory footprint of the most recent month.
trip_sep.info()

In [None]:
# Stack the monthly frames into one table with a fresh 0..n-1 row index.
monthly_frames = [trip_jun, trip_jul, trip_aug, trip_sep]
df = pd.concat(monthly_frames, ignore_index=True)

# Make sure the pickup column is a datetime dtype so it can be filtered and resampled.
pickup_times = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_pickup_datetime'] = pickup_times
df.info()

### Time Series

In [None]:
# Rows without a pickup timestamp cannot be placed on the timeline, so drop them.
df = df.dropna(subset=['tpep_pickup_datetime'])

# Study period: 2025-06-01 00:00 (inclusive) up to 2025-10-01 00:00 (exclusive),
# matching the June-September files loaded above.
# The upper bound must be strict: with `<=`, pickups stamped exactly at
# 2025-10-01 00:00:00 would survive the filter and the hourly resample would
# create one extra, nearly empty October bin at the very end of the series.
start_date = '2025-06-01'
end_date = '2025-10-01'
mask = (df['tpep_pickup_datetime'] >= start_date) & (df['tpep_pickup_datetime'] < end_date)

# Copy so later column assignments do not act on a view of the unfiltered frame.
df = df[mask].copy()

In [None]:
# Collapse individual trips into the number of pickups per hour.
# resample('h').size() counts rows in each hourly bin of the datetime index.
hourly_counts = df.set_index('tpep_pickup_datetime').resample('h').size()

# From here on `df` is the hourly series (one row per hour), not the raw trips.
df = hourly_counts.to_frame(name='trip_count')

# Calendar features derived from the hourly timestamp index.
df = df.assign(
    hour=df.index.hour,
    day_of_week=df.index.dayofweek,
)
# dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 mark Saturday/Sunday.
df['weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

df.info()

In [None]:
# Line plot of the full hourly pickup series.
fig, ax = plt.subplots()
ax.plot(df.index, df['trip_count'])
ax.set_xlabel('Time')
ax.set_ylabel('Trips')
ax.set_title('Hourly Trip Count')
plt.show()

### Scaling & Windowing

In [None]:
# Chronological split: with shuffle=False the last 20% of the hourly rows form
# the test set, so the test period lies entirely after the training period.
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)
df_train = df_train.copy()
df_test = df_test.copy()
test_start = df_test.index[0]

scaler_y = StandardScaler()
scaler_x = StandardScaler()  # NOTE(review): created but never fitted or applied in the visible cells

# Standardise the target with statistics from the training period only, then
# apply the same transform to the test period (no test information in the scaler).
df_train[['trip_count']] = scaler_y.fit_transform(df_train[['trip_count']])
df_test[['trip_count']] = scaler_y.transform(df_test[['trip_count']])

# Build the lag features on the re-joined, already scaled series.
# Computing shift(168) on df_test alone leaves its first 168 rows without a
# lag value, and dropna() then throws away the first week of the test period,
# even though those lagged values exist at the end of the training period.
# Lags only look backwards in time, so using training-period values as lags
# for early test rows does not leak future information.
df_scaled = pd.concat([df_train, df_test]).fillna(0.0)
df_scaled['lag_24'] = df_scaled['trip_count'].shift(24)    # same hour, previous day
df_scaled['lag_168'] = df_scaled['trip_count'].shift(168)  # same hour, previous week
df_scaled = df_scaled.dropna()  # removes only the first 168 hours of the whole series

# Re-split at the original boundary.
df_train = df_scaled.loc[df_scaled.index < test_start].copy()
df_test = df_scaled.loc[df_scaled.index >= test_start].copy()
df_test.head()

In [None]:
# Overlay the scaled training and test series to visualise the split point.
fig, ax = plt.subplots()
for label, part in (('Train', df_train), ('Test', df_test)):
    ax.plot(part.index, part['trip_count'], label=label)
ax.set_xlabel('Time')
ax.set_ylabel('Scaled Trips')
ax.legend()
plt.show()

In [None]:
def create_sequences(data, window_size,
                     feature_cols=('hour', 'day_of_week', 'weekend', 'lag_24', 'lag_168'),
                     target_col='trip_count'):
    """Turn a time-ordered frame into supervised (window, features, target) samples.

    For every row position ``i >= window_size`` one sample is produced:

    * the ``window_size`` target values immediately before row ``i``,
    * the values of ``feature_cols`` at row ``i``,
    * the target value at row ``i``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in chronological order; must contain ``target_col`` and every
        column named in ``feature_cols``.
    window_size : int
        Number of preceding rows in each sequence window (must be >= 0).
    feature_cols : sequence of str, optional
        Columns taken from the target row as side features. Defaults to the
        five columns this notebook builds.
    target_col : str, optional
        Column used both for the history window and for the target.

    Returns
    -------
    x_seq : numpy.ndarray, shape (n_samples, window_size, 1)
    x_feat : numpy.ndarray, shape (n_samples, len(feature_cols))
    y : numpy.ndarray, shape (n_samples, 1)
        ``n_samples`` is ``max(len(data) - window_size, 0)``. When there are
        not enough rows the arrays are empty but keep the ranks above, so
        downstream shape handling does not need a special case.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError(f'window_size must be non-negative, got {window_size}')

    trip_counts = data[[target_col]].to_numpy()
    features = data[list(feature_cols)].to_numpy()

    n_samples = max(len(data) - window_size, 0)

    # Row k of `window_idx` holds the positions k .. k+window_size-1, i.e. the
    # history window that precedes target row k + window_size.
    window_idx = np.arange(n_samples)[:, None] + np.arange(window_size)[None, :]

    x_seq = trip_counts[window_idx]          # fancy indexing -> new array
    x_feat = features[window_size:].copy()   # copy so callers never alias `data`
    y = trip_counts[window_size:].copy()

    return x_seq, x_feat, y

# Each sample sees the previous 24 hourly values of the (scaled) trip count.
window_size = 24

x_seq_train, x_feat_train, y_train = create_sequences(df_train, window_size)
x_seq_test, x_feat_test, y_test = create_sequences(df_test, window_size)

# Print the (sequence, feature, target) shapes: train on the first line, test on the second.
for split_arrays in (
    (x_seq_train, x_feat_train, y_train),
    (x_seq_test, x_feat_test, y_test),
):
    print(*(arr.shape for arr in split_arrays))

### Model Construction

In [None]:
# Set LOAD_SAVED_MODEL to True to load the previously saved forecaster from disk.
# It is off by default so that a fresh "Run All" does not fail when the file is
# absent and does not load a model that the construction cell below replaces.
# When loading, skip the model-construction and training cells that follow,
# because they reassign and retrain `model_multivar`.
LOAD_SAVED_MODEL = False
if LOAD_SAVED_MODEL:
    model_multivar = tf.keras.models.load_model('../models/taxi_demand_forecaster_1h.keras')

In [None]:
# Fix the Python, NumPy and TensorFlow seeds so weight initialisation and
# batch shuffling, and therefore the reported metrics, are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Two-branch network:
#   * sequence branch - the last `window_size` scaled trip counts, fed to an LSTM
#   * feature branch  - the 5 side features of the target hour, fed to a Dense layer
input_sequence = Input(shape=(window_size, 1), name='sequence_input')
input_features = Input(shape=(5,), name='feature_input')

lstm = LSTM(50, activation='relu')(input_sequence)
dense = Dense(10, activation='relu')(input_features)

# Merge both branches and regress a single value (the scaled trip count).
combined = concatenate([lstm, dense])

output = Dense(1)(combined)

model_multivar = Model(inputs=[input_sequence, input_features], outputs=output)
model_multivar.compile(optimizer='adam', loss='mse')
model_multivar.summary()

In [None]:
# Stop once the validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Validation uses the last 10% of the training samples (Keras takes the
# validation slice from the end of the arrays, before shuffling), i.e. the
# most recent training hours. The test arrays are deliberately NOT passed here:
# if early stopping and best-weight restoration were driven by the test set,
# the test metrics reported afterwards would be optimistically biased.
history_multivar = model_multivar.fit(
    {'sequence_input': x_seq_train, 'feature_input': x_feat_train},
    y_train,
    epochs=200,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping]
)

In [None]:
# Uncomment the line below to save the trained model to disk.
#model_multivar.save('../models/new_model.keras')

### Model Results

In [None]:
# Predict on the test windows; inputs are bound by layer name, as in fit().
test_inputs = {'sequence_input': x_seq_test, 'feature_input': x_feat_test}
y_pred_scaled = model_multivar.predict(test_inputs)

# Undo the target standardisation so predictions and ground truth are in trips per hour.
y_pred, y_true = (
    scaler_y.inverse_transform(scaled)
    for scaled in (y_pred_scaled, y_test)
)

In [None]:
# Error metrics on the original (unscaled) trip counts.
mean_true = np.mean(y_true)

mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)

# Errors expressed relative to the mean observed trip count (and its square for MSE).
rmse_percentage = (rmse / mean_true) * 100
mse_percentage = (mse / (mean_true ** 2)) * 100

r2 = r2_score(y_true, y_pred)

print(f'RMSE: {rmse}')
print(f'RMSE Percentage: {rmse_percentage:.2f}%')
print(f'MSE: {mse}')
print(f'MSE Percentage: {mse_percentage:.2f}%')
print(f'R-squared: {r2}')

In [None]:
# Learning curves per epoch. The second curve is Keras' `val_loss`, i.e. the
# loss on whatever data was supplied for validation during fit(), so it is
# labelled as validation loss rather than tied to one particular dataset.
fig, ax = plt.subplots()
ax.plot(history_multivar.history['loss'], label='Train Loss')
ax.plot(history_multivar.history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Model Loss')
ax.legend()
plt.show()

In [None]:
# Actual vs. predicted hourly trip counts over the test windows.
fig, ax = plt.subplots()
ax.plot(y_true, color='blue', label='Actual Data')
ax.plot(y_pred, color='orange', alpha=0.7, label='Predicted Data')
ax.set_xlabel('Time Steps')
ax.set_ylabel('Trip Count')
ax.set_title('Model Predictions')
ax.legend()
plt.show()