In [None]:
import tensorflow as tf
import os
import shutil

# Fetch the UCI Bike Sharing archive (cached relative to the current
# directory) and let Keras unpack it.
dataset_url = "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip"
path_to_zip = tf.keras.utils.get_file(
    "bike_sharing_dataset.zip",
    dataset_url,
    cache_dir=".",
    extract=True,
)
extracted_dir = os.path.splitext(path_to_zip)[0] + "_extracted"

# The copy below expects the unpacked files under
# datasets/bike_sharing_dataset_extracted; hour.csv is duplicated next to it
# as datasets/hour.csv.
data_dir = "datasets"
source_csv = os.path.join(data_dir, "bike_sharing_dataset_extracted", "hour.csv")
target_csv = os.path.join(data_dir, "hour.csv")

os.makedirs(data_dir, exist_ok=True)
shutil.copy(source_csv, target_csv)

print(f"Plik hour.csv został skopiowany do {target_csv}")

In [None]:
import pandas as pd

df = pd.read_csv('datasets/bike_sharing_dataset_extracted/hour.csv')

# Combine the date string and the zero-padded hour into one timestamp and use
# it as the index of the frame.
hour_str = df['hr'].astype(str).str.zfill(2)
df['datetime'] = pd.to_datetime(df['dteday'] + ' ' + hour_str, format='%Y-%m-%d %H')
df = df.set_index('datetime')

# Covered time span.
first_ts, last_ts = df.index.min(), df.index.max()
print((first_ts, last_ts))

# Rows missing relative to a complete hourly grid spanning one 365-day and one
# 366-day year.
expected_rows = (365 + 366) * 24
print(expected_rows - len(df))

In [None]:
# Build the complete hourly grid between the first and last observation and
# list the timestamps that have no row in the data.
# The lower-case 'h' alias is used: upper-case 'H' is deprecated in recent
# pandas releases, and the resampling code in this notebook already uses '1h'.
full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='h')
missing = full_index.difference(df.index)
print(missing)
print(df.columns)

In [None]:
# Re-index the data onto a regular 1-hour grid. Each group of columns gets its
# own strategy for the hours that are absent from the raw file:
#   * calendar / weather-category flags -> carry the last known value forward
#   * continuous weather measurements   -> linear interpolation
#   * rental counts                     -> 0 (presumably no rentals were
#                                          recorded in those hours - confirm)
# Columns are selected *before* resampling so that the string column `dteday`
# is never passed through ffill()/interpolate(), and so that each column is
# resampled only once.
flag_cols = ['holiday', 'weekday', 'workingday', 'weathersit']
measure_cols = ['temp', 'atemp', 'hum', 'windspeed']
count_cols = ['casual', 'registered', 'cnt']

resampled = pd.concat(
    [
        df[flag_cols].resample(rule='1h').ffill(),
        df[measure_cols].resample(rule='1h').interpolate(method="linear"),
        df[count_cols].resample(rule='1h').asfreq(fill_value=0),
    ],
    axis=1,
)
# print(df.loc["2011-01-02 00:00:00":"2011-01-02 10:00:00"])
print(resampled.loc["2011-01-02 03:00:00":"2011-01-02 7:00:00"])

In [None]:
# From here on `df` refers to the resampled hourly frame (same object as
# `resampled`, not a copy).
df = resampled
# Number of non-null entries per column.
df.count()

In [None]:
# Summary statistics before rescaling. Printed explicitly: a bare expression
# that is not the last statement of a cell is evaluated but never shown.
print(df[['casual', 'registered', 'cnt', 'weathersit']].describe())

# Bring the targets and the weather category to a small numeric range:
# counts are expressed in thousands, weathersit is divided by 4
# (presumably its maximum category value - confirm against the data).
# NOTE: this modifies `df` in place, so re-running the cell rescales again.
count_cols = ['casual', 'registered', 'cnt']
df[count_cols] = df[count_cols] / 1e3
df['weathersit'] = df['weathersit'] / 4

# First two weeks at hourly resolution.
df_2weeks = df[:24 * 7 * 2]
df_2weeks[['casual', 'registered', 'cnt', 'temp']].plot(figsize=(10, 3))

# NOTE: despite its name, `df_daily` holds weekly ('W') means.
df_daily = df.resample('W').mean()
df_daily[['casual', 'registered', 'cnt', 'temp']].plot(figsize=(10, 3))

In [None]:
import numpy as np
import pickle

# Undo the earlier /1e3 scaling of `cnt` so the baseline errors are expressed
# in the same units as the MAE values multiplied by 10**3 elsewhere.
df['cnt_scaled'] = df['cnt'] * 10**3

# Naive forecasts: repeat the value observed 24 hours / 7 days earlier.
cnt_pred_day = df['cnt_scaled'].shift(24)
cnt_pred_week = df['cnt_scaled'].shift(24 * 7)

# Mean absolute error of each naive forecast, rounded to two decimals.
# The leading rows, where the shifted series is NaN, are skipped by mean().
mae_daily = round((df['cnt_scaled'] - cnt_pred_day).abs().mean(), 2)
mae_weekly = round((df['cnt_scaled'] - cnt_pred_week).abs().mean(), 2)

# Persist both baselines as a (daily, weekly) tuple.
with open("mae_baseline.pkl", "wb") as baseline_file:
    pickle.dump((mae_daily, mae_weekly), baseline_file)

print(f"MAE daily: {mae_daily}, MAE weekly: {mae_weekly}")

In [None]:
# Chronological split of the (scaled) total count: everything up to the end
# of June 2012 is used for training, the remainder for validation.
cnt_train = df.loc['2011-01-01 00:00':'2012-06-30 23:00', 'cnt']
cnt_valid = df.loc['2012-07-01 00:00':, 'cnt']

# Length of one input window: 24 hourly observations.
seq_len = 24

# The targets are the same series offset by `seq_len`, so every window is
# paired with the observation that immediately follows it.
train_ds = tf.keras.utils.timeseries_dataset_from_array(
    data=cnt_train.to_numpy(),
    targets=cnt_train[seq_len:],
    sequence_length=seq_len,
    batch_size=32,
    shuffle=True,
    seed=42,
)

# Validation windows keep their chronological order (no shuffling).
valid_ds = tf.keras.utils.timeseries_dataset_from_array(
    data=cnt_valid.to_numpy(),
    targets=cnt_valid[seq_len:],
    sequence_length=seq_len,
    batch_size=32,
)

In [None]:
import keras 

def build_model(learning_rate):
    """Build and compile a linear forecaster.

    The network is a single ``Dense(1)`` layer applied to a flat window of
    ``seq_len`` values (``seq_len`` is read from the notebook's globals).

    Args:
        learning_rate: Initial learning rate for SGD (momentum 0.9).

    Returns:
        The compiled model (Huber loss, MAE metric).
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(seq_len,)))
    model.add(tf.keras.layers.Dense(1))

    optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    model.compile(optimizer=optimizer, loss=keras.losses.Huber(), metrics=['mae'])
    return model

def build_model_LSTM(learning_rate):
    """Build and compile a recurrent forecaster made of a single LSTM unit.

    The one-unit LSTM output is used directly as the prediction; inputs are
    sequences of arbitrary length with one feature per step.

    Args:
        learning_rate: Initial learning rate for SGD (momentum 0.9).

    Returns:
        The compiled model (Huber loss, MAE metric).
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=[None, 1]))
    model.add(tf.keras.layers.LSTM(1))

    optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    model.compile(optimizer=optimizer, loss=keras.losses.Huber(), metrics=['mae'])
    return model

def build_model_RNN(learning_rate):
    """Build and compile a one-layer recurrent forecaster.

    Despite the function name, the recurrent layer is an ``LSTM`` with 32
    units; a ``Dense(1)`` head turns its final state into the prediction.

    Args:
        learning_rate: Initial learning rate for SGD (momentum 0.9).

    Returns:
        The compiled model (Huber loss, MAE metric).
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=[None, 1]))
    model.add(tf.keras.layers.LSTM(32))
    model.add(tf.keras.layers.Dense(1))

    optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    model.compile(optimizer=optimizer, loss=keras.losses.Huber(), metrics=['mae'])
    return model

def build_model_DEEP_RNN(learning_rate):
    """Build and compile a stacked recurrent forecaster.

    Three 32-unit ``LSTM`` layers are stacked (the first two return full
    sequences so the next layer receives one vector per time step), followed
    by a ``Dense(1)`` output layer.

    Args:
        learning_rate: Initial learning rate for SGD (momentum 0.9).

    Returns:
        The compiled model (Huber loss, MAE metric).
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=[None, 1]),
        tf.keras.layers.LSTM(32, return_sequences=True),
        tf.keras.layers.LSTM(32, return_sequences=True),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(1),
    ])

    optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    model.compile(optimizer=optimizer, loss=keras.losses.Huber(), metrics=['mae'])
    return model

In [None]:
# Short name (used in the output file names) -> model factory.
models = {
    "linear": build_model,
    "rnn1": build_model_LSTM,
    "rnn32": build_model_RNN,
    "rnn_deep": build_model_DEEP_RNN,
}

for name, builder in models.items():
    lr = 0.1
    # A new callback per model: the learning rate is multiplied by 0.3 after
    # 3 epochs without improvement in val_loss, never going below 1e-6.
    lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.3,
        patience=3,
        verbose=1,
        min_lr=1e-6,
    )

    model = builder(lr)
    model.fit(
        train_ds,
        epochs=20,
        validation_data=valid_ds,
        verbose=0,
        callbacks=[lr_schedule],
    )

    # evaluate() returns [loss, mae]; only the MAE is kept.
    valid_loss, valid_mae = model.evaluate(valid_ds)
    model.save(f'model_{name}.keras')

    # Store the MAE as a 1-tuple, multiplied by 10**3 to undo the /1e3
    # scaling applied to `cnt`.
    with open(f"mae_{name}.pkl", "wb") as mae_file:
        pickle.dump((valid_mae * 10**3, ), mae_file)

In [None]:
# I don't understand why it shouldn't be done this way, given that it yields a lower MAE, but OK.
# cnt_train = df[['workingday', 'weathersit', 'atemp']]['2011-01-01 00:00':'2012-06-30 23:00']
# cnt_train_y = df[['cnt']]['2011-01-01 00:00':'2012-06-30 23:00']
# cnt_valid = df[['workingday', 'weathersit', 'atemp']]['2012-07-01 00:00':]
# cnt_valid_y = df[['cnt']]['2012-07-01 00:00':]

# seq_len = 1 * 24
# train_ds = tf.keras.utils.timeseries_dataset_from_array(
#             cnt_train.to_numpy(),
#             targets=cnt_train_y.to_numpy()[seq_len:],
#             sequence_length=seq_len,
#             batch_size=32,
#             shuffle=True,
#             seed=42
#             )

# valid_ds = tf.keras.utils.timeseries_dataset_from_array(
#             cnt_valid.to_numpy(),
#             targets=cnt_valid_y.to_numpy()[seq_len:],
#             sequence_length=seq_len,
#             batch_size=32
#             )

In [None]:
# Multivariate inputs: the past count plus three explanatory variables.
feature_cols = ['cnt', 'workingday', 'weathersit', 'atemp']
cnt_train = df[feature_cols]['2011-01-01 00:00':'2012-06-30 23:00']
cnt_valid = df[feature_cols]['2012-07-01 00:00':]

seq_len = 1 * 24

# The target must be the `cnt` column only. Passing the whole feature frame
# as `targets` yields 4-dimensional targets; the single-output model is then
# fitted (and its MAE computed) against a broadcast of all four columns
# instead of the rental count. Targets are offset by `seq_len` so that each
# window is paired with the count that immediately follows it.
train_ds = tf.keras.utils.timeseries_dataset_from_array(
            cnt_train.to_numpy(),
            targets=cnt_train['cnt'].to_numpy()[seq_len:],
            sequence_length=seq_len,
            batch_size=32,
            shuffle=True,
            seed=42
            )

valid_ds = tf.keras.utils.timeseries_dataset_from_array(
            cnt_valid.to_numpy(),
            targets=cnt_valid['cnt'].to_numpy()[seq_len:],
            sequence_length=seq_len,
            batch_size=32
            )

In [None]:
import keras 

def build_model_deep_rnn_multi(learning_rate):
    """Build and compile the multivariate recurrent forecaster.

    The model reads sequences with 4 features per time step, passes them
    through a single 32-unit ``LSTM`` layer and maps the result to one output
    with ``Dense(1)``.

    Args:
        learning_rate: Initial learning rate for SGD (momentum 0.9).

    Returns:
        The compiled model (Huber loss, MAE metric).
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=[None, 4]),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(1),
    ])

    optimizer = keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)
    model.compile(optimizer=optimizer, loss=keras.losses.Huber(), metrics=['mae'])
    return model

lr = 0.1
# Multiply the learning rate by 0.3 after 3 epochs without improvement in
# val_loss, never going below 1e-6.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.3,
    patience=3,
    verbose=1,
    min_lr=1e-6,
)

model = build_model_deep_rnn_multi(lr)
model.fit(
    train_ds,
    epochs=20,
    validation_data=valid_ds,
    verbose=0,
    callbacks=[lr_schedule],
)

# evaluate() returns [loss, mae]; only the MAE is kept.
valid_loss, mae = model.evaluate(valid_ds)

model.save('model_rnn_mv.keras')

# Store the MAE as a 1-tuple, multiplied by 10**3 to undo the /1e3 scaling
# applied to `cnt`.
with open("mae_rnn_mv.pkl", "wb") as mae_file:
    pickle.dump((mae * 10 **3, ), mae_file)