In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import date
import holidays
from datetime import datetime

In [3]:
features = ["temperature", "rain", "month", "day_of_month", "day_of_week", "is_national_holiday", "is_school_holiday"]
target = "count"

In [4]:
location = "vrsic"

In [5]:
hikers_data = pd.read_csv(f"../data/hikers/test/{location}.csv")
weather_data = pd.read_csv(f"../data/weather/aggregated/{location}.csv")

In [6]:
# Cut off hikers data at 2024
hikers_data_cut = hikers_data[hikers_data["datum"] < "2024-01-01"]

In [7]:
# List of national holidays
national_holidays = holidays.SI()

In [8]:
# List of school holidays and code to check if a date is holiday

school_holidays = [["2021-12-25", "2022-01-02"], ["2022-02-21", "2022-02-25"], ["2022-02-28", "2022-03-04"], ["2022-04-27", "2022-05-02"], ["2022-06-25", "2022-08-31"], ["2022-10-31", "2022-11-04"], ["2022-12-26", "2023-01-02"], ["2023-01-30", "2023-02-01"], ["2023-02-06", "2023-02-10"], ["2023-04-27", "2023-05-02"], ["2023-06-26", "2023-08-31"],["2023-10-30", "2023-11-3"], ["2023-12-25", "2024-01-02"],["2024-02-19", "2024-02-23"], ["2024-02-26", "2023-03-01"], ["2024-4-27", "2023-05-02"], ["2024-06-26", "2024-08-31"]]
school_holidays = [[datetime.strptime(start, "%Y-%m-%d"), datetime.strptime(end, "%Y-%m-%d")] for start, end in school_holidays]

def is_school_holiday(date):
    for start, end in school_holidays:
        if start <= date <= end:
            return True
    return False

In [9]:
# Calculate our features and prepare data

def prepare_data(row):
    dt = datetime.fromisoformat(row["datetime"])
    row["month"] = dt.month
    row["day_of_month"] = dt.day
    row["day_of_week"] = dt.weekday()
    row["is_national_holiday"] = dt in national_holidays
    row["is_school_holiday"] = is_school_holiday(dt)

    if not (counts := hikers_data_cut.loc[hikers_data_cut["datum"] == row["datetime"]]).empty:
        row["count"] = counts["vhodi"].values[0] + counts["izhodi"].values[0]

    return row

data = weather_data.apply(prepare_data, axis=1).dropna()

In [10]:
data

Unnamed: 0,count,datetime,day_of_month,day_of_week,is_national_holiday,is_school_holiday,location,month,rain,temperature
195,352.0,2022-07-15,15,4,False,True,vrsic,7,0.454167,14.934028
196,677.0,2022-07-16,16,5,False,True,vrsic,7,0.364583,13.847917
197,1565.0,2022-07-17,17,6,False,True,vrsic,7,0.000000,13.490278
198,954.0,2022-07-18,18,0,False,True,vrsic,7,0.000000,14.661111
199,1016.0,2022-07-19,19,1,False,True,vrsic,7,0.000000,16.581250
...,...,...,...,...,...,...,...,...,...,...
725,7.0,2023-12-27,27,2,False,True,vrsic,12,0.000000,1.984722
726,0.0,2023-12-28,28,3,False,True,vrsic,12,0.000000,-2.121528
727,25.0,2023-12-29,29,4,False,True,vrsic,12,0.000000,-1.415972
728,34.0,2023-12-30,30,5,False,True,vrsic,12,0.000000,-0.792361


In [11]:
# Prepare x and y data
x_data = data[features]
y_data = data[target]

In [12]:
x_data

Unnamed: 0,temperature,rain,month,day_of_month,day_of_week,is_national_holiday,is_school_holiday
195,14.934028,0.454167,7,15,4,False,True
196,13.847917,0.364583,7,16,5,False,True
197,13.490278,0.000000,7,17,6,False,True
198,14.661111,0.000000,7,18,0,False,True
199,16.581250,0.000000,7,19,1,False,True
...,...,...,...,...,...,...,...
725,1.984722,0.000000,12,27,2,False,True
726,-2.121528,0.000000,12,28,3,False,True
727,-1.415972,0.000000,12,29,4,False,True
728,-0.792361,0.000000,12,30,5,False,True


In [13]:
y_data

195     352.0
196     677.0
197    1565.0
198     954.0
199    1016.0
        ...  
725       7.0
726       0.0
727      25.0
728      34.0
729       6.0
Name: count, Length: 324, dtype: float64

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

In [15]:
# Normalize features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [16]:
# Convert to PyTorch tensors
x_train_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
x_test_tensor = torch.tensor(x_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [18]:
class HikerPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(HikerPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [19]:
# Instantiate the model
input_size = len(features)
hidden_size = 64
output_size = 1
model = HikerPredictor(input_size, hidden_size, output_size)

In [20]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
# Training loop
num_epochs = 1000000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(x_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    if epoch % 100 == 0:
        print(f"Epoch {epoch}: {loss}")

    loss.backward()
    optimizer.step()

Epoch 0: 970148.875
Epoch 100: 965349.3125
Epoch 200: 951938.75
Epoch 300: 924764.75
Epoch 400: 884205.0
Epoch 500: 832439.8125
Epoch 600: 772771.625
Epoch 700: 708968.4375
Epoch 800: 644128.125
Epoch 900: 581064.1875
Epoch 1000: 522258.4375
Epoch 1100: 469283.09375
Epoch 1200: 423308.71875
Epoch 1300: 384567.53125
Epoch 1400: 352703.375
Epoch 1500: 326880.21875
Epoch 1600: 306056.375
Epoch 1700: 289098.21875
Epoch 1800: 275055.1875
Epoch 1900: 263125.9375
Epoch 2000: 252889.421875
Epoch 2100: 243990.09375
Epoch 2200: 236151.015625
Epoch 2300: 229201.1875
Epoch 2400: 223016.34375
Epoch 2500: 217568.28125
Epoch 2600: 212772.8125
Epoch 2700: 208523.4375
Epoch 2800: 204770.34375
Epoch 2900: 201455.875
Epoch 3000: 198553.546875
Epoch 3100: 196049.5625
Epoch 3200: 193853.90625
Epoch 3300: 191914.0
Epoch 3400: 190183.734375
Epoch 3500: 188590.078125
Epoch 3600: 187137.421875
Epoch 3700: 185755.6875
Epoch 3800: 184448.0
Epoch 3900: 183206.6875
Epoch 4000: 182009.828125
Epoch 4100: 180859.75
E

In [22]:
# Evaluate on test set
with torch.no_grad():
    model.eval()
    test_predictions = model(x_test_tensor).numpy()

In [23]:
# Calculate test RMSE (Root Mean Squared Error)
rmse = ((test_predictions - y_test.values) ** 2).mean() ** 0.5
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 1283.75


In [24]:
predictions = model(x_test_tensor)
mse = mean_squared_error(y_test_tensor.detach().numpy(), predictions.detach().numpy())
print(f'MSE: {mse}')

MSE: 801975.125


In [25]:
# Test the model
testing_date = "2023-08-15"
testing_data = data.loc[data["datetime"] == testing_date]

testing_parameters = scaler.transform(testing_data[features])

with torch.no_grad():
    input_features = torch.tensor([testing_parameters], dtype=torch.float32)
    predicted_count = model(input_features)

  input_features = torch.tensor([testing_parameters], dtype=torch.float32)


In [26]:
testing_data

Unnamed: 0,count,datetime,day_of_month,day_of_week,is_national_holiday,is_school_holiday,location,month,rain,temperature
591,2011.0,2023-08-15,15,1,True,True,vrsic,8,0.0,16.128472


In [27]:
predicted_count

tensor([[[1694.9517]]])

In [28]:
# Generate for each date
predictions = []

for _, neki in data.iterrows():
    neki2 = data.loc[data["datetime"] == neki["datetime"]]
    parameters = scaler.transform(neki2[features])
    with torch.no_grad():
        input_features = torch.tensor([parameters], dtype=torch.float32)
        predicted_count = model(input_features)
        predictions.append(f"{neki['datetime']},{float(predicted_count)}\n")

print(predictions)

with open(f"../predictions/{location}.csv", "w") as file:
    file.writelines(predictions)

['2022-07-15,352.0003356933594\n', '2022-07-16,677.0003662109375\n', '2022-07-17,1565.0025634765625\n', '2022-07-18,321.2597351074219\n', '2022-07-19,1015.9926147460938\n', '2022-07-20,889.7272338867188\n', '2022-07-21,1327.937744140625\n', '2022-07-22,1826.660888671875\n', '2022-07-23,1854.0013427734375\n', '2022-07-24,1241.310302734375\n', '2022-07-25,1417.9990234375\n', '2022-07-26,610.0004272460938\n', '2022-07-27,1134.999755859375\n', '2022-07-28,1730.0029296875\n', '2022-07-29,789.9996948242188\n', '2022-07-30,1066.0009765625\n', '2022-07-31,2675.5078125\n', '2022-08-01,880.9730834960938\n', '2022-08-02,946.0014038085938\n', '2022-08-03,1816.99755859375\n', '2022-08-04,1499.00048828125\n', '2022-08-05,1207.000244140625\n', '2022-08-06,-243.34970092773438\n', '2022-08-07,580.0000610351562\n', '2022-08-08,2120.2197265625\n', '2022-08-09,1501.18896484375\n', '2022-08-10,1888.99755859375\n', '2022-08-11,1817.002197265625\n', '2022-08-12,580.9993286132812\n', '2022-08-13,1363.00024414

In [29]:
# Save the model
torch.save(model.state_dict(), f"../models/{location}-03.pth")