In [194]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import date
import holidays
from datetime import datetime

In [149]:
features = ["temperature", "rain", "month", "day_of_month", "day_of_week", "is_national_holiday", "is_school_holiday"]
target = "count"

In [104]:
location = "vrsic"

In [105]:
hikers_data = pd.read_csv(f"../data/hikers/test/{location}.csv")
weather_data = pd.read_csv(f"../data/weather/aggregated/{location}.csv")

In [106]:
# Cut off hikers data at 2024
hikers_data_cut = hikers_data[hikers_data["datum"] < "2024-01-01"]

In [107]:
# List of national holidays
national_holidays = holidays.SI()

In [108]:
# List of school holidays and code to check if a date is holiday

school_holidays = [["2021-12-25", "2022-01-02"], ["2022-02-21", "2022-02-25"], ["2022-02-28", "2022-03-04"], ["2022-04-27", "2022-05-02"], ["2022-06-25", "2022-08-31"], ["2022-10-31", "2022-11-04"], ["2022-12-26", "2023-01-02"], ["2023-01-30", "2023-02-01"], ["2023-02-06", "2023-02-10"], ["2023-04-27", "2023-05-02"], ["2023-06-26", "2023-08-31"],["2023-10-30", "2023-11-3"], ["2023-12-25", "2024-01-02"],["2024-02-19", "2024-02-23"], ["2024-02-26", "2023-03-01"], ["2024-4-27", "2023-05-02"], ["2024-06-26", "2024-08-31"]]
school_holidays = [[datetime.strptime(start, "%Y-%m-%d"), datetime.strptime(end, "%Y-%m-%d")] for start, end in school_holidays]

def is_school_holiday(date):
    for start, end in school_holidays:
        if start <= date <= end:
            return True
    return False

In [146]:
# Calculate our features and prepare data

def prepare_data(row):
    dt = datetime.fromisoformat(row["datetime"])
    row["month"] = dt.month
    row["day_of_month"] = dt.day
    row["day_of_week"] = dt.weekday()
    row["is_national_holiday"] = dt in national_holidays
    row["is_school_holiday"] = is_school_holiday(dt)

    if not (counts := hikers_data_cut.loc[hikers_data_cut["datum"] == row["datetime"]]).empty:
        row["count"] = counts["vhodi"].values[0] + counts["izhodi"].values[0]

    return row

data = weather_data.apply(prepare_data, axis=1).dropna()

In [147]:
data

Unnamed: 0,count,datetime,day_of_month,day_of_week,is_national_holiday,is_school_holiday,location,month,rain,temperature
195,352.0,2022-07-15,15,4,False,True,vrsic,7,0.454167,14.934028
196,677.0,2022-07-16,16,5,False,True,vrsic,7,0.364583,13.847917
197,1565.0,2022-07-17,17,6,False,True,vrsic,7,0.000000,13.490278
198,954.0,2022-07-18,18,0,False,True,vrsic,7,0.000000,14.661111
199,1016.0,2022-07-19,19,1,False,True,vrsic,7,0.000000,16.581250
...,...,...,...,...,...,...,...,...,...,...
725,7.0,2023-12-27,27,2,False,True,vrsic,12,0.000000,1.984722
726,0.0,2023-12-28,28,3,False,True,vrsic,12,0.000000,-2.121528
727,25.0,2023-12-29,29,4,False,True,vrsic,12,0.000000,-1.415972
728,34.0,2023-12-30,30,5,False,True,vrsic,12,0.000000,-0.792361


In [150]:
# Prepare x and y data
x_data = data[features]
y_data = data[target]

In [151]:
x_data

Unnamed: 0,temperature,rain,month,day_of_month,day_of_week,is_national_holiday,is_school_holiday
195,14.934028,0.454167,7,15,4,False,True
196,13.847917,0.364583,7,16,5,False,True
197,13.490278,0.000000,7,17,6,False,True
198,14.661111,0.000000,7,18,0,False,True
199,16.581250,0.000000,7,19,1,False,True
...,...,...,...,...,...,...,...
725,1.984722,0.000000,12,27,2,False,True
726,-2.121528,0.000000,12,28,3,False,True
727,-1.415972,0.000000,12,29,4,False,True
728,-0.792361,0.000000,12,30,5,False,True


In [152]:
y_data

195     352.0
196     677.0
197    1565.0
198     954.0
199    1016.0
        ...  
725       7.0
726       0.0
727      25.0
728      34.0
729       6.0
Name: count, Length: 324, dtype: float64

In [153]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

In [154]:
# Normalize features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [155]:
# Convert to PyTorch tensors
x_train_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
x_test_tensor = torch.tensor(x_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [160]:
class HikerPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(HikerPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [161]:
# Instantiate the model
input_size = len(features)
hidden_size = 64
output_size = 1
model = HikerPredictor(input_size, hidden_size, output_size)

In [162]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [189]:
# Training loop
num_epochs = 100000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(x_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

In [190]:
# Evaluate on test set
with torch.no_grad():
    model.eval()
    test_predictions = model(x_test_tensor).numpy()

In [191]:
# Calculate test RMSE (Root Mean Squared Error)
rmse = ((test_predictions - y_test.values) ** 2).mean() ** 0.5
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 1270.76


In [198]:
predictions = model(x_test_tensor)
mse = mean_squared_error(y_test_tensor.detach().numpy(), predictions.detach().numpy())
print(f'MSE: {mse}')

MSE: 903973.25


In [192]:
# Test the model
testing_date = "2023-08-15"
testing_data = data.loc[data["datetime"] == testing_date]

testing_parameters = scaler.transform(testing_data[features])

with torch.no_grad():
    input_features = torch.tensor([testing_parameters], dtype=torch.float32)
    predicted_count = model(input_features)

In [193]:
predicted_count

tensor([[[1845.3959]]])