In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from datetime import date
import holidays
from datetime import datetime

In [2]:
features = ["temperature", "rain", "month", "day_of_month", "day_of_week", "is_national_holiday", "is_school_holiday"]
target = "count"

In [3]:
location = "vrsic"

In [4]:
hikers_data = pd.read_csv(f"../data/hikers/test/{location}.csv")
weather_data = pd.read_csv(f"../data/weather/aggregated/{location}.csv")

In [5]:
# List of national holidays
national_holidays = holidays.SI()

In [6]:
# List of school holidays and code to check if a date is holiday

school_holidays = [["2021-12-25", "2022-01-02"], ["2022-02-21", "2022-02-25"], ["2022-02-28", "2022-03-04"], ["2022-04-27", "2022-05-02"], ["2022-06-25", "2022-08-31"], ["2022-10-31", "2022-11-04"], ["2022-12-26", "2023-01-02"], ["2023-01-30", "2023-02-01"], ["2023-02-06", "2023-02-10"], ["2023-04-27", "2023-05-02"], ["2023-06-26", "2023-08-31"],["2023-10-30", "2023-11-3"], ["2023-12-25", "2024-01-02"],["2024-02-19", "2024-02-23"], ["2024-02-26", "2023-03-01"], ["2024-4-27", "2023-05-02"], ["2024-06-26", "2024-08-31"]]
school_holidays = [[datetime.strptime(start, "%Y-%m-%d"), datetime.strptime(end, "%Y-%m-%d")] for start, end in school_holidays]

def is_school_holiday(date):
    for start, end in school_holidays:
        if start <= date <= end:
            return True
    return False

In [7]:
# Calculate our features and prepare data

def prepare_data(row):
    dt = datetime.fromisoformat(row["datetime"])
    row["month"] = dt.month
    row["day_of_month"] = dt.day
    row["day_of_week"] = dt.weekday()
    row["is_national_holiday"] = dt in national_holidays
    row["is_school_holiday"] = is_school_holiday(dt)

    if not (counts := hikers_data.loc[hikers_data["datum"] == row["datetime"]]).empty:
        row["count"] = counts["vhodi"].values[0] + counts["izhodi"].values[0]

    return row

data = weather_data.apply(prepare_data, axis=1).dropna()

In [8]:
data

Unnamed: 0,count,datetime,day_of_month,day_of_week,is_national_holiday,is_school_holiday,location,month,rain,temperature
195,352.0,2022-07-15,15,4,False,True,vrsic,7,0.454167,14.934028
196,677.0,2022-07-16,16,5,False,True,vrsic,7,0.364583,13.847917
197,1565.0,2022-07-17,17,6,False,True,vrsic,7,0.000000,13.490278
198,954.0,2022-07-18,18,0,False,True,vrsic,7,0.000000,14.661111
199,1016.0,2022-07-19,19,1,False,True,vrsic,7,0.000000,16.581250
...,...,...,...,...,...,...,...,...,...,...
815,0.0,2024-03-26,26,1,False,False,vrsic,3,0.106250,-0.842361
816,0.0,2024-03-27,27,2,False,False,vrsic,3,0.837500,2.068056
817,0.0,2024-03-28,28,3,False,False,vrsic,3,0.756250,-0.504861
818,0.0,2024-03-29,29,4,False,False,vrsic,3,0.091667,2.828472


In [9]:
# Prepare x and y data
x_data = data[features]
y_data = data[target]

In [10]:
x_data

Unnamed: 0,temperature,rain,month,day_of_month,day_of_week,is_national_holiday,is_school_holiday
195,14.934028,0.454167,7,15,4,False,True
196,13.847917,0.364583,7,16,5,False,True
197,13.490278,0.000000,7,17,6,False,True
198,14.661111,0.000000,7,18,0,False,True
199,16.581250,0.000000,7,19,1,False,True
...,...,...,...,...,...,...,...
815,-0.842361,0.106250,3,26,1,False,False
816,2.068056,0.837500,3,27,2,False,False
817,-0.504861,0.756250,3,28,3,False,False
818,2.828472,0.091667,3,29,4,False,False


In [11]:
y_data

195     352.0
196     677.0
197    1565.0
198     954.0
199    1016.0
        ...  
815       0.0
816       0.0
817       0.0
818       0.0
819       0.0
Name: count, Length: 414, dtype: float64

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

In [13]:
# Normalize features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [14]:
# Convert to PyTorch tensors
x_train_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
x_test_tensor = torch.tensor(x_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [15]:
class HikerPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(HikerPredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [16]:
# Instantiate the model
input_size = len(features)
hidden_size = 64
output_size = 1
model = HikerPredictor(input_size, hidden_size, output_size)

In [20]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
# Load our model
model.load_state_dict(torch.load(f"../models/{location}-03.pth"))
model.eval()

HikerPredictor(
  (fc1): Linear(in_features=7, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

In [18]:
# Evaluate on test set
with torch.no_grad():
    model.eval()
    test_predictions = model(x_test_tensor).numpy()

In [19]:
# Calculate test RMSE (Root Mean Squared Error)
rmse = ((test_predictions - y_test.values) ** 2).mean() ** 0.5
print(f"Test RMSE: {rmse:.2f}")

Test RMSE: 1840.92


In [20]:
predictions = model(x_test_tensor)
mse = mean_squared_error(y_test_tensor.detach().numpy(), predictions.detach().numpy())
print(f'MSE: {mse}')

MSE: 2795175.5


In [21]:
# Test the model
testing_date = "2023-08-15"
testing_data = data.loc[data["datetime"] == testing_date]

testing_parameters = scaler.transform(testing_data[features])

with torch.no_grad():
    input_features = torch.tensor([testing_parameters], dtype=torch.float32)
    predicted_count = model(input_features)

  input_features = torch.tensor([testing_parameters], dtype=torch.float32)


In [22]:
testing_data

Unnamed: 0,count,datetime,day_of_month,day_of_week,is_national_holiday,is_school_holiday,location,month,rain,temperature
591,2011.0,2023-08-15,15,1,True,True,vrsic,8,0.0,16.128472


In [23]:
predicted_count

tensor([[[632.9374]]])

In [25]:
# Generate for each date
predictions = []

for _, neki in data.iterrows():
    neki2 = data.loc[data["datetime"] == neki["datetime"]]
    parameters = scaler.transform(neki2[features])
    with torch.no_grad():
        input_features = torch.tensor([parameters], dtype=torch.float32)
        predicted_count = model(input_features)
        predictions.append(f"{neki['datetime']},{float(predicted_count)}\n")

print(predictions)

with open(f"../predictions/{location}2.csv", "w") as file:
    file.writelines(predictions)

['2022-07-15,2249.64697265625\n', '2022-07-16,1041.20166015625\n', '2022-07-17,2039.493408203125\n', '2022-07-18,1950.912353515625\n', '2022-07-19,2265.013671875\n', '2022-07-20,2682.65869140625\n', '2022-07-21,3032.37841796875\n', '2022-07-22,2970.36083984375\n', '2022-07-23,2806.1748046875\n', '2022-07-24,1669.33740234375\n', '2022-07-25,2876.40478515625\n', '2022-07-26,1679.24951171875\n', '2022-07-27,1628.97802734375\n', '2022-07-28,2180.77587890625\n', '2022-07-29,1493.9130859375\n', '2022-07-30,905.8256225585938\n', '2022-07-31,1701.61376953125\n', '2022-08-01,4157.869140625\n', '2022-08-02,4514.9541015625\n', '2022-08-03,3797.578369140625\n', '2022-08-04,3857.47265625\n', '2022-08-05,3320.0625\n', '2022-08-06,2566.75244140625\n', '2022-08-07,1630.7159423828125\n', '2022-08-08,3640.0966796875\n', '2022-08-09,3167.36767578125\n', '2022-08-10,2745.537353515625\n', '2022-08-11,2632.911376953125\n', '2022-08-12,2164.286865234375\n', '2022-08-13,2196.2841796875\n', '2022-08-14,2478.39