In [1]:
# load libraries
import pandas as pd
import os
import numpy as np
import datetime
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from torch import save

In [2]:
# Attach Google Drive to the Colab filesystem so the data folders become reachable.
_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [14]:
# Project root on Google Drive: edit this so it points to the Meteotrentino folder.
wk = "/content/drive/My Drive/10. Meteotrentino/"

# Sub-directory (relative to the root) that holds the validated CSV exports.
subfolder = "Dati csv convalidati/"

# Full path of the dataset processed by this notebook.
dataset_file = "umidità_A.csv"
data_path = os.path.join(wk, subfolder, dataset_file)

In [15]:
# Derive the dataset name from the file name (the part before "_A") and map it to
# the column of the station table that flags the corresponding sensor.
selected_dataset = os.path.basename(data_path).split('_A')[0]

# (lower bound, station-table column) pairs in DESCENDING lexicographic order.
# The first bound that the dataset name is >= to wins, so a name belongs to the
# closest bound at or below it. The comparison is inclusive on purpose: a file
# stem that is exactly "temperatura", "pioggia", "radiazione", ... must select
# its own sensor instead of falling through to the next (wrong) branch.
# "umidita" is spelled without the accent so that both "umidita" and "umidità"
# sort at or above it.
_SENSOR_LOWER_BOUNDS = (
    ("vel", "vento"),
    ("umidita", "umidità"),
    ("temperatura", "temperatura"),
    ("radiazione", "radiazione"),
    ("pioggia", "pioggia"),
    ("direzione", "vento"),
)

# Anything sorting below every bound falls back to the snow-height sensor.
dataset_sensor = next(
    (sensor for bound, sensor in _SENSOR_LOWER_BOUNDS if selected_dataset >= bound),
    "Altezza neve",
)

In [16]:
# Load the raw sensor readings. The file is ';'-separated and has no header row,
# so the column names are supplied here and the dtypes are keyed by those names
# (a dtype mapping keyed by '0'..'4' does not match the integer labels pandas
# assigns when header=None). Passing `names=` to read_csv also avoids
# `set_axis(..., inplace=...)`, whose `inplace` keyword no longer exists in pandas 2.x.
# Daniele: the validation code is kept so we can train on validated data
# (no need of classes, just distinguish between good and bad data).
# NOTE(review): `validation_code` is now really parsed as int, so a missing code
# in the file raises instead of silently turning the column into floats.
df_row_data = pd.read_csv(
    data_path,
    sep=";",        # not a comma-separated file, the separator is ';'
    header=None,    # the file has no header row
    names=['label', 'date', 'time', 'value', 'validation_code'],
    dtype={
        'label': str,
        'date': str,
        'time': str,
        'value': np.float64,
        'validation_code': int,
    },
)

# Combine the separate date and time strings into a single timestamp column.
df_row_data['datetime'] = pd.to_datetime(
    df_row_data['date'] + " " + df_row_data['time'],
    format='%d/%m/%Y %H:%M',
)
df_row_data

Unnamed: 0,label,date,time,value,validation_code,datetime
0,T0010,01/01/2014,00:00,98.0,1,2014-01-01 00:00:00
1,T0010,01/01/2014,00:15,98.0,1,2014-01-01 00:15:00
2,T0010,01/01/2014,00:30,98.0,1,2014-01-01 00:30:00
3,T0010,01/01/2014,00:45,98.0,1,2014-01-01 00:45:00
4,T0010,01/01/2014,01:00,98.0,1,2014-01-01 01:00:00
...,...,...,...,...,...,...
8588959,T0469,31/12/2021,22:45,73.0,145,2021-12-31 22:45:00
8588960,T0469,31/12/2021,23:00,75.0,145,2021-12-31 23:00:00
8588961,T0469,31/12/2021,23:15,74.0,145,2021-12-31 23:15:00
8588962,T0469,31/12/2021,23:30,69.0,145,2021-12-31 23:30:00


In [6]:
# Excel sheet listing every station and the sensors installed on it. The path is
# resolved under the project root `wk`, so pointing `wk` at another copy of the
# Meteotrentino folder relocates this input as well.
station_path = os.path.join(
    wk,
    "Informazioni sulle stazioni",
    "lista stazioni con sensori_aggiornato al 06072022.xlsx",
)

In [7]:
# Read the station inventory spreadsheet into a DataFrame.
df_station = pd.read_excel(io=station_path)

In [8]:
# Drop the row labelled 118, which the original note identifies as the "total" line
# of the spreadsheet (it is a row, not a column). Dropping by label with
# errors="ignore" keeps the cell idempotent: re-running it is a no-op instead of
# removing whichever row has moved into position 118 (or raising IndexError).
df_station = df_station.drop(index=118, errors="ignore")

In [9]:
# Display the station table: one row per station, with "x" marking the sensor columns it has.
df_station

Unnamed: 0,codice,nome,temperatura,pioggia,vento,pressione,umidità,termometro acqua,pettini neve,bagnatura,Altezza neve,radiazione,livello acqua,temperatura superficiale neve,temperatura terreno,catena termometrica
0,T0009,Centa S. Nicolò,x,x,,,,,,,,,,,,
1,T0010,Levico Terme,x,x,x,x,x,,,,,x,,,,
2,T0014,Telve (Pontarso),x,x,,,,,,,,,,,,
3,T0015,Bieno,x,x,,,,,,,,,,,,
4,T0024,Passo Cereda,x,x,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,T0454,Trento (Liceo Galilei),x,x,,x,,,,,,,,,,
114,T0469,Castello Tesino (Le parti),x,x,x,x,x,,,,,x,,,,
115,T0473,Ghiacciaio Careser,x,,x,x,x,,,,x,x,,,,
116,T0474,Ghiacciaio Mandrone,x,,x,x,x,,,,x,x,,,,


In [17]:
# Codes of the stations whose row marks the selected sensor column with "x".
has_sensor = df_station[dataset_sensor] == "x"
dataset_station_vector = df_station.loc[has_sensor, "codice"].tolist()

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
  """Torch dataset pairing row-standardized input windows with their targets.

  Each row of `df_train` is one sample. Its numeric columns (everything except
  'sensor_code' and 'in_datetime') are standardized with that row's own mean and
  standard deviation (the std is floored at 1). The matching row of `df_test`
  supplies:
    * the target: every column except 'sensor_code', 'in_datetime' and the
      'val_<i>' columns, standardized with the SAME per-row statistics as the
      features;
    * the labels: the raw 'val_<i>' columns for i in range(n_previsions).

  `df_train` and `df_test` must therefore be row-aligned and of equal length.
  """

  def __init__(self, df_train, df_test, transform = None, target_transform = None, n_previsions = None):
    """Build the feature, target and label tensors.

    Args:
      df_train: DataFrame with 'sensor_code', 'in_datetime' and the feature columns.
      df_test: DataFrame with 'sensor_code', 'in_datetime', 'val_0'..'val_{n-1}'
        and the target columns.
      transform: stored on the instance; not applied by __getitem__.
      target_transform: stored on the instance; not applied by __getitem__.
      n_previsions: number of 'val_<i>' columns in `df_test`. When omitted, the
        module-level global `n_previsions` is used, as older callers rely on.

    Raises:
      NameError: if `n_previsions` is neither passed nor defined globally.
    """
    if n_previsions is None:
      # Backward compatibility: fall back to the notebook-level global.
      try:
        n_previsions = globals()["n_previsions"]
      except KeyError:
        raise NameError(
            "name 'n_previsions' is not defined; pass n_previsions=... to CustomDataset"
        ) from None

    meta_cols = ['sensor_code','in_datetime']
    label_cols = [f"val_{idx}" for idx in range(n_previsions)]

    self.features = torch.FloatTensor(df_train.drop(meta_cols,axis=1).values)
    ### standardize the data (statistics are computed per row, i.e. per sample)
    means = self.features.mean(dim=1, keepdim=True)
    # Floor the std at 1 so near-constant rows are not blown up by the division.
    stds = torch.clamp(self.features.std(dim=1, keepdim=True),min=1)
    self.features = (self.features - means) / stds

    # The target shares the scale of its input row, so it reuses that row's statistics.
    self.target = torch.FloatTensor(df_test.drop(meta_cols + label_cols,axis=1).values)
    self.target = (self.target - means) / stds

    self.sensor_code = df_train['sensor_code'].values
    self.in_datetime = df_train['in_datetime'].values

    # Labels are kept raw (not standardized).
    self.labels = torch.FloatTensor(df_test[label_cols].values)
    self.transform = transform
    self.target_transform = target_transform

  def __len__(self):
    """Return the number of samples (rows of `df_train`)."""
    return len(self.sensor_code)

  def __getitem__(self, idx):
    """Return sample `idx` as a dict with keys
    'sensor_code', 'date', 'features', 'target' and 'labels'."""
    return {
        "sensor_code": self.sensor_code[idx],
        "date": self.in_datetime[idx],
        "features": self.features[idx],
        "target": self.target[idx],
        "labels": self.labels[idx],
    }

In [None]:
# Import the project-local `data_splitter` module and reload it, so that edits made
# to the file are picked up without restarting the kernel.
from importlib import reload  # Python 3.4+
import sys
sys.path.append("/content/drive/MyDrive")  # lets Python find modules stored in this Drive folder
import data_splitter
data_splitter = reload(data_splitter)
from data_splitter import *  # NOTE(review): star import; the code visible here only calls split_df_for_TSF
from torch import save,load


In [None]:
# Build, save and wrap the train/test datasets of every station that has the sensor.
PERIOD = 24          # forwarded to split_df_for_TSF; presumably the input window length - TODO confirm
PREDICTION = 1       # forwarded to split_df_for_TSF; presumably the forecast horizon - TODO confirm
TRAIN_FRACTION = 0.8 # share of the samples used for training
SPLIT_SEED = 42      # fixed seed so the train/test partition is reproducible across runs

# Settings that do not depend on the station.
splits = ['train','test']
shuffle = {'train':True,'test':False}

for station in dataset_station_vector:
  df_single_station = df_row_data[df_row_data['label'] == station].reset_index(drop=True)
  if df_single_station.empty:
    # Some stations are listed with the sensor but have no rows in the data file.
    continue

  df = df_single_station
  # n_previsions is also read as a global by CustomDataset.__init__.
  df_train,df_prediction,n_previsions = split_df_for_TSF(df, PERIOD, PREDICTION)
  # NOTE(review): the same file name is used for every station, so only the last
  # station's predictions remain on disk after the loop.
  df_prediction.to_csv("prediction.csv")
  dataset = {}

  # Split the samples into training and test sets (to check generalization).
  # The index values of df_train are used as positions into df_prediction, which
  # assumes both frames share a 0..n-1 index - TODO confirm in data_splitter.
  training_features = df_train.sample(frac=TRAIN_FRACTION, random_state=SPLIT_SEED)
  training_target = df_prediction.iloc[training_features.index.values]
  test_features = df_train.drop(training_features.index)
  test_target = df_prediction.iloc[test_features.index]

  training_features = training_features.reset_index(drop=True)
  training_target = training_target.reset_index(drop=True)
  test_features = test_features.reset_index(drop=True)
  test_target = test_target.reset_index(drop=True)

  dataset['train'] = CustomDataset(training_features, training_target)
  dataset['test'] = CustomDataset(test_features, test_target)
  save(dataset['train'], station + "_train_"+ selected_dataset + "_tsf.pt")
  save(dataset['test'], station + "_test_"+ selected_dataset + "_tsf.pt")

  # The whole test set is served as a single batch; the floor of 1 keeps
  # DataLoader from rejecting batch_size=0 when the test split is empty.
  b_size = {'train':64,
            'test':max(len(dataset['test']), 1)}

  dataloader = {split: DataLoader(dataset=dataset[split],
                                  batch_size=b_size[split],
                                  shuffle=shuffle[split],
                                  collate_fn=lambda batch: batch,  # keep each batch as a list of sample dicts
                                  drop_last=True)
                for split in splits}