In [19]:
# load libraries
import pandas as pd
import os
import numpy as np
import datetime
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from torch import save

In [2]:
# mount drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# change me, have to point to the Meteotrentino drive folder
wk =  "/content/drive/My Drive/10. Meteotrentino/"

# folder for the csv
subfolder = "CSVs/"

# select the filename, make me dynamic
selected_dataset = "temperature.csv"

In [4]:
data_path = wk+subfolder+selected_dataset

In [5]:
df_row_data = pd.read_csv(
  data_path,
  sep = ";", # not a CSV, separator is ';'
  header = None, # no header
  dtype = {'0' : str, '1': str, '2': str, '3' : np.float64}
  ) 



In [6]:
# define a header 
df_row_data = df_row_data.set_axis(
    ['label', 'date', 'time', 'value'],
    axis = 1,
    inplace = False

)

In [7]:
df_row_data['datetime'] = pd.to_datetime(df_row_data['date']+" "+df_row_data['time'], format = '%d/%m/%Y %H:%M')


In [8]:
# econde the date as integer
df_row_data['datetime_enconded'] = np.int64(df_row_data['datetime'].astype(int)/10**9)
df_row_data

  


Unnamed: 0,label,date,time,value,datetime,datetime_enconded
0,T0009,01/01/2014,00:00,-2.1,2014-01-01 00:00:00,1388534400
1,T0009,01/01/2014,00:15,-2.3,2014-01-01 00:15:00,1388535300
2,T0009,01/01/2014,00:30,-2.2,2014-01-01 00:30:00,1388536200
3,T0009,01/01/2014,00:45,-2.3,2014-01-01 00:45:00,1388537100
4,T0009,01/01/2014,01:00,-2.5,2014-01-01 01:00:00,1388538000
...,...,...,...,...,...,...
30579354,T0994,31/12/2021,22:45,9.8,2021-12-31 22:45:00,1640990700
30579355,T0994,31/12/2021,23:00,9.0,2021-12-31 23:00:00,1640991600
30579356,T0994,31/12/2021,23:15,9.4,2021-12-31 23:15:00,1640992500
30579357,T0994,31/12/2021,23:30,9.6,2021-12-31 23:30:00,1640993400


In [65]:
# split the dataframe by date

date_start = df_row_data['datetime'].min()

date_end = df_row_data['datetime'].max()


# period
period_split = 24 #hours

current_data = date_start

df_row_data['period_index'] = [-1]*len(df_row_data['datetime'])



i = 0
while current_data < date_end:

  offset_data = current_data+ pd.offsets.Hour(period_split)
  
  
  df_row_data['period_index'][(df_row_data['datetime'] < offset_data) & (df_row_data['datetime'] >= current_data)] = i

  current_data = offset_data
  i = i+1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [78]:
df_row_data

Unnamed: 0,label,date,time,value,datetime,datetime_enconded,period_index
0,T0009,01/01/2014,00:00,-2.1,2014-01-01 00:00:00,1388534400,0
1,T0009,01/01/2014,00:15,-2.3,2014-01-01 00:15:00,1388535300,0
2,T0009,01/01/2014,00:30,-2.2,2014-01-01 00:30:00,1388536200,0
3,T0009,01/01/2014,00:45,-2.3,2014-01-01 00:45:00,1388537100,0
4,T0009,01/01/2014,01:00,-2.5,2014-01-01 01:00:00,1388538000,0
...,...,...,...,...,...,...,...
30579354,T0994,31/12/2021,22:45,9.8,2021-12-31 22:45:00,1640990700,2921
30579355,T0994,31/12/2021,23:00,9.0,2021-12-31 23:00:00,1640991600,2921
30579356,T0994,31/12/2021,23:15,9.4,2021-12-31 23:15:00,1640992500,2921
30579357,T0994,31/12/2021,23:30,9.6,2021-12-31 23:30:00,1640993400,2921


In [84]:
# create pythorc class for the data

class CustomDataset(Dataset):
  def __init__(self, df, transform = None, target_transform = None):
    self.features = df['value']
    self.labels = df['label']
    self.date = df['datetime']
    self.parameter = df['datetime_enconded']
    self.transform = transform
    self.target_transform = target_transform

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    label = self.labels.iloc[idx]
    parameter = self.parameter.iloc[idx]
    value = self.features.iloc[idx]
    date = self.date.iloc[idx]

    return {"label": label, "parameter": parameter, "value":value, "date":date}


In [88]:
data_class_arr = []
for pk in np.unique(df_row_data['period_index']):

  df_subset = df_row_data[df_row_data['period_index'] == pk]

  data_class = CustomDataset(df_subset)

  data_class_arr.append(data_class)

<__main__.CustomDataset object at 0x7f594ede9590>


{'label': 'T0009',
 'parameter': 1389312000,
 'value': 2.4,
 'date': Timestamp('2014-01-10 00:00:00')}

In [None]:
# save the classes in the typical pytorch class format

for i, class_d in enumerate(data_class_arr):
  
  save(class_d, wk + subfolder + "split_by_period/"+str(period_split)+"h/"+str(selected_dataset[:-4])+"/split_"+str(i)+".pt")
