In [54]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [55]:
def _read_indexed_csv(path):
    """Read a CSV using its first column as the row index, parsed as datetimes."""
    table = pd.read_csv(path, index_col=[0])
    table.index = pd.to_datetime(table.index)
    return table


# One table per source file; all three are indexed by timestamp.
spec = _read_indexed_csv('spec.csv')
sgx = _read_indexed_csv('sgx.csv')
ref = _read_indexed_csv('reference.csv')

# Side-by-side merge aligned on the timestamp index: reference columns first,
# then SGX, then SPEC. Timestamps missing from one table show up as NaN there.
merged = pd.concat([ref, sgx, spec], axis=1)
# Re-align the merged frame onto its own index labels, filling with NaN.
data = merged.reindex(merged.index, fill_value=np.nan)

data.head()

Unnamed: 0,REF-O3,REF-CO,REF-NO2,REF-SO2,REF-PM2.5,REF-PM10,REF-RH,REF-AMB_TEMP,SGX-O3,SGX-CO,SGX-NO2,SGX-SO2,SPEC-O3,SPEC-CO,SPEC-NO2,SPEC-SO2
2023-01-01 00:00:00,38.7,0.23,4.8,1.0,14.0,14.0,86.0,18.4,,,,,,,,
2023-01-01 01:00:00,38.3,0.22,5.3,0.4,9.0,13.0,87.0,18.6,,,,,,,,
2023-01-01 02:00:00,38.3,0.22,4.4,0.2,6.0,8.0,89.0,18.5,,,,,,,,
2023-01-01 03:00:00,38.3,0.21,4.1,0.1,8.0,3.0,88.0,18.6,,,,,,,,
2023-01-01 04:00:00,38.2,0.2,3.9,0.5,5.0,4.0,88.0,18.9,,,,,,,,


In [56]:
# Input columns first, with 'REF-O3' appended last: create_sequences takes the
# last row of the transposed array as the label.
feature_cols = ['SPEC-O3', 'SGX-CO', 'SGX-NO2', 'SPEC-SO2', 'REF-AMB_TEMP', 'REF-RH']
target_col = 'REF-O3'
columns = feature_cols + [target_col]

# Keep only rows where every selected column is present, then replace every
# value in the frame (all columns, not just the selected ones) by its absolute value.
data = data.dropna(subset=columns).abs()
data.head()

Unnamed: 0,REF-O3,REF-CO,REF-NO2,REF-SO2,REF-PM2.5,REF-PM10,REF-RH,REF-AMB_TEMP,SGX-O3,SGX-CO,SGX-NO2,SGX-SO2,SPEC-O3,SPEC-CO,SPEC-NO2,SPEC-SO2
2023-01-07 11:00:00,48.1,0.34,9.5,0.6,16.0,38.0,46.0,20.6,5.410172,7.202759,34.592069,183.857241,2.386834,5.747358,14.257812,71.975528
2023-01-07 12:00:00,49.7,0.33,9.2,0.6,13.0,41.0,44.0,21.3,2.405,6.976,41.545333,211.573833,2.239583,6.233724,14.397786,80.398763
2023-01-07 13:00:00,49.7,0.32,8.8,0.5,17.0,40.0,43.0,21.5,3.231833,6.804167,46.232833,243.2225,2.301432,6.930339,15.050456,90.367839
2023-01-07 14:00:00,49.7,0.33,8.4,0.7,19.0,42.0,47.0,20.9,13.816833,10.936333,29.967,198.117667,2.664388,5.748698,14.05599,75.003255
2023-01-07 15:00:00,50.4,0.35,9.3,0.7,27.0,52.0,49.0,20.1,9.133167,10.4035,28.534833,162.663667,2.438151,5.3125,12.535807,66.230469


In [57]:
def create_sequences(data, window_len, dates, use_consecutive=False):
    """Slice a (rows x time steps) array into sliding windows with one label each.

    Every row of ``data`` except the last is used as model input; the last row
    supplies the label. Window ``i`` covers columns ``i .. i + window_len - 1``.
    Its label is the last-row value at the window's final column and its date
    is ``dates[i + window_len - 1]``.

    Parameters
    ----------
    data : np.ndarray
        2-D array indexed as ``data[row, time_step]``.
    window_len : int
        Number of time steps per window; must be at least 1.
    dates : sequence
        One entry per column of ``data``. It only has to be datetime-like when
        ``use_consecutive`` is True.
    use_consecutive : bool, default False
        When True, skip every window whose dates are not spaced exactly one
        hour apart.

    Returns
    -------
    xs : np.ndarray
        Shape ``(n_windows, n_rows - 1, window_len)``.
    ys : np.ndarray
        Shape ``(n_windows, 1)``.
    new_dates : list
        Date of the final time step of each kept window.

    When no window qualifies (fewer columns than ``window_len``, or every
    window contains a gap), correctly shaped empty arrays and an empty list
    are returned instead of failing inside ``np.stack``.

    Raises
    ------
    ValueError
        If ``window_len`` is smaller than 1.
    """
    if window_len < 1:
        raise ValueError("window_len must be >= 1, got {}".format(window_len))

    # Zero or negative when there are fewer columns than window_len -> no windows.
    n_windows = data.shape[1] - window_len + 1

    # gap_breaks[k] = number of non-hourly steps among dates[0..k]. A window is
    # gap-free exactly when the count is the same at its first and last step,
    # which replaces a Python loop over every pair of dates in every window.
    gap_breaks = None
    if use_consecutive:
        stamps = pd.DatetimeIndex(dates)
        not_hourly = np.diff(stamps.values) != np.timedelta64(1, 'h')
        gap_breaks = np.concatenate(([0], np.cumsum(not_hourly)))

    xs = []
    ys = []
    new_dates = []
    for start in range(n_windows):
        end = start + window_len - 1
        if gap_breaks is not None and gap_breaks[end] != gap_breaks[start]:
            continue  # at least one step inside this window is not exactly 1 hour
        xs.append(data[:-1, start:end + 1])
        ys.append(data[-1:, end])
        new_dates.append(dates[end])

    if not xs:
        # np.stack rejects an empty list; hand back empty arrays of the shapes
        # callers would otherwise receive.
        n_features = max(data.shape[0] - 1, 0)
        empty_x = np.empty((0, n_features, window_len), dtype=data.dtype)
        empty_y = np.empty((0, 1), dtype=data.dtype)
        return empty_x, empty_y, new_dates

    return np.stack(xs), np.stack(ys), new_dates

In [58]:
# Split the timestamps, in index order, into n_batches roughly equal chunks.
n_batches = 10
dates = data.index
batches = np.array_split(dates, n_batches)
window_len = 24  # time steps per input window

# Source domain: chunks 0 and 1 merged, written to '0.pt'.
source = np.concatenate(batches[:2])
source_data = data.loc[source, columns]

# Min-max scaling, fitted on the source chunks only and reused unchanged for
# every target chunk below.
scaler = MinMaxScaler().fit(source_data)
source_data = scaler.transform(source_data)

# create_sequences expects time steps along axis 1, hence the transpose.
X_source, y_source, source = create_sequences(
    np.transpose(source_data), window_len, source, use_consecutive=False
)
train_0 = {'samples': X_source, 'labels': y_source}
torch.save(train_0, '0.pt')

# Target domains: chunks 2 .. n_batches - 1, each written to '<chunk index>.pt'.
for i, batch_dates in enumerate(batches[2:], start=2):
    target_data = scaler.transform(data.loc[batch_dates, columns])
    X_target, y_target, target = create_sequences(
        np.transpose(target_data), window_len, batch_dates, use_consecutive=False
    )
    train = {'samples': X_target, 'labels': y_target}
    torch.save(train, '{}.pt'.format(i))
    

In [59]:
# '0.pt' stores NumPy arrays rather than tensors. From PyTorch 2.6 torch.load
# defaults to weights_only=True, whose restricted unpickler rejects them, so
# full unpickling is requested explicitly. This is acceptable only because the
# file is written by this notebook; never do it for untrusted files.
# (The weights_only argument requires PyTorch >= 1.13.)
torch.load('0.pt', weights_only=False)['samples'].shape

(785, 6, 24)

In [60]:
# '0.pt' stores NumPy arrays rather than tensors. From PyTorch 2.6 torch.load
# defaults to weights_only=True, whose restricted unpickler rejects them, so
# full unpickling is requested explicitly. This is acceptable only because the
# file is written by this notebook; never do it for untrusted files.
# (The weights_only argument requires PyTorch >= 1.13.)
torch.load('0.pt', weights_only=False)['labels'].shape

(785, 1)