In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from numpy import zeros, newaxis
from sklearn.preprocessing import MinMaxScaler
from common.data_loader import *
from common.preprocessing import*

# `os` and `sys` are used as modules further down (os.environ, os.makedirs,
# sys.path). They are imported after the star imports so that nothing exported
# by `common.*` can shadow them; the original relative order of the other
# imports is kept for the same reason.
import os
import sys
from os import path, environ
import matplotlib.pyplot as plt
from pytz import timezone

## Transformer

## load data

In [2]:
# Load dataset
# `full_data` holds the per-turbine measurements; `submission` is the template
# whose `Patv` column is overwritten with predictions at the end of the notebook.
full_data = pd.read_csv("data/data5_gi.csv")
submission = pd.read_csv("data/sample_submission.csv")
# Column names, dtypes and memory footprint of the raw frame.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4228623 entries, 0 to 4228622
Data columns (total 15 columns):
 #   Column   Dtype  
---  ------   -----  
 0   TurbID   int64  
 1   Day      int64  
 2   Tmstamp  object 
 3   Wspd     float64
 4   Wdir     float64
 5   Etmp     float64
 6   Itmp     float64
 7   Ndir     float64
 8   Pab1     float64
 9   Pab2     float64
 10  Pab3     float64
 11  Prtv     float64
 12  Patv     float64
 13  Date     object 
 14  Time     int64  
dtypes: float64(10), int64(3), object(2)
memory usage: 483.9+ MB


In [3]:
# Preview the raw frame (the notebook display shows only the first and last rows).
full_data

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv,Date,Time
0,1,1,00:00,6.17,-3.99,30.73,41.80,25.92,1.00,1.00,1.00,-0.25,494.66,2021-01-01 00:00:00,1
1,1,1,00:10,6.17,-3.99,30.73,41.80,25.92,1.00,1.00,1.00,-0.25,494.66,2021-01-01 00:10:00,2
2,1,1,00:20,6.27,-2.18,30.60,41.63,20.91,1.00,1.00,1.00,-0.24,509.76,2021-01-01 00:20:00,3
3,1,1,00:30,6.42,-0.73,30.52,41.52,20.91,1.00,1.00,1.00,-0.26,542.53,2021-01-01 00:30:00,4
4,1,1,00:40,6.25,0.89,30.49,41.38,20.91,1.00,1.00,1.00,-0.23,509.36,2021-01-01 00:40:00,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4228618,134,222,23:10,4.58,-4.41,-0.99,3.37,194.37,0.01,0.01,0.01,-51.69,238.01,2021-08-10 23:10:00,31964
4228619,134,222,23:20,3.74,0.88,-1.05,3.16,187.96,0.01,0.01,0.01,-31.13,172.71,2021-08-10 23:20:00,31965
4228620,134,222,23:30,3.54,0.34,-1.21,2.92,187.96,0.03,0.03,0.03,-58.67,157.63,2021-08-10 23:30:00,31966
4228621,134,222,23:40,4.46,4.13,-1.24,2.81,191.76,0.03,0.03,0.03,-56.66,244.92,2021-08-10 23:40:00,31967


## preprocess

In [4]:
# Both helpers come from the project's `common` star imports; their exact
# behaviour is defined outside this notebook.
full_data = feature_engineering(full_data, compute_Pmax_method ='clipping', compute_Pmax_clipping=False)
# NOTE(review): presumably the second argument (0) is the sentinel written for
# unusable rows, matching the value CondLoss is later told to ignore — confirm.
full_data = marking_data(full_data, 0)

* Data name: Feature engineering
  - Number of data: 4228623
  - Number of nan rows: 864


## Feature selection

In [5]:
## n to patv prediction
# NOTE(review): the next cell reassigns both `features` and
# `full_data_selected`, so on a top-to-bottom run this selection is discarded.
features = ['TurbID', 'Day', 'Tmstamp','PatvMA3','WspdMA3', 'RPMMA3','Patv5','Wspd5', 'RPM5','locX', 'locY','DayX', 'DayY', 'WdirX', 'WdirY','Wspd', 'Wspd_cube','Etmp_abs','Pab1', 'Pab2', 'Pab3', 'Bspd1', 'Bspd3', 'Bspd2','Prtv','Patv']
full_data_selected = full_data[features]

In [6]:
## n to wspd prediction, then calculate patv from the predicted wind speed
# `Wspd` is listed last; later cells take the final feature column as the
# target (presumably make_train_val_test_data preserves column order — confirm).
features = ['TurbID', 'Day', 'Tmstamp','WspdMA3','Wspd5', 'locX', 'locY','DayX', 'DayY','Weekday', 'WdirX', 'WdirY', 'Wspd_cube','Etmp_abs','Wspd']
full_data_selected = full_data[features]

In [7]:
# Keep only rows whose `Day` lies in the inclusive range 6..222.
data = full_data_selected[(full_data_selected['Day']>=6)&(full_data_selected['Day']<=222)]
data

Unnamed: 0,TurbID,Day,Tmstamp,WspdMA3,Wspd5,locX,locY,DayX,DayY,Weekday,WdirX,WdirY,Wspd_cube,Etmp_abs,Wspd
720,1,6,00:00,10.235417,6.17,3000.0,6000.0,0.994678,0.103033,6,-0.958770,-0.284183,229.130355,264.20,6.12
721,1,6,00:10,10.226273,6.17,3000.0,6000.0,0.994678,0.103033,6,-0.960050,-0.279829,339.998645,264.05,6.98
722,1,6,00:20,10.222338,6.27,3000.0,6000.0,0.994678,0.103033,6,-0.945916,-0.324413,722.854363,263.95,8.99
723,1,6,00:30,10.215301,6.42,3000.0,6000.0,0.994678,0.103033,6,-0.929455,-0.368936,613.844440,263.90,8.50
724,1,6,00:40,10.208796,6.25,3000.0,6000.0,0.994678,0.103033,6,-0.902886,-0.429881,477.512047,263.81,7.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4228618,134,222,23:10,4.089606,6.90,0.0,6500.0,-0.779190,-0.626788,5,-0.984929,-0.172961,95.221129,242.16,4.58
4228619,134,222,23:20,4.091481,6.80,0.0,6500.0,-0.779190,-0.626788,5,-0.988121,-0.153676,52.295116,242.10,3.74
4228620,134,222,23:30,4.092755,6.75,0.0,6500.0,-0.779190,-0.626788,5,-0.989526,-0.144356,44.359521,241.94,3.54
4228621,134,222,23:40,4.096065,7.28,0.0,6500.0,-0.779190,-0.626788,5,-0.961789,-0.273791,88.027195,241.91,4.46


## make dataset

In [8]:
# Window length in days: input and output windows both span SEQ_LEN*144 steps.
SEQ_LEN = 2
# Durations expressed in 10-minute samples (Tmstamp advances in 10-minute steps).
week = 144*7
day = 144
hour = 6
ten_minute = 1
# NOTE(review): the results are unpacked as (val, train, test). The helper's own
# log labels the 2540-window split "Train" and the 23790-window split
# "Validation", while the shapes printed after scaling show train_x holding
# 23790 windows. Together with test_size=0.9 this ordering therefore appears to
# assign the larger split to training on purpose — confirm against the return
# order of make_train_val_test_data.
val_x, val_y, train_x, train_y, test_x = make_train_val_test_data(data, in_seq_len=SEQ_LEN*144, out_seq_len=SEQ_LEN*144, stride=day, shuffle=False, test_size=0.9)

100%|██████████| 134/134 [00:14<00:00,  8.99it/s]


* Data Split
  - Train data(X, y)     : (2540, 288, 12) (2540, 288, 12)
  - Validation data(X, y): (23790, 288, 12) (23790, 288, 12)
  - Test data(X)         : (134, 288, 12)


In [9]:
# Fit the min-max scaler on the training inputs only, then push every split
# through it. The scaler works on 2-D data, so each (windows, steps, features)
# array is flattened to (rows, features) and restored to the window layout.
n_features = train_x.shape[2]
window_shape = train_x.shape[1:]

scaler = MinMaxScaler()
scaler.fit(train_x.reshape(-1, n_features))

train_x, train_y, val_x, val_y, test_x = (
    scaler.transform(split.reshape(-1, n_features)).reshape(-1, *window_shape)
    for split in (train_x, train_y, val_x, val_y, test_x)
)

print("Train data(X, y):", train_x.shape, train_y.shape)
print("Validation data(X, y):", val_x.shape, val_y.shape)
print("Test data(X):", test_x.shape)

Train data(X, y): (23790, 288, 12) (23790, 288, 12)
Validation data(X, y): (2540, 288, 12) (2540, 288, 12)
Test data(X): (134, 288, 12)


In [10]:
# Number of trailing feature columns the model has to predict.
output_shape = 1
# Slicing with a range keeps the feature axis, so the targets stay 3-D
# (windows, steps, output_shape) without any further reshaping.
train_y = train_y[..., -output_shape:]
val_y = val_y[..., -output_shape:]

In [11]:
# Sanity check: the targets should now have a last dimension of `output_shape`.
print("Train data(X, y):", train_x.shape, train_y.shape)
print("Validation data(X, y):", val_x.shape, val_y.shape)
print("Test data(X):", test_x.shape)

Train data(X, y): (23790, 288, 12) (23790, 288, 1)
Validation data(X, y): (2540, 288, 12) (2540, 288, 1)
Test data(X): (134, 288, 12)


In [12]:
seed = 42
import random

import torch

# Seed every RNG the pipeline can touch so that weight initialisation, dropout
# masks and DataLoader shuffling repeat from run to run.
# PYTHONHASHSEED is only read at interpreter start-up, so this assignment
# affects processes launched from here on, not the running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Seeds all visible GPUs; silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed)  # type: ignore

# Deterministic cuDNN kernels only help if the autotuner is off: with
# benchmark=True cuDNN may pick a different algorithm on each run.
torch.backends.cudnn.deterministic = True  # type: ignore
torch.backends.cudnn.benchmark = False  # type: ignore

In [15]:
import datetime

import numpy as np
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
# NOTE(review): a later cell defines its own class named `Dataset`, which
# replaces the binding imported here.
from torch.utils.data import DataLoader, random_split, Dataset
from tqdm import tqdm
import seaborn as sns

# Put the local `transformer` directory on the import path; presumably this is
# where the `tst` and `src` packages imported below live — confirm.
sys.path.append('transformer')
from tst import Transformer
# NOTE(review): the training loop later assigns the batch loss tensor to the
# name `loss`, hiding this module from that point on.
from tst import loss
from src.utils import compute_loss
from src.visualization import map_plot_function, plot_values_distribution, plot_error_distribution, plot_errors_threshold, plot_visual_sample
from livelossplot import PlotLosses

# Config: seaborn default theme, and the first GPU when one is available.
sns.set()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

Using device cuda:0


In [16]:
# Training hyperparameters.
BATCH_SIZE = 32
NUM_WORKERS = 1
LR = 1e-4
# NOTE(review): EPOCHS is assigned again at the top of the training cell.
EPOCHS = 100
# Same value as the SEQ_LEN set in the dataset cell (window length in days).
SEQ_LEN = 2
# NOTE(review): not referenced by the visible cells, which pass the literal 0
# to marking_data and CondLoss instead.
MARKING = 0

# Model parameters
d_model = 512 # Latent dimension
q = 32 # Query size
v = 32 # Value size
h = 16 # Number of heads
N = 16 # Number of encoder and decoder to stack
attention_size = None # Attention window size
dropout = 0.5 # Dropout rate
pe = None # Positional encoding
chunk_mode = None

# Input/output widths are read off the prepared arrays' last dimension.
d_input = train_x.shape[-1] # From dataset
d_output = train_y.shape[-1] # From dataset

In [17]:
class CondLoss(nn.Module):
    """Regression loss that skips rows whose target equals a sentinel value.

    Args:
        loss_fn: Name of the base criterion: ``'rmse'``, ``'mse'``, ``'mae'``
            or ``'huber'``.
        marked_target_value: Sentinel; a row is excluded from the loss when the
            last feature of its target equals this value.
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.

    Raises:
        NotImplementedError: If ``loss_fn`` is not one of the supported names.
    """

    # Supported names -> torch.nn criterion class names. Resolved lazily with
    # getattr so that a criterion missing from an older torch only fails when
    # it is actually requested.
    _CRITERIA = {
        'rmse': 'MSELoss',  # the square root is applied in forward()
        'mse': 'MSELoss',
        'mae': 'L1Loss',
        'huber': 'HuberLoss',
    }

    def __init__(self, loss_fn, marked_target_value, **kwargs):
        # Initialise nn.Module before assigning any attribute.
        super().__init__(**kwargs)

        if not isinstance(loss_fn, str) or loss_fn not in self._CRITERIA:
            raise NotImplementedError(
                f"Unsupported loss_fn {loss_fn!r}; expected one of {sorted(self._CRITERIA)}"
            )

        self.marked_target_value = marked_target_value
        # torch.sqrt cannot wrap a module, so RMSE is MSE plus a square root
        # taken on the computed value.
        self._take_sqrt = loss_fn == 'rmse'
        self.loss_fn = getattr(nn, self._CRITERIA[loss_fn])()

    def forward(self,
                y_true: torch.Tensor,
                y_pred: torch.Tensor) -> torch.Tensor:
        """Return the base loss over the rows whose target is not marked.

        Args:
            y_true: Ground truth; the last axis holds the features.
            y_pred: Predictions with the same shape as ``y_true``.

        Returns:
            A scalar tensor. If every row is masked the result is a zero that
            is still attached to ``y_pred``'s graph, so ``backward()`` yields
            zero gradients instead of NaN.
        """
        assert y_true.shape == y_pred.shape, f'Shape mismatch for output and ground truth array {y_true.shape} and {y_pred.shape}'

        # Flatten every leading axis so each row is one time step.
        D = y_true.shape[-1]
        y_true = torch.reshape(y_true, (-1, D))
        y_pred = torch.reshape(y_pred, (-1, D))

        # Validity is decided on the last feature of the target only.
        idxs_valid = (y_true[:, -1] != self.marked_target_value)
        if not torch.any(idxs_valid):
            # A mean over zero elements is NaN and would corrupt the weights.
            return y_pred.sum() * 0.0

        y_true_valid, y_pred_valid = y_true[idxs_valid], y_pred[idxs_valid]
        value = self.loss_fn(y_pred_valid, y_true_valid)
        return torch.sqrt(value) if self._take_sqrt else value

In [18]:
# Build the model on the selected device together with its optimiser.
net = Transformer(d_input, d_model, d_output, q, v, h, N, attention_size=attention_size, dropout=dropout, chunk_mode=chunk_mode, pe=pe).to(device)
optimizer = optim.Adam(net.parameters(), lr=LR)
# `loss_function` (MSE over targets that are not equal to 0) is the criterion
# that is back-propagated; `mae` is only computed for reporting.
loss_function = CondLoss('mse',0 )
mae= nn.L1Loss()

In [19]:
# NOTE(review): this displays the bound method `net.modules` rather than calling
# it; the architecture is still readable because the method's repr embeds the
# module's repr.
net.modules

<bound method Module.modules of Transformer(
  (layers_encoding): ModuleList(
    (0): Encoder(
      (_selfAttention): MultiHeadAttention(
        (_W_q): Linear(in_features=512, out_features=512, bias=True)
        (_W_k): Linear(in_features=512, out_features=512, bias=True)
        (_W_v): Linear(in_features=512, out_features=512, bias=True)
        (_W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (_feedForward): PositionwiseFeedForward(
        (_linear1): Linear(in_features=512, out_features=2048, bias=True)
        (_linear2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (_layerNorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (_layerNorm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (_dopout): Dropout(p=0.5, inplace=False)
    )
    (1): Encoder(
      (_selfAttention): MultiHeadAttention(
        (_W_q): Linear(in_features=512, out_features=512, bias=True)
        (_W_k): Linear(in_features=512,

In [20]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each input window with its target window.

    The base class is named explicitly rather than through the bare name
    ``Dataset``: this class rebinds that name, so ``class Dataset(Dataset)``
    would subclass the previous definition every time the cell is re-run.

    Args:
        text: Indexable collection of inputs (the parameter name is kept for
            backward compatibility with existing callers).
        labels: Indexable collection of targets, aligned with ``text``.
    """

    def __init__(self, text, labels):
        self.labels = labels
        self.text = text

    def __len__(self):
        """Number of samples, taken from the targets."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.text[idx], self.labels[idx]

In [21]:
# Wrap the arrays as (input, target) datasets.
dataset_train = Dataset(train_x, train_y)
dataset_val = Dataset(val_x, val_y)
# No targets exist for the test split, so `test_x` is passed a second time as a
# placeholder; the prediction loop never reads it.
dataset_test = Dataset(test_x, test_x)

In [22]:
# Only the training loader shuffles.
dataloader_train = DataLoader(dataset_train,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS,
                              pin_memory=False
                             )
dataloader_val = DataLoader(dataset_val,
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             num_workers=NUM_WORKERS
                            )
# Order must be preserved here: predictions are written back to the submission
# frame by position.
dataloader_test = DataLoader(dataset_test,
                             batch_size=BATCH_SIZE,
                             shuffle=False,
                             num_workers=NUM_WORKERS
                            )

## Train

In [None]:
val_loss_best = np.inf
EPOCHS = 100
SCORE = 9999
# Prepare loss history
hist_loss = np.zeros(EPOCHS)
hist_loss_val = np.zeros(EPOCHS)
hist_loss_mae = np.zeros(EPOCHS)
hist_loss_mae_val = np.zeros(EPOCHS)
liveloss = PlotLosses()

os.makedirs('saved_models', exist_ok=True)
# Stays None until a checkpoint is written, so the final report cannot hit an
# undefined name when the validation loss never improves (e.g. it is NaN).
model_save_path = None

for idx_epoch in range(EPOCHS):
    running_loss = 0
    running_mae = 0
    # The validation pass below puts the model in eval mode; switch dropout
    # back on before training this epoch.
    net.train()
    with tqdm(total=len(dataloader_train.dataset), desc=f"[Epoch {idx_epoch+1:3d}/{EPOCHS}]") as pbar:
        for idx_batch, (x, y) in enumerate(dataloader_train):
            optimizer.zero_grad()

            # Move the targets once; both criteria below need them on `device`.
            y_device = y.to(device)

            # Propagate input
            netout = net(x.to(device))

            # Compute loss (only `loss` is back-propagated, MAE is reported)
            loss = loss_function(y_device, netout)
            loss_mae = mae(y_device, netout)

            # Backpropagate loss
            loss.backward()

            # Update weights
            optimizer.step()

            running_loss += loss.item()
            running_mae += loss_mae.item()
            pbar.set_postfix({'loss_mse': running_loss/(idx_batch+1),'mae': running_mae/(idx_batch+1) })
            pbar.update(x.shape[0])

        train_loss = running_loss/len(dataloader_train)
        train_loss_mae = running_mae/len(dataloader_train)

        # `compute_loss` is defined outside this notebook, so whether it toggles
        # the mode itself is unknown; evaluate explicitly with dropout off.
        net.eval()
        val_loss = compute_loss(net, dataloader_val, loss_function, device).item()
        val_loss_mae = compute_loss(net, dataloader_val, mae, device).item()

        pbar.set_postfix({'loss': train_loss, 'val_loss': val_loss,'mae': train_loss_mae, 'val_mae': val_loss_mae})

        hist_loss[idx_epoch] = train_loss
        hist_loss_val[idx_epoch] = val_loss
        hist_loss_mae[idx_epoch] = train_loss_mae
        hist_loss_mae_val[idx_epoch] = val_loss_mae

        # Checkpoint every time the validation loss improves.
        if val_loss < val_loss_best:
            val_loss_best = val_loss
            model_save_path = f'saved_models/model_{val_loss}.pth'
            torch.save(net.state_dict(), model_save_path)

    liveloss.update({ 'MSE': train_loss, 'val_MSE':val_loss, 'MAE': train_loss_mae, 'val_MAE':val_loss_mae})
    liveloss.send()

plt.plot(hist_loss, 'o-', label='train')
plt.plot(hist_loss_val, 'o-', label='val')
plt.title('MSE')
plt.legend()
plt.show()

if model_save_path is None:
    print("no checkpoint was saved: the validation loss never improved")
else:
    print(f"model exported to {model_save_path} with loss {val_loss_best:5f}")

## Predict

In [None]:
predictions = np.empty(shape=(len(dataloader_test.dataset), *train_y.shape[1:]))

# Run inference with dropout disabled; otherwise the forecasts are randomly
# perturbed and differ from run to run. The previous mode is restored afterwards.
# NOTE(review): `net` holds the weights of the last epoch, not those of the best
# checkpoint written during training — load that checkpoint first if intended.
was_training = net.training
net.eval()

idx_prediction = 0
with torch.no_grad():
    # The second element of each test batch is a placeholder and is ignored.
    for x, _ in tqdm(dataloader_test, total=len(dataloader_test)):
        netout = net(x.to(device)).cpu().numpy()
        predictions[idx_prediction:idx_prediction+x.shape[0]] = netout
        idx_prediction += x.shape[0]

net.train(was_training)

pred = predictions.reshape(-1,predictions.shape[2])
# The scaler was fitted on all feature columns, so left-pad with zero columns
# until the prediction sits in the last column, then undo the scaling.
pred = np.pad(pred, ((0,0),(test_x.shape[2]-output_shape,0)))
pred_inversed = scaler.inverse_transform(pred)
wspd = pred_inversed[:,-1]
## Wspd^3/Etmp_abs*(A*rou)*C
# NOTE(review): presumably 300 stands in for Etmp_abs, 807 for A*rou and 0.2
# for C in the formula above — confirm. The result is clipped to [0, 1500].
target = np.clip(wspd**3/300*807*0.2,0,1500)
pred_inversed[:,-1] = target
submission['Patv'] = pred_inversed[:,-1]

os.makedirs('output', exist_ok=True)
submission.to_csv("output/prediction_transformer.csv", index=False)