In [1]:
import random, glob, os, torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Notebook-wide settings, looked up by key in the cells below.
CFG = dict(
    WINDOW_SIZE=500,      # default number of rows per model input window
    EPOCHS=100,
    LEARNING_RATE=1e-3,
    BATCH_SIZE=128,       # batch size of the test DataLoader
    SEED=41,              # passed to seed_everything
    FOLD=5,
)

In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; the running interpreter's hash seed is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all GPUs, not just the current one; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes and may select different kernels from run to run,
    # which undermines the deterministic flag above, so it must be off.
    torch.backends.cudnn.benchmark = False

# Seed all random number generators once, up front, with the configured seed.
seed_everything(CFG['SEED'])

In [4]:
# Training files are discovered on disk; test files are listed in the
# 'data_path' column of test.csv.
# NOTE(review): glob.glob returns matches in arbitrary order, and the statistics
# cell keeps several values from whichever training file is read last, so this
# order influences the normalization constants.
train_paths = glob.glob('./train/*.csv')
test_index = pd.read_csv('./test.csv')
test_paths = test_index['data_path'].values

In [5]:
# Per-column statistics, filled in by the scan over the training files.
# 'min' starts at +inf and 'max' at -inf so the first observed value replaces
# them; the remaining entries are +inf placeholders until they are assigned.
# Key order matters: make_predict_data uses a column's position in this dict.
extremes = {
    column: {
        'min': float('inf'),
        'max': float('-inf'),
        **{stat: float('inf') for stat in ('mean', 'std', 'median', 'sum', 'q1', 'q3')},
    }
    for column in (
        'Time[s]',
        'Signal A', 'Signal B', 'Signal C',
        'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D',
    )
}

# Scan every training file and record the per-column statistics used for scaling.
# Only 'min' and 'max' are accumulated across files. Every other statistic is
# overwritten on each pass, so after the loop it describes the last file read.
# NOTE(review): confirm this matches the preprocessing the saved models were
# trained with before changing it.
for path in train_paths:
    data = pd.read_csv(path)
    for column, stats in extremes.items():
        series = data[column]
        stats.update({
            'min': min(stats['min'], series.min()),
            'max': max(stats['max'], series.max()),
            'median': series.median(),
            'mean': series.mean(),
            'sum': series.sum(),
            'std': series.std(),
            'q1': series.quantile(0.25),
            'q3': series.quantile(0.75),
        })

In [6]:
def make_train_data(train_paths, window_size=CFG['WINDOW_SIZE'], stride=10):
    """Cut every training file into overlapping, normalized windows.

    The file stem (name up to the first '.') is split on '_': the first field,
    minus its last two characters, is parsed as the numeric label, and the
    first character of the second field is the driver code ('A' -> 0, anything
    else -> 1, stored in an added 'driver' column). Labels are divided by 902.

    Scaling uses the module-level ``extremes`` statistics:
    min-max for 'Signal *', mean/std for 'Time[s]' and
    (x - median) / (q3 - q1) for 'Sensor *'.

    Args:
        train_paths: Iterable of CSV file paths.
        window_size: Number of consecutive rows per window.
        stride: Row offset between the starts of successive windows.

    Returns:
        Tuple ``(sequences, labels)`` of NumPy arrays; ``sequences`` holds one
        ``(window_size, n_columns)`` block per window and ``labels`` the scaled
        label of the file each window came from. Files shorter than
        ``window_size`` contribute no windows.
    """
    signal_columns = ('Signal A', 'Signal B', 'Signal C')
    sensor_columns = ('Sensor A', 'Sensor B', 'Sensor C', 'Sensor D')

    sequences = []
    sequence_labels = []
    for path in tqdm(train_paths):
        # basename copes with the platform's path separator, unlike split('/').
        fields = os.path.basename(path).split('.')[0].split('_')
        driver = fields[1][0]
        label = float(fields[0][:-2]) / 902.

        data = pd.read_csv(path)
        data['driver'] = 0 if driver == 'A' else 1

        # Scaling is element-wise and uses fixed statistics, so doing it once per
        # file yields the same values as re-scaling every overlapping window.
        for column, stats in extremes.items():
            if column in signal_columns:
                data[column] = (data[column] - stats['min']) / (stats['max'] - stats['min'])
            elif column == 'Time[s]':
                data[column] = (data[column] - stats['mean']) / stats['std']
            elif column in sensor_columns:
                data[column] = (data[column] - stats['median']) / (stats['q3'] - stats['q1'])

        values = data.to_numpy()
        for start in range(0, len(data) - window_size + 1, stride):
            sequences.append(values[start:start + window_size])
            sequence_labels.append(label)
    return np.array(sequences), np.array(sequence_labels)

In [7]:
# Build the sliding-window training arrays and their labels from all training files.
train_window_data, train_labels = make_train_data(train_paths)

  0%|          | 0/16 [00:00<?, ?it/s]

In [8]:
# Sanity check of the training array dimensions: (windows, rows per window, columns).
train_window_data.shape

(18453, 500, 9)

In [9]:
def make_predict_data(test_paths, window_size=CFG['WINDOW_SIZE']):
    """Turn every test file into one fixed-length, normalized window.

    The driver code is the first character of the second '_'-separated field of
    the file stem ('A' -> 0, anything else -> 1) and is stored in an added
    'driver' column. Each file is copied into a zero-initialised
    ``(window_size, n_columns)`` array and scaled with the module-level
    ``extremes`` statistics: min-max for 'Signal *', mean/std for 'Time[s]'
    and (x - median) / (q3 - q1) for 'Sensor *'.

    Args:
        test_paths: Iterable of CSV file paths.
        window_size: Number of rows in every output window.

    Returns:
        NumPy array with one ``(window_size, n_columns)`` block per file.

    Raises:
        ValueError: If a file lacks one of the columns listed in ``extremes``.
    """
    signal_columns = ('Signal A', 'Signal B', 'Signal C')
    sensor_columns = ('Sensor A', 'Sensor B', 'Sensor C', 'Sensor D')

    sequences = []
    for path in tqdm(test_paths):
        # basename copes with the platform's path separator, unlike split('/').
        stem = os.path.basename(path).split('.')[0]
        driver = stem.split('_')[1][0]
        data = pd.read_csv(path)
        data['driver'] = 0 if driver == 'A' else 1

        # Files longer than the window are cut to their first window_size rows
        # (this used to fail with a shape-mismatch error); shorter ones are zero-padded.
        n_rows = min(len(data), window_size)
        window_data = np.zeros((window_size, data.shape[1]))
        window_data[:n_rows, :] = data.iloc[:n_rows].to_numpy()

        # Look columns up by name so scaling hits the right column even when the
        # CSV column order differs from the key order of `extremes`.
        column_names = list(data.columns)
        # NOTE(review): padding rows are filled before scaling, so they are scaled
        # too and do not stay at zero; kept as-is to preserve existing outputs.
        for column, stats in extremes.items():
            col_idx = column_names.index(column)
            if column in signal_columns:
                window_data[:, col_idx] = (window_data[:, col_idx] - stats['min']) / (stats['max'] - stats['min'])
            elif column == 'Time[s]':
                window_data[:, col_idx] = (window_data[:, col_idx] - stats['mean']) / stats['std']
            elif column in sensor_columns:
                window_data[:, col_idx] = (window_data[:, col_idx] - stats['median']) / (stats['q3'] - stats['q1'])
        sequences.append(window_data)
    return np.array(sequences)

In [10]:
# Build one normalized, fixed-length window per test file.
test_window_data = make_predict_data(test_paths)

  0%|          | 0/4048 [00:00<?, ?it/s]

In [11]:
# Sanity check of the test array dimensions: (files, rows per window, columns).
test_window_data.shape

(4048, 500, 9)

In [12]:
class CustomDataset(Dataset):
    """Map-style dataset over window arrays with optional targets.

    With ``Y`` given, an item is ``(torch.Tensor(X[index]), Y[index])``;
    with ``Y=None`` (inference) an item is the tensor alone.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        # The number of samples is driven by X only.
        return len(self.X)

    def __getitem__(self, index):
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, self.Y[index]

In [13]:
class TimeSeriesTransformer(nn.Module):
    """Transformer encoder that regresses one scalar per input sequence.

    The model width equals the number of input features (there is no input
    projection). A learned positional term, initialised to zeros, is added to
    the input, and the encoder output at the last time step is passed through
    a linear layer.

    Args:
        input_size: Number of features per time step; also used as ``d_model``.
        num_layers: Number of stacked encoder layers.
        nhead: Number of attention heads.
        max_seq_length: Longest sequence the positional term covers.
    """

    def __init__(self, input_size=9, num_layers=4, nhead=9, max_seq_length=500):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = input_size
        self.num_layers = num_layers
        self.nhead = nhead

        layer = nn.TransformerEncoderLayer(
            d_model=self.hidden_size,
            nhead=nhead,
            dim_feedforward=self.hidden_size * 4,
            dropout=0.1,
            activation='relu',
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers)
        self.fc_out = nn.Linear(self.hidden_size, 1)
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, max_seq_length, self.input_size)
        )

    def forward(self, x):
        """Map ``x`` of shape (batch, seq, features) to a (batch,) tensor."""
        # Unpacking into three names also rejects inputs that are not 3-D.
        _, seq_length, _ = x.size()
        x = x + self.positional_encoding[:, :seq_length, :]
        # The encoder is built with the default layout: (seq, batch, features).
        encoded = self.transformer_encoder(x.permute(1, 0, 2))
        # Regress from the representation of the final time step only.
        return self.fc_out(encoded[-1]).squeeze(1)

In [14]:
# Inference-only dataset (no targets). Shuffling is off so predictions keep the
# order of the test windows.
test_dataset = CustomDataset(X=test_window_data, Y=None)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
)

In [15]:
# Allowed output values; map_predictions snaps each raw prediction to the nearest one.
target_values = np.array([102, 200, 300, 402, 500, 602, 702, 802, 902])
# Submission template; its 'weight' column is filled in at the end of the notebook.
submit = pd.read_csv('./sample_submission.csv')

def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return predictions in label units.

    The model is switched to evaluation mode for the duration of the call, so
    dropout is inactive and predictions are deterministic; its previous
    train/eval mode is restored afterwards. Outputs are multiplied by 902 to
    undo the label scaling applied in ``make_train_data``.

    Args:
        model: ``torch.nn.Module`` already placed on ``device``.
        test_loader: Iterable yielding input batches (tensors).
        device: Device the batches are moved to before the forward pass.

    Returns:
        NumPy array with the predictions of all batches concatenated along the
        first axis; an empty array if the loader yields nothing.
    """
    was_training = model.training
    model.eval()
    batch_outputs = []
    try:
        with torch.no_grad():
            for X in tqdm(test_loader):
                output = model(X.to(device)) * 902.
                batch_outputs.append(output.cpu().numpy())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    if not batch_outputs:
        return np.array([])
    return np.concatenate(batch_outputs, axis=0)

def map_predictions(predictions, target_values):
    """Snap each prediction to the closest entry of ``target_values``.

    Args:
        predictions: Iterable of numeric predictions.
        target_values: NumPy array of allowed values.

    Returns:
        NumPy array holding, for every prediction, the allowed value with the
        smallest absolute difference; on a tie the earliest entry wins
        (``np.argmin`` returns the first minimum).
    """
    def _nearest(pred):
        return target_values[np.argmin(np.abs(target_values - pred))]

    return np.array([_nearest(pred) for pred in predictions])

In [16]:
# Load the five fold checkpoints, predict the test set with each one and keep the
# per-fold predictions rounded to integers.
#
# The object returned by torch.load is used directly as the model, so each file
# must contain a whole pickled module (not a state_dict). TimeSeriesTransformer
# has to be defined for unpickling, but building throw-away instances first is
# unnecessary. Unpickling executes code: only load checkpoints you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects whole-module pickles; pass weights_only=False there.
fold_paths = [f'./fold/{fold}_model_weights.pth' for fold in range(5)]

fold_models = []
for fold_path in fold_paths:
    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    fold_model = torch.load(fold_path, map_location=device)
    fold_model.to(device)
    # Inference must run with dropout disabled.
    fold_model.eval()
    fold_models.append(fold_model)
model00, model01, model02, model03, model04 = fold_models

fold_preds = [
    np.round(inference(fold_model, test_loader, device), 0).astype(int)
    for fold_model in fold_models
]
pred00, pred01, pred02, pred03, pred04 = fold_preds

pred00, pred01, pred02, pred03, pred04

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

(array([191, 406, 639, ..., 476, 654, 401]),
 array([203, 430, 606, ..., 349, 620,  -5]),
 array([257, 354, 684, ..., 658, 602, 126]),
 array([ -3, 489, 595, ..., 722, 636,  15]),
 array([323, 389, 640, ..., 533, 689,  -2]))

In [17]:
# Ensemble: average the five per-fold predictions element-wise, then round the
# mean to the nearest integer.
fold_stack = np.stack([pred00, pred01, pred02, pred03, pred04])
Predictions_reproducibility = np.round(fold_stack.sum(axis=0) / 5, 0).astype(int)
Predictions_reproducibility

array([194, 414, 633, ..., 548, 640, 107])

In [18]:
# Snap every averaged prediction to the nearest allowed target value.
Mapped_Predictions_Reproducibility = map_predictions(Predictions_reproducibility, target_values)
Mapped_Predictions_Reproducibility

array([200, 402, 602, ..., 500, 602, 102])

In [19]:
# Store the snapped predictions in the 'weight' column and write the submission CSV
# (without the DataFrame index).
submit['weight'] = Mapped_Predictions_Reproducibility
submit.to_csv('./Mapped_Predictions_Reproducibility.csv', index=False)