In [84]:
import pandas as pd 
import torch
import pytorch_lightning as pl
from pathlib import Path
from tqdm.notebook import tqdm
from torch import nn, optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.progress import TQDMProgressBar

In [None]:
## Bolt tensile prediction with an LSTM

### Notebook parameters

In [79]:
# Notebook-wide settings: data directory, hold-out fraction, RNG seed, epoch budget.
DATA_PATH = Path("data")
TEST_SPLIT_SIZE = 0.2
SEED = 125_501
N_EPOCHS = 10

In [9]:
# Load the raw input table from the data directory.
df = pd.read_parquet(DATA_PATH / 'input_dataset-2.parquet')

# Display (rows, columns) so the shape recorded under this cell is reproduced
# on a fresh Restart & Run All.
df.shape

(1874087, 22)

## Feature generation

In [59]:
# Columns used for modelling: the first seven columns of the raw table,
# followed by the target column.
LABEL_COLUMN = ['Bolt_1_Tensile']
FEATURE_COLUMNS = [*df.columns[:7], *LABEL_COLUMN]

In [60]:
# Seed the random number generators through Lightning, using the notebook-wide SEED.
pl.seed_everything(SEED)

Global seed set to 125501


125501

In [61]:
# Encoder that turns the values of the 'mode' column into integer codes.
mode_le = LabelEncoder()

# Keep only the modelling columns, drop rows with missing values, then
# replace 'mode' with its integer codes (column order is preserved).
feature_df = (
    df[FEATURE_COLUMNS]
    .dropna()
    .assign(mode=lambda frame: mode_le.fit_transform(frame['mode'].values))
)

In [62]:
# Split by position rather than at random: create_sequences() builds each
# window from consecutive rows, so a shuffled split would turn every window
# into a set of unrelated samples and scatter neighbouring rows across the
# train and test splits.
# NOTE(review): assumes feature_df rows are already in chronological order — confirm.
train_df, test_df = train_test_split(
    feature_df, test_size=TEST_SPLIT_SIZE, shuffle=False
)

In [63]:
scaler = MinMaxScaler(feature_range=(-1, 1))

# Fit on the training split only, so no statistics from the test split are used.
scaler.fit(train_df)

train_df = pd.DataFrame(
    scaler.transform(train_df), index=train_df.index, columns=train_df.columns
)

# The test split has to go through the same fitted transform; otherwise the
# model is trained on values in [-1, 1] but evaluated on raw, unscaled values.
test_df = pd.DataFrame(
    scaler.transform(test_df), index=test_df.index, columns=test_df.columns
)


In [65]:
# Row/column counts of the two splits, one per line.
print(train_df.shape, test_df.shape, sep="\n")

(1400000, 8)
(350000, 8)


In [66]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length=60):
    """Slice a frame into fixed-length windows, each paired with the next target value.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` (by position) and is
    paired with the value of ``target_column`` in row ``i + sequence_length``.

    Args:
        input_data: Frame whose rows are the consecutive steps to window over.
        target_column: Column label, or list of column labels, to read the label
            from. A single label yields a scalar; a list yields a ``pd.Series``.
        sequence_length: Number of rows in each window.

    Returns:
        A list of ``(window, label)`` tuples with
        ``len(input_data) - sequence_length`` entries (empty when the frame is
        not longer than one window).
    """
    # Select the target once instead of materialising a full row on every iteration.
    targets = input_data[target_column]
    window_count = len(input_data) - sequence_length

    sequences = []
    for start in tqdm(range(window_count)):
        stop = start + sequence_length
        # .iloc keeps the slicing positional whatever the frame's index looks like.
        sequences.append((input_data.iloc[start:stop], targets.iloc[stop]))

    return sequences

In [67]:
train_sequences = create_sequences(train_df, LABEL_COLUMN)

  0%|          | 0/1399940 [00:00<?, ?it/s]

In [70]:
test_sequences = create_sequences(test_df, LABEL_COLUMN)

  0%|          | 0/349940 [00:00<?, ?it/s]

## Dataset wrappers

In [None]:
class KrafthackDataset(Dataset):
    """Map-style dataset over pre-built ``(window, label)`` pairs."""

    def __init__(self, sequences):
        """Store the pairs.

        Args:
            sequences: Indexable collection of ``(window, label)`` tuples, where
                ``window`` exposes ``to_numpy()`` (e.g. a ``pd.DataFrame``) and
                ``label`` is a number or a pandas/numpy object holding the
                target value(s).
        """
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return one sample as ``dict(sequence=<float tensor>, label=<float32 tensor>)``."""
        sequence, label = self.sequences[idx]

        # A label read with a list of column names is a one-element pandas Series,
        # which the default DataLoader collation cannot batch. Convert it to a
        # float32 tensor and squeeze it, so a single target becomes a 0-d tensor
        # and a batch of them has shape (batch,).
        label_values = label.to_numpy() if hasattr(label, "to_numpy") else label
        label_tensor = torch.as_tensor(label_values, dtype=torch.float32).squeeze()

        return dict(sequence=torch.Tensor(sequence.to_numpy()), label=label_tensor)

In [None]:
class KrafthackDataModule(pl.LightningDataModule):
    """Lightning data module serving the pre-built train and test window lists."""

    def __init__(self, train_sequences, test_sequences, batch_size=8):
        """Keep references to the window lists and the training batch size.

        Args:
            train_sequences: ``(window, label)`` pairs used for training.
            test_sequences: ``(window, label)`` pairs used for validation and testing.
            batch_size: Batch size of the training dataloader only.
        """
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Wrap the window lists in datasets.

        Lightning calls this hook with a ``stage`` argument, so the parameter
        must be accepted even though it is not used here; the default keeps a
        manual ``setup()`` call working.
        """
        self.train_dataset = KrafthackDataset(self.train_sequences)
        self.test_dataset = KrafthackDataset(self.test_sequences)

    def train_dataloader(self):
        """Return the training loader (unshuffled, ``batch_size`` samples per batch)."""
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2
        )

    def test_dataloader(self):
        """Return the test loader, one sample per batch."""
        return DataLoader(self.test_dataset, batch_size=1, shuffle=False, num_workers=1)

    def val_dataloader(self):
        """Return the validation loader, one sample per batch.

        NOTE(review): validation reuses the test dataset, so any metric monitored
        during training is computed on the same data as the final test.
        """
        return DataLoader(self.test_dataset, batch_size=1, shuffle=False, num_workers=1)


In [None]:
# Hand the pre-built window lists to the data module; training batches hold 128 windows.
data_module = KrafthackDataModule(train_sequences, test_sequences, batch_size=128)


In [75]:
class TensilePredictionModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence."""

    def __init__(self, n_features, n_hidden=128, n_layers=2):
        """Build the recurrent encoder and the regression head.

        Args:
            n_features: Number of values per time step.
            n_hidden: Size of the LSTM hidden state (also kept as ``self.n_hidden``).
            n_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.n_hidden = n_hidden

        lstm_config = dict(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.2,
        )
        self.lstm = nn.LSTM(**lstm_config)
        self.regressor = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Map a batch-first input ``(batch, steps, n_features)`` to ``(batch, 1)``."""
        self.lstm.flatten_parameters()

        # The LSTM returns (outputs, (h_n, c_n)); only the final hidden states are used.
        final_hidden_states = self.lstm(x)[1][0]
        # Final hidden state of the top layer in the stack.
        top_layer_state = final_hidden_states[-1]

        return self.regressor(top_layer_state)


In [73]:
class TensilePredictor(pl.LightningModule):
    """Lightning module that trains ``TensilePredictionModel`` with an MSE loss."""

    def __init__(self, n_features: int):
        """Create the network and the loss.

        Args:
            n_features: Number of values per time step fed to the network.
        """
        super().__init__()

        self.model = TensilePredictionModel(n_features)

        self.criterion = nn.MSELoss()

    def forward(self, x, labels=None):
        """Run the network and, when labels are given, compute the loss.

        Args:
            x: Batch of input sequences passed straight to the network.
            labels: Optional targets holding one value per prediction.

        Returns:
            ``(loss, output)``; ``loss`` is ``0`` when ``labels`` is ``None``.
        """
        output = self.model(x)
        loss = 0

        if labels is not None:
            # Bring the targets to the prediction's dtype and shape. A plain
            # unsqueeze is only right for (batch,) float32 labels; (batch, 1) or
            # float64 labels would make MSELoss broadcast silently or hit a dtype
            # mismatch. reshape still raises if the element counts differ.
            targets = labels.to(dtype=output.dtype).reshape(output.shape)
            loss = self.criterion(output, targets)

        return loss, output

    def _step(self, batch, metric_name):
        """Compute the loss for one batch and log it under ``metric_name``."""
        loss, _ = self(batch["sequence"], batch["label"])
        self.log(metric_name, loss, prog_bar=True, logger=True)

        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as ``training_loss``."""
        return self._step(batch, "training_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as ``validation_loss``."""
        return self._step(batch, "validation_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as ``test_loss``."""
        return self._step(batch, "test_loss")

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at a learning rate of 1e-4."""
        return optim.AdamW(self.parameters(), lr=0.0001)


In [76]:
# One input feature per column of the training frame.
model = TensilePredictor(n_features=len(train_df.columns))

In [86]:
# Both callbacks must monitor a metric that the LightningModule actually logs.
# TensilePredictor.validation_step logs "validation_loss"; no "val_loss" metric
# is logged anywhere, so monitoring that name would fail at the first
# validation pass.
MONITORED_METRIC = "validation_loss"

# Keep only the single best checkpoint, ranked by the lowest monitored value.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor=MONITORED_METRIC,
    mode="min",
)

logger = TensorBoardLogger("lightning_logs", name="tensile-pred")

progress_callback = TQDMProgressBar(refresh_rate=10)

# Stop once the monitored metric has not improved for two validation checks.
early_stopping_callback = EarlyStopping(
    monitor=MONITORED_METRIC,
    patience=2
)

# enable_checkpointing is a boolean switch; the configured ModelCheckpoint has to
# be registered through `callbacks`, otherwise it is never used.
trainer = pl.Trainer(
    logger=logger,
    enable_checkpointing=True,
    callbacks=[checkpoint_callback, early_stopping_callback, progress_callback],
    max_epochs=N_EPOCHS
)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# Run training; the data module supplies the train and validation loaders.
trainer.fit(model, datamodule=data_module)