In [1]:
import pandas as pd 
import numpy as np
import torch
import pytorch_lightning as pl
from pathlib import Path
from tqdm.notebook import tqdm
from torch import nn, optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.progress import TQDMProgressBar

### Notebook parameters

In [6]:
DATA_PATH = Path('./data/')
TEST_SPLIT_SIZE = .2
SEED = 125501
N_EPOCHS=10
FRAC=.1

In [7]:
df = pd.read_parquet(DATA_PATH / 'input_dataset-2.parquet')

## Features generation

In [8]:
FEATURE_COLUMNS = df.columns.tolist()[:7]
LABEL_COLUMN = 'Bolt_1_Tensile'

In [9]:
pl.seed_everything(seed=SEED)

Global seed set to 125501


125501

In [15]:
features_df = df.sample(frac=FRAC, random_state=SEED)[FEATURE_COLUMNS].copy()

features_df.dropna(inplace=True)

features_df['label'] = df[LABEL_COLUMN]

mode_le = LabelEncoder()

features_df['mode'] = mode_le.fit_transform(features_df['mode'].values)

In [17]:
train_df, test_df = train_test_split(features_df, test_size=TEST_SPLIT_SIZE)

In [32]:
scaler = MinMaxScaler(feature_range=(-1, 1))

scaler.fit(train_df)

train_df = pd.DataFrame(
    scaler.transform(train_df), index=train_df.index, columns=train_df.columns
)


In [33]:
print(train_df.shape)
print(test_df.shape)

(140016, 8)
(35005, 8)


In [38]:
def create_sequences(input_data: pd.DataFrame, feature_columns, target_column, sequence_length=60):
    sequences = []
    data_size = len(input_data)

    sequence = input_data[feature_columns][:sequence_length].to_numpy().tolist()
    all_labels = input_data[target_column].to_list()

    for i in tqdm(range(1, data_size - sequence_length)):
        row_dict = input_data.iloc[i].to_dict()
        feats = [row_dict[x] for x in feature_columns]
        
        sequence.pop()
        sequence.append(feats)

        label_position = i + sequence_length
        label = all_labels[label_position]

        sequences.append((sequence, label))

    return sequences

  0%|          | 0/139955 [00:00<?, ?it/s]

In [39]:
train_sequences = create_sequences(train_df, FEATURE_COLUMNS, 'label')

  0%|          | 0/139955 [00:00<?, ?it/s]

In [42]:
test_sequences = create_sequences(test_df, FEATURE_COLUMNS, 'label')

  0%|          | 0/34944 [00:00<?, ?it/s]

## Dataset wrappers

In [43]:
class KrafthackDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, label = self.sequences[idx]

        return dict(
            sequence=torch.Tensor(sequence.to_numpy()), label=torch.tensor(label)
        )


In [45]:
class KrafthackDataModule(pl.LightningDataModule):
    def __init__(self, train_sequences, test_sequences, batch_size=256):
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self):
        self.train_dataset = KrafthackDataset(self.train_sequences)
        self.test_dataset = KrafthackDataset(self.test_sequences)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2
        )

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, shuffle=False, num_workers=1)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=1, shuffle=False, num_workers=1)


In [46]:
data_module = KrafthackDataModule(
    train_sequences=train_sequences, test_sequences=test_sequences, batch_size=128
)

data_module.setup()

In [47]:
class TensilePredictionModel(nn.Module):
    def __init__(self, n_features, n_hidden=128, n_layers=2):
        super().__init__()

        self.n_hidden = n_hidden

        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            batch_first=True,
            num_layers=n_layers,
            dropout=0.2,
        )

        self.regressor = nn.Linear(n_hidden, 1)

    def forward(self, x):
        self.lstm.flatten_parameters()

        _, (hidden, _) = self.lstm(x)

        logits = hidden[-1]

        return self.regressor(logits)


In [48]:
class TensilePredictor(pl.LightningModule):
    def __init__(self, n_features: int):
        super().__init__()

        self.model = TensilePredictionModel(n_features)

        self.criterion = nn.MSELoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0

        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))

        return loss, output

    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, _ = self(sequences, labels)
        self.log("training_loss", loss, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, _ = self(sequences, labels)
        self.log("validation_loss", loss, prog_bar=True, logger=True)

        return loss

    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, _ = self(sequences, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=0.0001)


In [49]:
model = TensilePredictor(n_features = train_df.shape[1])

In [51]:
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

logger = TensorBoardLogger("lightning_logs", name="tensile-pred")

progress_callback = TQDMProgressBar(refresh_rate=10)

early_stopping_callback = EarlyStopping(
    monitor="val_loss",
    patience=2
)

trainer = pl.Trainer(
    logger=logger,
    enable_checkpointing=checkpoint_callback,
    callbacks=[early_stopping_callback, progress_callback],
    max_epochs=N_EPOCHS
)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [52]:
trainer.fit(model, data_module)

  rank_zero_deprecation(
Missing logger folder: lightning_logs/tensile-pred

  | Name      | Type                   | Params
-----------------------------------------------------
0 | model     | TensilePredictionModel | 202 K 
1 | criterion | MSELoss                | 0     
-----------------------------------------------------
202 K     Trainable params
0         Non-trainable params
202 K     Total params
0.812     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.9/3.9.10/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/homebrew/Cellar/python@3.9/3.9.10/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'KrafthackDataset' on <module '__main__' (built-in)>
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [55]:
data_module.train_dataset[0]

AttributeError: 'list' object has no attribute 'to_numpy'