In [57]:
import pandas as pd

In [58]:
# Load the ground-truth table and strip the trailing "1" from the target column
# names (also normalising the "Nominel" spelling) in a single chained expression.
df = pd.read_csv("../data/gt.csv").rename(
    columns={
        "Pieces1": "Pieces",
        "Manufacturer1": "Manufacturer",
        "SubType1": "SubType",
        "HxType1": "HxType",
        "NominelEffectEach1": "NominalEffectEach",
        "Year1": "Year",
    }
)
df.head()

Unnamed: 0,S_text,L_text,Pieces,Manufacturer,SubType,HxType,NominalEffectEach,Year
0,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...,1.0,Unknown,,Isoleret varmeveksler,,
1,Eksisterende fjernvarme,,1.0,Unknown,,,,
2,Fjernvarme med uisoleret veksler (indirekte an...,Bygningen opvarmes med fjernvarme. Anlægget er...,1.0,Unknown,,Uisoleret varmeveksler,,After 1980
3,Fjernvarme med isoleret veksler (indirekte anl...,Ejendommen opvarmes med fjernvarme fra HOFOR.\...,1.0,Danfoss Redan,,Isoleret varmeveksler,,
4,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...,1.0,Unknown,,Isoleret varmeveksler,,Before 1970


Create a number for each label in the dataset. This approach is not extensible: if the manufacturer is not present in the training data, the model won't be able to predict it.

In [59]:
# Every column after the first two is treated as a prediction target.
target_cols = df.columns[2:]

# label_maps[col] maps each distinct label of a text-valued target column to an
# integer index; the encoded values are stored next to the original in "<col>_idx".
label_maps = {}
for col in target_cols:
    if df[col].dtype == "object":
        # fillna returns a new object rather than modifying df, so the result must
        # be assigned back. Without the assignment missing values stay as float NaN
        # and never become the explicit "NaN" label.
        df[col] = df[col].fillna("NaN")
        label_maps[col] = {label: idx for idx, label in enumerate(df[col].unique())}
        df[f"{col}_idx"] = df[col].map(label_maps[col])

# NER

In [82]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm.auto import trange
import sys
sys.path.append("../scripts")
from NERmodel import TokenClassifier
from NER_ground_truth import ner_data

In [84]:
# Build the NER ground-truth frame from the district-heating spreadsheet using the
# project helper `ner_data` (imported from NER_ground_truth).
# NOTE: the recorded run below took roughly 2m20s for 2272 items, so avoid
# re-executing this cell unnecessarily.
ner_gt = ner_data('../data/data_district_heating.xlsx')
# Preview the first rows; the recorded output shows text, words, tokens and labels columns.
ner_gt.head()

100%|██████████| 2272/2272 [02:20<00:00, 16.14it/s]


Unnamed: 0,text,words,tokens,labels
0,Fjernvarme med isoleret veksler (indirekte anl...,"[fjernvarme, med, isoleret, veksler, (, indire...","[[tensor(1.0188), tensor(1.2181), tensor(0.024...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [1.0, 0...."
1,Eksisterende fjernvarme,"[eksisterende, fjernvarme]","[[tensor(1.0928), tensor(0.8145), tensor(0.337...","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0...."
2,Fjernvarme med uisoleret veksler (indirekte an...,"[fjernvarme, med, uisoleret, veksler, (, indir...","[[tensor(1.0735), tensor(1.2092), tensor(0.108...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [1.0, 0...."
3,Fjernvarme med isoleret veksler (indirekte anl...,"[fjernvarme, med, isoleret, veksler, (, indire...","[[tensor(0.8679), tensor(1.1397), tensor(-0.12...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [1.0, 0...."
4,Fjernvarme med isoleret veksler (indirekte anl...,"[fjernvarme, med, isoleret, veksler, (, indire...","[[tensor(1.0224), tensor(1.1423), tensor(0.114...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0], [1.0, 0...."


In [96]:
class TokenDataset(Dataset):
    """Dataset that pairs each sequence's token embeddings with its per-token labels."""

    def __init__(self, tokens_list, labels_list):
        """
        Args:
            tokens_list: One entry per sequence. Each entry is either a tensor of
                shape [seq_len, embedding_dim] or a sequence of seq_len per-token
                tensors (each of shape [embedding_dim]).
            labels_list: One entry per sequence, aligned with tokens_list. Each
                entry is a NumPy array or tensor of shape [seq_len, num_classes].

        Raises:
            ValueError: If tokens_list and labels_list differ in length.
        """
        if len(tokens_list) != len(labels_list):
            raise ValueError(
                f"tokens_list and labels_list must have the same length, "
                f"got {len(tokens_list)} and {len(labels_list)}"
            )
        self.tokens_list = tokens_list
        self.labels_list = labels_list

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.tokens_list)

    def __getitem__(self, idx):
        """Return (tokens, labels) tensors for the sequence at position idx."""
        tokens = self.tokens_list[idx]
        # A sequence of per-token tensors is stacked into one [seq_len, dim]
        # tensor; an entry that already is a tensor is returned unchanged.
        if not isinstance(tokens, torch.Tensor):
            tokens = torch.stack(list(tokens))
        # as_tensor keeps the dtype of NumPy input (as from_numpy does) and also
        # accepts entries that are already tensors.
        labels = torch.as_tensor(self.labels_list[idx])
        return tokens, labels

In [97]:
def collate_fn(batch):
    """
    Collate (tokens, labels) samples of differing lengths into padded batch tensors.

    Returns a tuple (padded_tokens, padded_labels, attention_masks) where the mask
    holds 1 at real token positions and 0 at padded positions.
    """
    sequences, targets = zip(*batch)

    # Zero-pad tokens and labels up to the longest sequence in the batch
    padded_tokens, padded_labels = (
        pad_sequence(items, batch_first=True, padding_value=0.0)
        for items in (sequences, targets)
    )

    # Padding one vector of ones per sequence yields the same [batch, max_len] mask
    ones_per_sequence = [torch.ones(len(seq)) for seq in sequences]
    attention_masks = pad_sequence(ones_per_sequence, batch_first=True, padding_value=0.0)

    return padded_tokens, padded_labels, attention_masks

In [102]:
def _masked_token_loss(criterion, logits, labels, attention_mask):
    """Return the unreduced loss with every padded position zeroed out."""
    return criterion(logits, labels) * attention_mask.unsqueeze(-1)


def fit(df, embedding_dim=768, hidden_dim=49, num_classes=7,
        initial_batch_size=8, max_batch_size=64, num_epochs=1000,
        eval_split=0.2, eval_every=50, learning_rate=0.001,
        loss_threshold=0.01):
    """
    Train the token classifier.

    Args:
        df: DataFrame with 'tokens' and 'labels' columns, one row per sequence.
            Each cell must be in a form TokenDataset accepts (per-token embeddings
            and a per-token label array of width num_classes).
        embedding_dim: Token embedding dimension
        hidden_dim: Hidden layer dimension
        num_classes: Number of output classes
        initial_batch_size: Starting batch size
        max_batch_size: Maximum batch size
        num_epochs: Number of training epochs
        eval_split: Fraction of data held out for evaluation
        eval_every: Evaluate (and record losses) every this many epochs
        learning_rate: Learning rate passed to the Adam optimizer
        loss_threshold: When the epoch's average training loss drops below this
            value the batch size is doubled (capped at max_batch_size)

    Returns:
        dict with keys:
            'train_losses': average per-batch training loss of each recorded epoch
            'eval_losses': evaluation loss per non-padded token at each recorded epoch
            'model': the trained TokenClassifier

    Raises:
        ValueError: If eval_every is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be a positive integer")

    # Split data (fixed random_state keeps the split reproducible)
    train_tokens, eval_tokens, train_labels, eval_labels = train_test_split(
        df['tokens'].tolist(), df['labels'].tolist(),
        test_size=eval_split, random_state=42
    )

    # Create datasets
    train_dataset = TokenDataset(train_tokens, train_labels)
    eval_dataset = TokenDataset(eval_tokens, eval_labels)

    # The evaluation loader never changes, so build it once outside the epoch loop.
    # Batch size 1 means evaluation sequences are never padded.
    eval_loader = DataLoader(eval_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

    # Initialize model
    model = TokenClassifier(embedding_dim, hidden_dim, num_classes)

    # Loss function and optimizer; reduction='none' keeps per-element losses so
    # padded positions can be masked out before averaging
    criterion = nn.BCEWithLogitsLoss(reduction='none')  # For multi-label classification
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training tracking
    train_losses = []
    eval_losses = []
    current_batch_size = initial_batch_size

    print(f"Starting training with batch size {current_batch_size}")

    pbar = trange(num_epochs)
    for epoch in pbar:
        # Rebuild the training loader each epoch because the batch size may have grown
        train_loader = DataLoader(train_dataset, batch_size=current_batch_size,
                                 shuffle=True, collate_fn=collate_fn)

        model.train()
        epoch_loss = 0.0
        num_batches = 0

        for tokens, labels, attention_mask in train_loader:

            optimizer.zero_grad()

            # Forward pass
            logits = model(tokens)

            # Sum the loss over non-padded positions and normalise by the number
            # of real tokens in the batch
            loss_per_token = _masked_token_loss(criterion, logits, labels, attention_mask)
            loss = loss_per_token.sum() / attention_mask.sum()

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches

        # Periodically evaluate on the entire evaluation dataset
        if (epoch + 1) % eval_every == 0:
            model.eval()

            total_eval_loss = 0.0
            total_tokens = 0

            with torch.no_grad():
                for tokens, labels, attention_mask in eval_loader:
                    logits = model(tokens)

                    loss_per_token = _masked_token_loss(criterion, logits, labels, attention_mask)

                    # Accumulate total loss and total tokens
                    total_eval_loss += loss_per_token.sum().item()
                    total_tokens += attention_mask.sum().item()

            # Calculate average loss across all tokens in evaluation set
            avg_eval_loss = total_eval_loss / total_tokens

            train_losses.append(avg_epoch_loss)
            eval_losses.append(avg_eval_loss)

            pbar.set_postfix({
                "Train Loss": f"{avg_epoch_loss:.4f}",
                "Eval Loss": f"{avg_eval_loss:.4f}",
                "Batch Size": f"{current_batch_size}",
            })

        # Double the batch size once the training loss is (very close to) 0
        if avg_epoch_loss < loss_threshold and current_batch_size < max_batch_size:
            current_batch_size = min(current_batch_size * 2, max_batch_size)
            print(f"Increasing batch size to {current_batch_size}")

    return {
        'train_losses': train_losses,
        'eval_losses': eval_losses,
        'model': model
    }

In [None]:
# Train on the NER ground truth with fit's default hyperparameters. The returned
# dict holds the recorded 'train_losses' and 'eval_losses' plus the trained 'model'.
losses_and_model = fit(ner_gt)

Starting training with batch size 8


  0%|          | 0/1000 [00:00<?, ?it/s]

Increasing batch size to 16
Increasing batch size to 32
Increasing batch size to 64
