# Import Dataset

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import trange
from transformers import AutoModel, AutoTokenizer

In [2]:
# Read the "Ground Truth" sheet and derive the two columns used downstream in one chain:
#   text     - short and long descriptions joined by ". " (missing parts become "")
#   SingleHx - True when every column at positions 13..30 is NaN
#              (NOTE(review): presumably the fields of additional heat exchangers - confirm)
# The lambdas are evaluated in order, so `SingleHx` sees the frame with `text` already added.
d_gt = pd.read_excel(
    "../data/data_district_heating.xlsx", sheet_name="Ground Truth"
).assign(
    text=lambda frame: (
        frame['S_text'].fillna("").astype(str)
        + ". "
        + frame['L_text'].fillna("").astype(str)
    ),
    SingleHx=lambda frame: frame.iloc[:, 13:31].isna().all(axis=1),
)
d_gt.head()

Unnamed: 0,SerialID,building_id,zone_id,SEEB,S_text,L_text,Type,Pieces1,Manufacturer1,SubType1,...,SpecifiedSubscriptionUnitAmount,DistrictHeatingPlantName,HeatLoss,BFactor,NominalEffect,MinimumTemperature,StandbyPowerUsage,HeatsHotWater,text,SingleHx
0,311117034,0,0,2-1-3-0,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...,DistrictHeatWithExchanger,1.0,Unknown,,...,15.0,HOFOR (Københavns Energi) - MWh (04-07-2014),1.1,0.0,20.0,60.0,0.0,1.0,Fjernvarme med isoleret veksler (indirekte anl...,True
1,311117893,2050872,2050872,2-1-3-0,Eksisterende fjernvarme,,DistrictHeatWithExchanger,1.0,Unknown,,...,0.0,Københavns Energi MWh (293),1.5,0.7,16.0,0.0,5.0,0.0,Eksisterende fjernvarme.,True
2,311119019,0,0,2-1-3-0,Fjernvarme med uisoleret veksler (indirekte an...,Bygningen opvarmes med fjernvarme. Anlægget er...,DistrictHeatWithExchanger,1.0,Unknown,,...,190.0,HOFOR (Københavns Energi) - MWh (04-07-2014),5.0,0.0,20.0,65.0,0.0,0.0,Fjernvarme med uisoleret veksler (indirekte an...,True
3,311121828,0,0,2-1-3-0,Fjernvarme med isoleret veksler (indirekte anl...,Ejendommen opvarmes med fjernvarme fra HOFOR.\...,DistrictHeatWithExchanger,1.0,Danfoss Redan,,...,502.0,HOFOR (Københavns Energi) - MWh (04-07-2014),1.1,0.0,60.0,60.0,0.0,0.0,Fjernvarme med isoleret veksler (indirekte anl...,True
4,311122000,0,0,2-1-3-0,Fjernvarme med isoleret veksler (indirekte anl...,Bygningen opvarmes med fjernvarme. Anlægget er...,DistrictHeatWithExchanger,1.0,Unknown,,...,25.0,HOFOR (Københavns Energi) - MWh (04-07-2014),7.0,0.0,20.0,65.0,0.0,0.0,Fjernvarme med isoleret veksler (indirekte anl...,True


# Create Dataset

In [None]:
class SequenceDataset(Dataset):
    """Map-style dataset that pairs each sequence of token embeddings with its label."""

    def __init__(self, tokens_list, labels_list):
        """
        Args:
            tokens_list: Indexable collection with one item per sample
                (e.g. a tensor of shape [seq_len, embedding_dim]).
            labels_list: Indexable collection with one label per sample,
                aligned index-by-index with ``tokens_list``.

        Raises:
            ValueError: If the two collections do not have the same length.
        """
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently drop the surplus labels.
        if len(tokens_list) != len(labels_list):
            raise ValueError(
                f"tokens_list and labels_list must have the same length, "
                f"got {len(tokens_list)} and {len(labels_list)}"
            )
        self.tokens = tokens_list
        self.labels = labels_list

    def __len__(self):
        """Return the number of samples."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return the ``(tokens, label)`` pair stored at position ``idx``."""
        return self.tokens[idx], self.labels[idx]

In [7]:
class Tokenizer:
    """Pairs a pretrained tokenizer with its encoder to turn text into token embeddings."""

    def __init__(self, model_name="saattrupdan/nbailab-base-ner-scandi"):
        """Load the tokenizer and encoder for ``model_name``; the encoder is put in eval mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        encoder = AutoModel.from_pretrained(model_name)
        encoder.eval()
        self.model = encoder

    def tokenize(self, text):
        """
        Encode ``text`` and return the encoder's last hidden state.

        Args:
            text (str): Input text; it is truncated to at most 512 tokens.

        Returns:
            The ``last_hidden_state`` of the encoder with a leading
            dimension of size 1 squeezed away (one embedding per token).
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        # Inference only: no autograd graph is needed for the embeddings.
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
            return hidden_states.squeeze(0)


In [15]:
# model_name = "Qwen/Qwen3-0.6B"
tokenizer = Tokenizer()

# Embed the `text` of every row, in row order, into one tensor per sample.
X = [
    tokenizer.tokenize(d_gt.loc[row, 'text'])
    for row in trange(d_gt.shape[0], desc="Tokenizing")
]

Some weights of BertModel were not initialized from the model checkpoint at saattrupdan/nbailab-base-ner-scandi and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing:   0%|          | 0/2272 [00:00<?, ?it/s]

In [16]:
# Binary target per sample: 1 when the row is flagged SingleHx, else 0.
y = [int(is_single) for is_single in d_gt['SingleHx']]

# Hold out 20% of the samples for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
class EmbeddingRNNClassifier(nn.Module):
    """
    Sequence classifier: a unidirectional GRU over token embeddings followed
    by a small feed-forward head.

    The head ends in ``nn.Sigmoid``, so ``forward`` returns probabilities in
    [0, 1], not logits. Pair it with ``nn.BCELoss``; ``nn.BCEWithLogitsLoss``
    would apply a second sigmoid.
    """

    def __init__(self, embed_dim, hidden_size, output_size=1):
        """
        Args:
            embed_dim: Size of each input token embedding.
            hidden_size: Size of the GRU hidden state and of the head's hidden layers.
            output_size: Number of output units (default 1).
        """
        super().__init__()
        # Custom RNN (using GRU for efficiency)
        self.rnn = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_size,
            batch_first=True,
            # bidirectional=False # default is unidirectional
        )

        # Binary classifier head; the final Sigmoid maps the output to [0, 1].
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, hidden_size),
            nn.Sigmoid(),
            nn.Linear(hidden_size, output_size),
            nn.Sigmoid()
        )

    def forward(self, embeddings, lengths=None):
        """
        Classify a batch of embedded sequences.

        Args:
            embeddings: Tensor of shape [batch, seq_len, embed_dim]
                (the GRU is built with ``batch_first=True``).
            lengths: Optional true length of each sequence (tensor or list of
                ints, each in ``1..seq_len``). Pass it when the batch is
                right-padded so the representation is read at the last real
                token. When omitted, the final time step of every row is used,
                which is the original behaviour.

        Returns:
            Tensor of shape [batch, output_size] with values in [0, 1].
        """
        # Process embeddings sequentially with RNN
        rnn_out, _ = self.rnn(embeddings)

        if lengths is None:
            # Final time step of every row (correct for unpadded / left-padded input).
            last_hidden = rnn_out[:, -1, :]
        else:
            # The GRU is causal, so the output at index length-1 is unaffected by
            # whatever padding follows it.
            last_index = torch.as_tensor(
                lengths, dtype=torch.long, device=rnn_out.device
            ) - 1
            rows = torch.arange(rnn_out.size(0), device=rnn_out.device)
            last_hidden = rnn_out[rows, last_index]

        # Binary classification
        return self.classifier(last_hidden)


In [None]:
# NOTE: this cell held an unfinished `class Model` stub. A bare `class Model` line
# (no colon, no body) is a SyntaxError and breaks Restart & Run All, so it is
# commented out here. Complete the definition or delete this cell.
# class Model

In [None]:
class MLModel(BaseEstimator, ClassifierMixin):
    """
    Scikit-learn style wrapper that trains a sequence-level classifier on
    pre-computed token embeddings.

    Every sample is one sequence of token embeddings with ONE label for the
    whole sequence. The wrapped network (``nn``) is built as
    ``nn(embedding_dim, hidden_size, num_classes)`` and must return
    probabilities in [0, 1] of shape [batch, num_classes]; training uses
    ``BCELoss`` accordingly.

    Batches are left-padded with zeros so that the last time step of every row
    is a real token, which is what a network reading ``output[:, -1, :]`` needs.
    """

    # Constructor arguments exposed through get_params / set_params.
    _PARAM_NAMES = (
        "embedding_dim",
        "hidden_size",
        "num_classes",
        "nn",
        "initial_batch_size",
        "max_batch_size",
        "num_epochs",
        "lr",
    )

    def __init__(
        self,
        embedding_dim=768,
        hidden_size=14,
        num_classes=1,
        nn=EmbeddingRNNClassifier,
        initial_batch_size=16,
        max_batch_size=64,
        num_epochs=1000,
        lr=0.001,
    ):
        """
        Args:
            embedding_dim: Size of each token embedding.
            hidden_size: Hidden size passed to the network.
            num_classes: Number of output units of the network.
            nn: Network class to instantiate (not the ``torch.nn`` module).
            initial_batch_size: Batch size used at the start of training.
            max_batch_size: Upper bound for the batch size while it is doubled.
            num_epochs: Number of training epochs.
            lr: Learning rate for Adam.
        """
        # Store all parameters as attributes (required for sklearn)
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.nn = nn
        self.initial_batch_size = initial_batch_size
        self.max_batch_size = max_batch_size
        self.num_epochs = num_epochs
        self.lr = lr

        # Internal attributes
        self.model = None
        self.is_fitted_ = False
        self.training_results_ = None

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _collate(batch):
        """Turn a list of ``(embeddings, label)`` pairs into ``(tokens, labels)`` tensors.

        ``tokens`` is [batch, max_len, dim], left-padded with zeros;
        ``labels`` is float32 of shape [batch, label_size] (scalars become size 1).
        """
        sequences = [torch.as_tensor(seq, dtype=torch.float32) for seq, _ in batch]
        max_len = max(seq.size(0) for seq in sequences)
        tokens = sequences[0].new_zeros(len(sequences), max_len, sequences[0].size(-1))
        for row, seq in enumerate(sequences):
            # Left padding: real tokens occupy the END of each row.
            tokens[row, max_len - seq.size(0):] = seq
        labels = torch.stack(
            [torch.as_tensor(label, dtype=torch.float32).reshape(-1) for _, label in batch]
        )
        return tokens, labels

    @staticmethod
    def _to_classes(values):
        """Map probabilities (or 0/1 / one-hot labels) to integer class ids."""
        if values.dim() > 1 and values.size(-1) > 1:
            return values.argmax(dim=-1)
        # Single output unit: threshold the probability at 0.5.
        return (values.reshape(-1) >= 0.5).long()

    def _check_fitted(self):
        """Raise ``ValueError`` when the estimator has no trained/loaded model."""
        if not self.is_fitted_:
            raise ValueError(
                "This classifier has not been fitted yet. Call 'fit' first."
            )

    def _initialize_model(self):
        """Initialize the neural network model"""
        if self.nn is None:
            raise ValueError("Neural network class (nn) must be provided")
        self.model = self.nn(self.embedding_dim, self.hidden_size, self.num_classes)

    # ----------------------------------------------------------------- training

    def fit(self, X, y):
        """
        Fit the sequence classifier.

        Args:
            X: list/array of embedded sequences, each of shape [seq_len, embedding_dim]
            y: one label per sequence (0/1, or a vector of size ``num_classes``)
        Returns:
            self: Returns the instance itself
        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        self._initialize_model()
        train_dataset = SequenceDataset(X, y)

        # The network already ends in a sigmoid, so use BCELoss (not the
        # *WithLogits* variant, which would apply a second sigmoid).
        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        train_losses = []
        train_accs = []
        current_batch_size = self.initial_batch_size

        print(f"Starting training with batch size {current_batch_size}")
        pbar = trange(self.num_epochs)
        for epoch in pbar:
            # Rebuilt every epoch because the batch size may have grown.
            train_loader = DataLoader(
                train_dataset,
                batch_size=current_batch_size,
                shuffle=True,
                collate_fn=self._collate,
            )

            self.model.train()
            epoch_loss = 0.0
            num_batches = 0
            num_correct = 0
            num_seen = 0

            for tokens, labels in train_loader:
                optimizer.zero_grad()

                probs = self.model(tokens)
                loss = criterion(probs, labels)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                num_batches += 1
                num_correct += (
                    (self._to_classes(probs.detach()) == self._to_classes(labels))
                    .sum()
                    .item()
                )
                num_seen += labels.size(0)

            avg_epoch_loss = epoch_loss / num_batches
            epoch_acc = num_correct / num_seen
            train_losses.append(avg_epoch_loss)
            train_accs.append(epoch_acc)

            pbar.set_postfix(
                {
                    "Train Loss": f"{avg_epoch_loss:.4f}",
                    "Train Acc": f"{epoch_acc:.4f}",
                    "Batch Size": f"{current_batch_size}",
                }
            )

            # Once the loss is very small, double the batch size up to the cap.
            if avg_epoch_loss < 0.01 and current_batch_size < self.max_batch_size:
                current_batch_size = min(current_batch_size * 2, self.max_batch_size)

        self.training_results_ = {
            "train_losses": train_losses,
            "train_accuracies": train_accs,
            "model": self.model,
        }
        # Only mark as fitted after training finished without raising.
        self.is_fitted_ = True

        return self

    # ---------------------------------------------------------------- inference

    def predict(self, X, batch_size=32):
        """
        Predict class labels for samples in X.

        Args:
            X: list/array of embedded sequences (same format as fit method)
            batch_size: batch size for prediction (default: 32)

        Returns:
            list: One predicted integer label per sequence
        """
        self._check_fitted()

        # Labels are not used for prediction; placeholders keep the dataset shape.
        predict_dataset = SequenceDataset(X, [0] * len(X))
        predict_loader = DataLoader(
            predict_dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=self._collate,
        )

        predictions = []
        self.model.eval()

        with torch.no_grad():
            for tokens, _ in predict_loader:
                predictions.extend(self._to_classes(self.model(tokens)).tolist())

        return predictions

    def score(self, X, y=None):
        """
        Return the mean accuracy on the given test data and labels.

        Args:
            X: Test samples
            y: True labels

        Returns:
            float: Mean accuracy score

        Raises:
            ValueError: If the estimator is not fitted or ``y`` is missing.
        """
        self._check_fitted()
        if y is None:
            raise ValueError("score requires the true labels 'y'.")

        # Create evaluation dataset and loader
        eval_dataset = SequenceDataset(X, y)
        eval_loader = DataLoader(
            eval_dataset, batch_size=1, shuffle=False, collate_fn=self._collate
        )

        return self.eval(eval_loader)

    def eval(self, eval_loader):
        """
        Evaluate the model on the given data loader.

        Args:
            eval_loader: Iterable of batches whose first two items are
                ``(tokens, labels)``; any further items are ignored.

        Returns:
            float: Fraction of sequences classified correctly (0.0 if the loader is empty).
        """
        self._check_fitted()

        num_correct = 0
        num_total = 0

        self.model.eval()
        with torch.no_grad():
            for batch in eval_loader:
                tokens, labels = batch[0], batch[1]
                pred = self._to_classes(self.model(tokens))
                target = self._to_classes(labels)

                num_correct += (pred == target).sum().item()
                num_total += target.numel()

        acc = num_correct / num_total if num_total > 0 else 0.0
        return acc

    # ------------------------------------------------------------ sklearn params

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Args:
            deep (bool): Accepted for sklearn compatibility; there are no sub-estimators.

        Returns:
            dict: Parameter names mapped to their values.
        """
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def set_params(self, **params):
        """
        Set the parameters of this estimator.

        Args:
            **params: Estimator parameters.

        Returns:
            self: Estimator instance.

        Raises:
            ValueError: If a key is not a constructor parameter.
        """
        for key, value in params.items():
            if key not in self._PARAM_NAMES:
                raise ValueError(
                    f"Invalid parameter {key} for estimator {type(self).__name__}"
                )
            setattr(self, key, value)

        # Reset fitted state when parameters change
        self.is_fitted_ = False
        self.model = None
        self.training_results_ = None

        return self

    # -------------------------------------------------------------- persistence

    def save_model(self, file_name):
        """
        Save the trained model to a file.

        Args:
            file_name (str): Path where to save the model

        Raises:
            ValueError: If the estimator has not been fitted.
        """
        if not self.is_fitted_:
            raise ValueError("Cannot save model that hasn't been fitted yet.")

        try:
            # Save the model state dict along with every architecture parameter
            # needed to rebuild the network.
            save_dict = {
                "state_dict": self.model.state_dict(),
                "embedding_dim": self.embedding_dim,
                "hidden_size": self.hidden_size,
                "num_classes": self.num_classes,
                "nn_class": self.nn.__name__ if self.nn else None,
            }
            torch.save(save_dict, file_name)
            print(f"Model saved successfully to {file_name}")
        except Exception as e:
            print(f"Error saving model: {e}")
            raise

    def load_model(self, file_name, nn_class=None):
        """
        Load a previously saved model.

        Args:
            file_name (str): Path to the saved model
            nn_class: Neural network class to instantiate; defaults to ``self.nn``

        Raises:
            ValueError: If no network class is available.
        """
        try:
            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints from a trusted source.
            save_dict = torch.load(file_name)

            # Use saved architecture parameters, falling back to this instance's
            # values (older checkpoints did not store hidden_size).
            embedding_dim = save_dict.get("embedding_dim", self.embedding_dim)
            hidden_size = save_dict.get("hidden_size", self.hidden_size)
            num_classes = save_dict.get("num_classes", self.num_classes)

            if nn_class is None:
                nn_class = self.nn
            if nn_class is None:
                raise ValueError(
                    "Neural network class must be provided either in constructor or as parameter"
                )

            # Build the network with the same positional call as _initialize_model.
            model = nn_class(embedding_dim, hidden_size, num_classes)
            # Load the saved state dict
            model.load_state_dict(save_dict["state_dict"])
            model.eval()  # Set to evaluation mode

            # Commit only after everything above succeeded.
            self.embedding_dim = embedding_dim
            self.hidden_size = hidden_size
            self.num_classes = num_classes
            self.model = model
            self.is_fitted_ = True
            print(f"Model loaded successfully from {file_name}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise