In [None]:
!pip install spacy
!pip install numpy
!pip install torch
!pip install scikit-learn
!pip install tqdm

In [2]:
"""
Unified Relation Extraction Model


This model integrates three sources of features:
- A BiLSTM with attention that processes token embeddings.
- Rule-based features computed via simple regex-based matching.
- SVM-style features computed via a TF-IDF representation.

These three representations are projected into a common space,
concatenated, and fed to a final classifier so that all
information is jointly learned during training.
"""

import json
import math
import spacy
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from collections import Counter
import re
import random
import logging
from tqdm import tqdm
import warnings

In [3]:
warnings.filterwarnings('ignore')

nlp = spacy.load("en_core_web_sm")

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [4]:
class Config:
    def __init__(self):
        # Model parameters
        self.vocab_size = 1000
        self.embed_dim = 300  
        self.hidden_dim = 128
        self.max_length = 128
        self.num_classes = None 
        self.batch_size = 16
        self.learning_rate = 1e-3
        self.weight_decay = 0.01
        self.max_grad_norm = 1.0
        self.max_epochs = 30
        self.accumulation_steps = 4
        self.label_smoothing = 0.1 

        self.model_dir = "models"
        self.model_name = "best_unified_model.pth"
        self.model_path = os.path.join(self.model_dir, self.model_name)

        self.rule_feature_dim = None
        self.svm_feature_dim = 5000
        self.projection_dim = 100

        self.glove_file = "glove.6B.300d.txt"

In [5]:
def load_tacred_dataset():
    """Load TACRED dataset from the dataset directory."""
    data_dir = "dataset"
    with open(os.path.join(data_dir, "train.json"), "r") as f:
        train_data = json.load(f)
    with open(os.path.join(data_dir, "dev.json"), "r") as f:
        dev_data = json.load(f)
    with open(os.path.join(data_dir, "test.json"), "r") as f:
        test_data = json.load(f)
    return train_data, dev_data, test_data

In [6]:
def load_glove_embeddings(glove_file, tokenizer, embed_dim=300):
    """
    Loads pretrained GloVe embeddings and returns a numpy array
    aligned with the tokenizer's word indices.
    """
    vocab_size = len(tokenizer.token2id)
    embeddings = np.random.normal(scale=0.1, size=(vocab_size, embed_dim)).astype(np.float32)

    glove_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.strip().split()
            word = split_line[0]
            vec = np.array(split_line[1:], dtype=np.float32)
            glove_index[word] = vec

    for token, idx in tokenizer.token2id.items():
        if token in glove_index:
            embeddings[idx] = glove_index[token]
    return embeddings

In [7]:
class Tokenizer:
    """
    Simple tokenizer that builds a vocabulary from tokens.
    (Used by the BiLSTM branch.)
    """
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
        self.token2id = {}
        self.id2token = {}
        self.special_tokens = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<CLS>': 2,
            '<SEP>': 3
        }

    def build_vocab(self, texts):
        # texts is a list of token lists
        self.token2id = self.special_tokens.copy()
        current_idx = len(self.special_tokens)
        token_freq = Counter()
        for text in texts:
            token_freq.update(text)
        for token, _ in token_freq.most_common(self.vocab_size - len(self.special_tokens)):
            self.token2id[token] = current_idx
            current_idx += 1
        self.id2token = {v: k for k, v in self.token2id.items()}

    def convert_tokens_to_ids(self, tokens):
        return [self.token2id.get(token, self.special_tokens['<UNK>']) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.id2token.get(id_, '<UNK>') for id_ in ids]

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads=4):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads

        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        Q = self.query(query).view(batch_size, -1, self.num_heads, self.head_dim)
        K = self.key(key).view(batch_size, -1, self.num_heads, self.head_dim)
        V = self.value(value).view(batch_size, -1, self.num_heads, self.head_dim)

        Q = Q.transpose(1, 2)
        K = K.transpose(1, 2)
        V = V.transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attention_weights = F.softmax(scores, dim=-1)
        context = torch.matmul(attention_weights, V)

        context = context.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.head_dim)
        output = self.output(context)
        return output, attention_weights

In [9]:
class BiLSTMWithAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=0)
        self.dropout = nn.Dropout(0.3)
        self.lstm = nn.LSTM(
            config.embed_dim,
            config.hidden_dim,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
            dropout=0.3
        )
        self.attn_fc = nn.Linear(config.hidden_dim * 2, config.hidden_dim * 2)  # for simple attention
        self.fc1 = nn.Linear(config.hidden_dim * 2, config.embed_dim)

    def forward(self, token_ids, positions=None, entity_masks=None, attention_mask=None):
        """
        Forward pass for BiLSTM with a simple self-attention mechanism.
        
        """
        x = self.embedding(token_ids)
        x = self.dropout(x)
        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)

        weights = self.attn_fc(lstm_out)  
        weights = torch.tanh(weights)
        weights = weights.sum(dim=-1)  

  
        if attention_mask is not None:
            weights = weights.masked_fill(attention_mask == 0, -1e9)
        attn_scores = F.softmax(weights, dim=1).unsqueeze(-1) 

       
        weighted_out = lstm_out * attn_scores
        rep = weighted_out.sum(dim=1)  

        rep = F.relu(self.fc1(rep))  
        rep = self.dropout(rep)
        return rep

In [10]:
class RuleBasedExtractor:
    """
    Simple rule-based extractor using regex patterns.
    Instead of returning a prediction, we will use it to create a one-hot
    feature vector (over the set of possible relations).
    """
    def __init__(self):
        self.patterns = {
            'org:founded_by': [
                r'.found.',
                r'.start.',
                r'.create.',
                r'.establish.'
            ],
            'per:employee_of': [
                r'.work.',
                r'.join.',
                r'.lead.',
                r'.head.'
            ],
            'org:city_of_headquarters': [
                r'.located.*in.',
                r'.based.*in.',
                r'.headquartered.*in.'
            ]
        }

    def predict(self, text, subj_span, obj_span):
        try:
            tokens = text.split()
            # Use the text between subject and object (assuming subj then obj)
            between_text = ' '.join(tokens[subj_span[1]:obj_span[0]])
            for relation, patterns in self.patterns.items():
                for pattern in patterns:
                    if re.search(pattern, between_text, re.IGNORECASE):
                        return relation
            return "no_relation"
        except Exception:
            return "no_relation"

In [11]:
def compute_rule_features(text, subj_span, obj_span, relation_map, rule_extractor):
    """
    Computes a one-hot vector (of dimension len(relation_map))
    where the index corresponding to the rule-based prediction is 1.
    """
    pred = rule_extractor.predict(text, subj_span, obj_span)
    one_hot = np.zeros(len(relation_map), dtype=np.float32)
    if pred in relation_map:
        one_hot[relation_map[pred]] = 1.0
    return one_hot


In [12]:
def compute_svm_features(text, subj_span, obj_span, tfidf_vectorizer):
    """
    Computes SVM-style TF-IDF features.
    Uses the pattern: "subj [SEP] between [SEP] obj"
    """
    tokens = text.split()
    subj = ' '.join(tokens[subj_span[0]:subj_span[1]])
    obj = ' '.join(tokens[obj_span[0]:obj_span[1]])
    between = ' '.join(tokens[subj_span[1]:obj_span[0]])
    feature_str = f"{subj} [SEP] {between} [SEP] {obj}"
    vec = tfidf_vectorizer.transform([feature_str]).toarray()[0]
    return vec.astype(np.float32)

In [13]:
class UnifiedRelationDataset(Dataset):
    def __init__(self, data, tokenizer, config, relation_map, tfidf_vectorizer, rule_extractor):
        self.data = data
        self.tokenizer = tokenizer
        self.config = config
        self.relation_map = relation_map
        self.tfidf_vectorizer = tfidf_vectorizer
        self.rule_extractor = rule_extractor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        tokens = example['token']
        subj_span = (example['subj_start'], example['subj_end'])
        obj_span = (example['obj_start'], example['obj_end'])

        # Tokenize
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < self.config.max_length:
            token_ids += [0] * (self.config.max_length - len(token_ids))
        else:
            token_ids = token_ids[:self.config.max_length]

        positions = torch.arange(self.config.max_length)
        entity_masks = torch.zeros(self.config.max_length, dtype=torch.long)
        attention_mask = torch.tensor([1 if tid != 0 else 0 for tid in token_ids])

        for i in range(subj_span[0], min(subj_span[1], self.config.max_length)):
            entity_masks[i] = 1
        for i in range(obj_span[0], min(obj_span[1], self.config.max_length)):
            entity_masks[i] = 2 if entity_masks[i] != 1 else 3

        text = ' '.join(tokens)
        rule_feat = compute_rule_features(text, subj_span, obj_span, self.relation_map, self.rule_extractor)
        svm_feat = compute_svm_features(text, subj_span, obj_span, self.tfidf_vectorizer)

        return {
            'token_ids': torch.tensor(token_ids, dtype=torch.long),
            'positions': positions,  # shape [max_length]
            'entity_masks': entity_masks,
            'attention_mask': attention_mask,
            'label': torch.tensor(self.relation_map[example['relation']], dtype=torch.long),
            'rule_features': torch.tensor(rule_feat, dtype=torch.float),
            'svm_features': torch.tensor(svm_feat, dtype=torch.float)
        }

In [14]:
class LabelSmoothingLoss(nn.Module):
    """
    Replaces CrossEntropy with label smoothing, so the model doesn't become
    too confident in a single class. This can help with class imbalance.
    """
    def __init__(self, num_classes, smoothing=0.1):
        super().__init__()
        self.num_classes = num_classes
        self.smoothing = smoothing
        self.kl_div = nn.KLDivLoss(reduction='batchmean')

    def forward(self, logits, target):
        with torch.no_grad():
            true_dist = torch.zeros_like(logits)
            true_dist.fill_(self.smoothing / (self.num_classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
        log_probs = F.log_softmax(logits, dim=1)
        return self.kl_div(log_probs, true_dist)

In [15]:
class UnifiedRelationExtractor(nn.Module):
    def __init__(self, config, tfidf_dim, rule_dim):
        """
        tfidf_dim: dimension of the TF-IDF vector (e.g. 5000)
        rule_dim: dimension of the rule-based one-hot vector (equal to num_relations)
        """
        super(UnifiedRelationExtractor, self).__init__()
        self.encoder = BiLSTMWithAttention(config)

        # Project rule features and svm features to a common dimension
        self.rule_fc = nn.Linear(rule_dim, config.projection_dim)
        self.svm_fc = nn.Linear(tfidf_dim, config.projection_dim)

       
        in_dim = config.embed_dim + 2 * config.projection_dim  # 300 + 2*100 = 500

        hidden = 128
        self.classifier = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, config.num_classes)
        )

    def forward(self, token_ids, positions, entity_masks, attention_mask, rule_features, svm_features):
        lstm_rep = self.encoder(
            token_ids,
            positions=positions,
            entity_masks=entity_masks,
            attention_mask=attention_mask
        )

        rule_rep = F.relu(self.rule_fc(rule_features))  # shape: [batch_size, 100]
        svm_rep = F.relu(self.svm_fc(svm_features))     # shape: [batch_size, 100]

        combined = torch.cat([lstm_rep, rule_rep, svm_rep], dim=1)

        logits = self.classifier(combined)
        return logits

In [16]:
def train_unified(model, train_loader, dev_loader, config, device):
    if config.label_smoothing > 0:
        criterion = LabelSmoothingLoss(config.num_classes, smoothing=config.label_smoothing)
    else:
        criterion = nn.CrossEntropyLoss()

    optimizer = optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=2, factor=0.5)

    for epoch in range(config.max_epochs):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{config.max_epochs}')
        optimizer.zero_grad()

        for batch_idx, batch in enumerate(progress_bar):
            token_ids = batch['token_ids'].to(device)
            positions = batch['positions'].to(device)        
            entity_masks = batch['entity_masks'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            rule_features = batch['rule_features'].to(device)
            svm_features = batch['svm_features'].to(device)
            labels = batch['label'].to(device)

            logits = model(
                token_ids, positions, entity_masks, attention_mask,
                rule_features, svm_features
            )
            loss = criterion(logits, labels)
            loss = loss / config.accumulation_steps
            loss.backward()

            if (batch_idx + 1) % config.accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()

            total_loss += loss.item() * config.accumulation_steps
            progress_bar.set_postfix({'loss': total_loss / (batch_idx + 1)})

        dev_metrics = evaluate_unified(model, dev_loader, device)
        logger.info(
            f"Epoch {epoch+1}/{config.max_epochs} - "
            f"Train Loss: {total_loss/len(train_loader):.4f} - "
            f"Dev F1: {dev_metrics['f1']:.4f}"
        )

        scheduler.step(dev_metrics['f1'])

    logger.info("Training completed. Saving final model...")
    torch.save(model.state_dict(), config.model_path)
    logger.info(f"Final model saved to {config.model_path}")

In [17]:
def evaluate_unified(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['token_ids'].to(device)
            positions = batch['positions'].to(device)
            entity_masks = batch['entity_masks'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            rule_features = batch['rule_features'].to(device)
            svm_features = batch['svm_features'].to(device)
            labels = batch['label'].to(device)

            logits = model(
                token_ids, positions, entity_masks, attention_mask,
                rule_features, svm_features
            )
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return {
        'f1': f1_score(all_labels, all_preds, average='macro'),
        'accuracy': accuracy_score(all_labels, all_preds)
    }

In [18]:
def main():
    torch.manual_seed(42)
    np.random.seed(42)
    random.seed(42)

    config = Config()
    os.makedirs(config.model_dir, exist_ok=True)

    # Load TACRED dataset
    logger.info("Loading TACRED dataset...")
    train_data, dev_data, test_data = load_tacred_dataset()

    all_data = train_data + dev_data + test_data
    unique_relations = sorted(set(ex['relation'] for ex in all_data))
    relation_map = {rel: idx for idx, rel in enumerate(unique_relations)}
    config.num_classes = len(relation_map)
    config.rule_feature_dim = config.num_classes  # one-hot over relations

    all_tokens = [token for ex in train_data for token in ex['token']]
    tokenizer = Tokenizer(config.vocab_size)
    tokenizer.build_vocab([all_tokens])

    # Load GloVe embeddings
    glove_matrix = load_glove_embeddings(config.glove_file, tokenizer, embed_dim=config.embed_dim)

    
    train_texts = []
    for ex in train_data:
        tokens = ex['token']
        subj_span = (ex['subj_start'], ex['subj_end'])
        obj_span = (ex['obj_start'], ex['obj_end'])
        subj = ' '.join(tokens[subj_span[0]:subj_span[1]])
        obj = ' '.join(tokens[obj_span[0]:obj_span[1]])
        between = ' '.join(tokens[subj_span[1]:obj_span[0]])
        train_texts.append(f"{subj} [SEP] {between} [SEP] {obj}")

    tfidf_vectorizer = TfidfVectorizer(max_features=config.svm_feature_dim)
    tfidf_vectorizer.fit(train_texts)
    actual_feature_dim = len(tfidf_vectorizer.get_feature_names_out())
    config.svm_feature_dim = actual_feature_dim


    rule_extractor = RuleBasedExtractor()

    # Create datasets
    train_dataset = UnifiedRelationDataset(train_data, tokenizer, config, relation_map,
                                           tfidf_vectorizer, rule_extractor)
    dev_dataset = UnifiedRelationDataset(dev_data, tokenizer, config, relation_map,
                                         tfidf_vectorizer, rule_extractor)
    test_dataset = UnifiedRelationDataset(test_data, tokenizer, config, relation_map,
                                          tfidf_vectorizer, rule_extractor)

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=config.batch_size)
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = UnifiedRelationExtractor(
        config,
        tfidf_dim=config.svm_feature_dim,
        rule_dim=config.rule_feature_dim
    )
    
    model.encoder.embedding.weight.data.copy_(torch.from_numpy(glove_matrix))
    model.to(device)

    # ---------------------------
    # Training Phase
    # ---------------------------
    # The training phase is commented out so that the professors can run inference
    # on pre-trained model without needing to retrain it.
    # logger.info("Starting training of the unified model...")
    # train_unified(model, train_loader, dev_loader, config, device)
    
    model.load_state_dict(torch.load(config.model_path, map_location=device))
    test_metrics = evaluate_unified(model, test_loader, device)
    logger.info(f"Test Accuracy: {test_metrics['accuracy']:.4f}, Test F1: {test_metrics['f1']:.4f}")

    
    logger.info("Sample predictions on test instances:")
    model.eval()
    for ex in test_data[800:805]:
        tokens = ex['token']
        text = ' '.join(tokens)
        subj_span = (ex['subj_start'], ex['subj_end'])
        obj_span = (ex['obj_start'], ex['obj_end'])

        # Prepare inputs for the BiLSTM branch
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < config.max_length:
            token_ids += [0] * (config.max_length - len(token_ids))
        else:
            token_ids = token_ids[:config.max_length]

        positions = torch.arange(config.max_length).unsqueeze(0).to(device)
        entity_masks = torch.zeros(config.max_length, dtype=torch.long)
        for i in range(subj_span[0], min(subj_span[1], config.max_length)):
            entity_masks[i] = 1
        for i in range(obj_span[0], min(obj_span[1], config.max_length)):
            entity_masks[i] = 2 if entity_masks[i] != 1 else 3
        entity_masks = entity_masks.unsqueeze(0).to(device)

        attention_mask = torch.tensor([1 if tid != 0 else 0 for tid in token_ids]).unsqueeze(0).to(device)

        
        rule_feat = compute_rule_features(text, subj_span, obj_span, relation_map, rule_extractor)
        svm_feat = compute_svm_features(text, subj_span, obj_span, tfidf_vectorizer)
        rule_feat = torch.tensor(rule_feat, dtype=torch.float).unsqueeze(0).to(device)
        svm_feat = torch.tensor(svm_feat, dtype=torch.float).unsqueeze(0).to(device)

        token_ids_tensor = torch.tensor([token_ids], dtype=torch.long).to(device)

        with torch.no_grad():
            logits = model(
                token_ids_tensor,
                positions,
                entity_masks,
                attention_mask,
                rule_feat,
                svm_feat
            )
            pred_idx = torch.argmax(logits, dim=1).item()
            pred_relation = list(relation_map.keys())[list(relation_map.values()).index(pred_idx)]

        print(f"\nInput: {text}")
        print(f"Subject: {' '.join(tokens[subj_span[0]:subj_span[1]])}")
        print(f"Object: {' '.join(tokens[obj_span[0]:obj_span[1]])}")
        print(f"Predicted Relation: {pred_relation}")

if __name__ == "__main__":
    main()


2025-03-07 12:28:49,129 - INFO - Loading TACRED dataset...
2025-03-07 12:29:38,541 - INFO - Test Accuracy: 0.7592, Test F1: 0.1010
2025-03-07 12:29:38,542 - INFO - Sample predictions on test instances:



Input: For Cephalon , too , the payoff was a bargain : Chief executive Frank Baldino Jr. acknowledged that it made about $ 4 billion `` that no one expected . ''
Subject: Frank Baldino
Object: 
Predicted Relation: no_relation

Input: -LCB- EXCERPT -RCB- New York Times , United States Jamie Leigh Jones , left , a former employee for the military contractor KBR , told Congress that she had been gang-raped by co-workers in Iraq in 2005 ... .
Subject: Jamie Leigh
Object: 
Predicted Relation: no_relation

Input: Information Services Group purchased TPI in October .
Subject: 
Object: 
Predicted Relation: no_relation

Input: Midfielders : Yang Hao , Yan Xiangchuang , Deng Zhuoxiang , Zhou Haibin , Yu Tao , Feng Renliang , Chen Tao , Yu Hai , Zhao Xuri , Yu Hanchao ,
Subject: Chen
Object: Zhou
Predicted Relation: no_relation

Input: Letter from Havana : There is only one kosher butcher in Cuba , but if community president Adela Dworin knows her community -- and she does -- there is also only 