## Classifier - Try 3

Classify articles frame using SRL and a classifier

In [1]:
import os

try:
  import google.colab

  from google.colab import drive
  drive.mount('/content/drive')
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
  os.chdir('drive/MyDrive/Git/MasterThesis/data')
else:
  os.chdir('../../data/')

labels_path = "data/en/train-labels-subtask-2.txt"
articles_path = "data/en/train-articles-subtask-2/"

In [2]:
import pandas as pd

# Read the dev-labels-subtask-2.txt file
labels_df = pd.read_csv(labels_path, sep="\t")

# Rename the columns for easier processing
labels_df.columns = ["article_id", "frames"]


labels_df.head()

Unnamed: 0,article_id,frames
0,832959523,"Morality,Security_and_defense,Policy_prescript..."
1,833039623,"Political,Crime_and_punishment,External_regula..."
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq..."
3,814777937,"Political,Morality,Fairness_and_equality,Exter..."
4,821744708,"Policy_prescription_and_evaluation,Political,L..."


In [3]:
# A function to read the article text given its ID
def get_article_content(article_id):
    try:
        with open(f"{articles_path}/article{article_id}.txt", "r") as f:
            return f.read()
    except FileNotFoundError:
        return None

df = labels_df

# Apply the function to get the article content
df["content"] = df["article_id"].apply(get_article_content)

# Drop rows where content could not be found
df.dropna(subset=["content"], inplace=True)

df.head()


Unnamed: 0,article_id,frames,content
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...


In [4]:
# Split the frames column into a list of frames
df["frames_list"] = df["frames"].str.split(",")

# create for each frame a new column with the frame as name and 1 if the frame is present in the article and 0 if not
for frame in df["frames_list"].explode().unique():
    df[frame] = df["frames_list"].apply(lambda x: 1 if frame in x else 0)

df.head()

Unnamed: 0,article_id,frames,content,frames_list,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...,"[Morality, Security_and_defense, Policy_prescr...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...,"[Political, Crime_and_punishment, External_reg...",0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...,"[Political, Crime_and_punishment, Fairness_and...",0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...,"[Political, Morality, Fairness_and_equality, E...",1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...,"[Policy_prescription_and_evaluation, Political...",0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [5]:
X = df["content"]
y = df.drop(columns=["article_id", "frames", "frames_list", "content"])

In [6]:
X.head()

0    How Theresa May Botched\n\nThose were the time...
1    Robert Mueller III Rests His Case—Dems NEVER W...
2    Robert Mueller Not Recommending Any More Indic...
3    The Far Right Is Trying to Co-opt the Yellow V...
4    ‘Special place in hell’ for those who promoted...
Name: content, dtype: object

In [7]:
y.head()

Unnamed: 0,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [8]:
len(X), len(y)

(432, 432)

### Create Dataset

In [24]:
import torch

# Dataset
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

In [25]:
class ArticleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels.values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.float)  # Note the float type for BCEWithLogitsLoss
        }

In [26]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='transformers')

In [28]:
# Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 965kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 3.96MB/s]


In [29]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Create DataLoaders for train and test sets
BATCH_SIZE = 16

train_dataset = ArticleDataset(X_train, y_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = ArticleDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Create the PyTorch Model

In [11]:
!pip install transformers



In [12]:
import torch
import torch.nn as nn
from transformers import BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
class MultiViewAutoencoder(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', hidden_dim=384, descriptor_dim=15, num_frames=14):
        super(MultiViewAutoencoder, self).__init__()

        # Shared BERT Sentence Encoder
        self.sentence_encoder = BertModel.from_pretrained(bert_model_name)

        # Predicate Autoencoder
        self.encoder_p = self._create_encoder(hidden_dim, descriptor_dim)
        self.dictionary_p = nn.Parameter(torch.randn(descriptor_dim, 768))
        self.decoder_p = nn.Linear(descriptor_dim, 768)

        # ARG0 Autoencoder
        self.encoder_a0 = self._create_encoder(hidden_dim, descriptor_dim)
        self.dictionary_a0 = nn.Parameter(torch.randn(descriptor_dim, 768))
        self.decoder_a0 = nn.Linear(descriptor_dim, 768)

        # ARG1 Autoencoder
        self.encoder_a1 = self._create_encoder(hidden_dim, descriptor_dim)
        self.dictionary_a1 = nn.Parameter(torch.randn(descriptor_dim, 768))
        self.decoder_a1 = nn.Linear(descriptor_dim, 768)

        # Classifier for frame prediction
        self.fc = nn.Linear(descriptor_dim * 3, num_frames)

    def _create_encoder(self, hidden_dim, descriptor_dim):
        return nn.Sequential(
            nn.Linear(2*768, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, descriptor_dim)
        )

    def forward(self, input_ids, attention_mask):
        # Assuming input_ids and attention_mask are for a single sentence
        # Get the token embeddings from BERT
        outputs = self.sentence_encoder(input_ids, attention_mask)
        token_embeddings = outputs.last_hidden_state

        # Compute span embeddings (e.g., for predicate) by averaging token embeddings in the span
        # Here, you'd need to define the start and end indices for each span (predicate, ARG0, ARG1)
        # span_p = token_embeddings[start_idx_p:end_idx_p].mean(dim=1)
        # span_a0 = token_embeddings[start_idx_a0:end_idx_a0].mean(dim=1)
        # span_a1 = token_embeddings[start_idx_a1:end_idx_a1].mean(dim=1)

        # For simplicity in this example, I'm assuming the entire sentence is the span for each role
        span_p = token_embeddings.mean(dim=1)
        span_a0 = token_embeddings.mean(dim=1)
        span_a1 = token_embeddings.mean(dim=1)

        # Concatenate with sentence embedding
        sentence_embedding = token_embeddings.mean(dim=1)
        input_p = torch.cat((span_p, sentence_embedding), dim=1)
        input_a0 = torch.cat((span_a0, sentence_embedding), dim=1)
        input_a1 = torch.cat((span_a1, sentence_embedding), dim=1)

        # Pass through autoencoders
        latent_p = self.encoder_p(input_p)
        latent_a0 = self.encoder_a0(input_a0)
        latent_a1 = self.encoder_a1(input_a1)

        # Decoder (Reconstruction)
        recon_p = torch.matmul(latent_p, self.dictionary_p)
        recon_a0 = torch.matmul(latent_a0, self.dictionary_a0)
        recon_a1 = torch.matmul(latent_a1, self.dictionary_a1)

        # Classifier
        combined_latent = torch.cat((latent_p, latent_a0, latent_a1), dim=1)
        frame_logits = self.fc(combined_latent)

        return frame_logits, (recon_p, recon_a0, recon_a1)


In [14]:
# Example usage
model = MultiViewAutoencoder()
input_ids = torch.tensor([[101, 2045, 2003, 1037, 2879, 102]])  # Example token IDs
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1]])  # Example attention mask

frame_logits, reconstructions = model(input_ids, attention_mask)

Downloading model.safetensors: 100%|██████████| 440M/440M [01:01<00:00, 7.19MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
# Gumbel-Softmax Sampling
def gumbel_softmax(logits, temperature=1.0, eps=1e-10):
    gumbel_noise = -torch.log(-torch.log(torch.rand_like(logits) + eps) + eps)
    y = logits + gumbel_noise
    return torch.nn.functional.softmax(y / temperature, dim=1)

In [16]:
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, v_hat, v, v_negatives):
        positive_distance = torch.norm(v_hat - v, dim=1)
        negative_distances = torch.norm(v_hat.unsqueeze(1) - v_negatives, dim=2)
        losses = torch.clamp(self.margin + positive_distance.unsqueeze(1) - negative_distances, min=0)
        return losses.mean()

In [17]:
class FocalTripletLoss(nn.Module):
    def __init__(self, margin_budget=1.0, t=5):
        super(FocalTripletLoss, self).__init__()
        self.margin_budget = margin_budget
        self.t = t  # number of smallest weight descriptors to consider

    def forward(self, v_hat, v, gz):
        smallest_weights, smallest_indices = torch.topk(gz, k=self.t, largest=False, sorted=False)
        margins = self.margin_budget * (1 - smallest_weights)**2
        distances_to_true = torch.norm(v_hat - v, dim=1, keepdim=True)
        distances_to_descriptors = torch.norm(v_hat.unsqueeze(1) - v_hat[smallest_indices], dim=2)
        losses = torch.clamp(margins + distances_to_true - distances_to_descriptors, min=0)
        return losses.mean()

In [21]:
!pip install allennlp

Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
     ---------------------------------------- 0.0/730.2 kB ? eta -:--:--
      --------------------------------------- 10.2/730.2 kB ? eta -:--:--
     - ----------------------------------- 20.5/730.2 kB 217.9 kB/s eta 0:00:04
     --- --------------------------------- 61.4/730.2 kB 409.6 kB/s eta 0:00:02
     -------------- ----------------------- 276.5/730.2 kB 1.7 MB/s eta 0:00:01
     ----------------------------------- -- 686.1/730.2 kB 3.3 MB/s eta 0:00:01
     -------------------------------------- 730.2/730.2 kB 3.1 MB/s eta 0:00:00
INFO: pip is looking at multiple versions of allennlp to determine which version is compatible with other requirements. This could take a while.
  Downloading allennlp-2.10.0-py3-none-any.whl (729 kB)
     ---------------------------------------- 0.0/729.8 kB ? eta -:--:--
     ------------ ------------------------- 235.5/729.8 kB 7.0 MB/s eta 0:00:01
     ------------------

  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [373 lines of output]
      Collecting setuptools
        Obtaining dependency information for setuptools from https://files.pythonhosted.org/packages/bb/26/7945080113158354380a12ce26873dd6c1ebd88d47f5bc24e2c5bb38c16a/setuptools-68.2.2-py3-none-any.whl.metadata
        Downloading setuptools-68.2.2-py3-none-any.whl.metadata (6.3 kB)
      Collecting wheel<0.33.0,>0.32.0
        Downloading wheel-0.32.3-py2.py3-none-any.whl (21 kB)
      Collecting Cython
        Obtaining dependency information for Cython from https://files.pythonhosted.org/packages/39/5d/5ae976df4e368327864917a24f9dee7c8176de1b5b7044ee9903b8adb07a/Cython-3.0.3-cp311-cp311-win_amd64.whl.metadata
        Downloading Cython-3.0.3-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Obtaining dependency information for cymem<2.1.0,>=2.0.2 from https:

In [20]:
from allennlp.predictors.predictor import Predictor
from allennlp_models.structured_prediction.models import srl_bert

# Load the SRL predictor
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

def extract_srl_components(article, predictor):
    """
    Extract SRL components for an article.
    """
    srl = predictor.predict(sentence=article)
    
    extracted_data = []
    for verb_entry in srl['verbs']:
        predicate = verb_entry['verb']
        tags = verb_entry['tags']
        
        arg0_indices = [i for i, tag in enumerate(tags) if tag in ['B-ARG0', 'I-ARG0']]
        arg1_indices = [i for i, tag in enumerate(tags) if tag in ['B-ARG1', 'I-ARG1']]
        
        arg0 = [srl['words'][i] for i in arg0_indices] if arg0_indices else []
        arg1 = [srl['words'][i] for i in arg1_indices] if arg1_indices else []
        
        extracted_data.append({
            'predicate': [predicate],
            'ARG0': arg0,
            'ARG1': arg1
        })
        
    return extracted_data


ModuleNotFoundError: No module named 'allennlp'