In [81]:
import os

os.chdir('../data/')

labels_path = "data/en/train-labels-subtask-2.txt"
articles_path = "data/en/train-articles-subtask-2/"

In [82]:
import pandas as pd

# Read the dev-labels-subtask-2.txt file
labels_df = pd.read_csv(labels_path, sep="\t")

# Rename the columns for easier processing
labels_df.columns = ["article_id", "frames"]


labels_df.head()

Unnamed: 0,article_id,frames
0,832959523,"Morality,Security_and_defense,Policy_prescript..."
1,833039623,"Political,Crime_and_punishment,External_regula..."
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq..."
3,814777937,"Political,Morality,Fairness_and_equality,Exter..."
4,821744708,"Policy_prescription_and_evaluation,Political,L..."


In [83]:
# A function to read the article text given its ID
def get_article_content(article_id):
    try:
        with open(f"{articles_path}/article{article_id}.txt", "r") as f:
            return f.read()
    except FileNotFoundError:
        return None

df = labels_df

# Apply the function to get the article content
df["content"] = df["article_id"].apply(get_article_content)

# Drop rows where content could not be found
df.dropna(subset=["content"], inplace=True)

df.head()


Unnamed: 0,article_id,frames,content
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...


In [84]:
# Split the frames column into a list of frames
df["frames_list"] = df["frames"].str.split(",")

# create for each frame a new column with the frame as name and 1 if the frame is present in the article and 0 if not
for frame in df["frames_list"].explode().unique():
    df[frame] = df["frames_list"].apply(lambda x: 1 if frame in x else 0)

df.head()

Unnamed: 0,article_id,frames,content,frames_list,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...,"[Morality, Security_and_defense, Policy_prescr...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...,"[Political, Crime_and_punishment, External_reg...",0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...,"[Political, Crime_and_punishment, Fairness_and...",0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...,"[Political, Morality, Fairness_and_equality, E...",1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...,"[Policy_prescription_and_evaluation, Political...",0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [85]:
X = df["content"]
y = df.drop(columns=["article_id", "frames", "frames_list", "content"])

In [86]:
X.head()

0    How Theresa May Botched\n\nThose were the time...
1    Robert Mueller III Rests His Case—Dems NEVER W...
2    Robert Mueller Not Recommending Any More Indic...
3    The Far Right Is Trying to Co-opt the Yellow V...
4    ‘Special place in hell’ for those who promoted...
Name: content, dtype: object

In [87]:
y.head()

Unnamed: 0,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [88]:
len(X), len(y)

(432, 432)

In [None]:
from allennlp.predictors.predictor import Predictor
from allennlp_models.structured_prediction.models import srl_bert

# Load the SRL predictor
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

In [129]:
# Function to extract SRL embeddings
def extract_srl_embeddings(article):
    # Get SRL predictions
    srl_output = predictor.predict(sentence=article)
    
    print(srl_output)
    # Extract predicates, ARG0, and ARG1 embeddings
    # Note: This is a simplification. In practice, you might want to use a pre-trained embedding model 
    # (like BERT, GloVe, etc.) to convert these tokens to embeddings.
    #predicates = [verb['verb'] for verb in srl_output['verbs']]
    #ARG0s = [verb['description'].split('[')[1].split(']')[0] if 'ARG0' in verb['description'] else '' for verb in srl_output['verbs']]
    #ARG1s = [verb['description'].split('[')[2].split(']')[0] if 'ARG1' in verb['description'] else '' for verb in srl_output['verbs']]
    
    #return predicates, ARG0s, ARG1s

In [131]:
extract_srl_embeddings("The red horse simply turned around and fought off the fly with its tail.")

{'verbs': [{'verb': 'turned', 'description': '[ARG1: The red horse] [ARGM-ADV: simply] [V: turned] [ARGM-DIR: around] and fought off the fly with its tail .', 'tags': ['B-ARG1', 'I-ARG1', 'I-ARG1', 'B-ARGM-ADV', 'B-V', 'B-ARGM-DIR', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'verb': 'fought', 'description': '[ARG0: The red horse] [ARGM-ADV: simply] turned around and [V: fought] off [ARG1: the fly] [ARG2: with its tail] .', 'tags': ['B-ARG0', 'I-ARG0', 'I-ARG0', 'B-ARGM-ADV', 'O', 'O', 'O', 'B-V', 'O', 'B-ARG1', 'I-ARG1', 'B-ARG2', 'I-ARG2', 'I-ARG2', 'O']}], 'words': ['The', 'red', 'horse', 'simply', 'turned', 'around', 'and', 'fought', 'off', 'the', 'fly', 'with', 'its', 'tail', '.']}


In [126]:
# Extract SRL embeddings for each article
srl_embeddings = [extract_srl_embeddings(article) for article in X]

# Unpack the embeddings
predicates_embeddings, ARG0_embeddings, ARG1_embeddings = zip(*srl_embeddings)

RuntimeError: The size of tensor a (1111) must match the size of tensor b (512) at non-singleton dimension 1

In [89]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random

In [90]:
# Setting up random seed for reproducibility
torch.manual_seed(42)
random.seed(42)

In [91]:
# Tokenize each article into sentences (split by ".")
sentences = [sentence for article in X for sentence in article.split('.') if sentence]

In [92]:
# Generate random embeddings for each sentence (in practice, replace with SRL embeddings)
embedding_dim = 100
sentences_embeddings = [torch.randn(embedding_dim) for _ in sentences]

In [110]:
import torch.nn as nn
import torch

class MultiViewAutoencoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, encoded_dim):
        super(MultiViewAutoencoder, self).__init__()
        
        # For predicates
        self.encoder_p = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim)
        )
        self.decoder_p = nn.Linear(encoded_dim, embedding_dim)
        
        # For ARG0
        self.encoder_a0 = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim)
        )
        self.decoder_a0 = nn.Linear(encoded_dim, embedding_dim)
        
        # For ARG1
        self.encoder_a1 = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim)
        )
        self.decoder_a1 = nn.Linear(encoded_dim, embedding_dim)
    
    def forward(self, x_p, x_a0, x_a1):
        # Encoding
        x_p = self.encoder_p(x_p)
        x_a0 = self.encoder_a0(x_a0)
        x_a1 = self.encoder_a1(x_a1)
        
        # Decoding
        x_p = self.decoder_p(x_p)
        x_a0 = self.decoder_a0(x_a0)
        x_a1 = self.decoder_a1(x_a1)
        
        return x_p, x_a0, x_a1


In [114]:
# Define the classifier
class Classifier(nn.Module):
    def __init__(self, encoded_dim, num_classes):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(encoded_dim, num_classes)
    
    def forward(self, x):
        x = self.fc(x)
        return torch.sigmoid(x)

In [115]:
# Define the dimensions
embedding_dim = 300  # This could be the size of your word embeddings, e.g., 300 for GloVe or Word2Vec
hidden_dim = 150     # This is an intermediate dimension, can be chosen based on model complexity
encoded_dim = 50     # This is the final encoded dimension

# Initialize the autoencoder
autoencoder = MultiViewAutoencoder(embedding_dim, hidden_dim, encoded_dim)


In [116]:
num_classes = len(y.columns)

classifier = Classifier(encoded_dim, num_classes)

In [117]:
sentences_embeddings[:2], autoencoder, classifier

([tensor([ 1.9269e+00,  1.4873e+00,  9.0072e-01, -2.1055e+00,  6.7842e-01,
          -1.2345e+00, -4.3067e-02, -1.6047e+00, -7.5214e-01,  1.6487e+00,
          -3.9248e-01, -1.4036e+00, -7.2788e-01, -5.5943e-01, -7.6884e-01,
           7.6245e-01,  1.6423e+00, -1.5960e-01, -4.9740e-01,  4.3959e-01,
          -7.5813e-01,  1.0783e+00,  8.0080e-01,  1.6806e+00,  1.2791e+00,
           1.2964e+00,  6.1047e-01,  1.3347e+00, -2.3162e-01,  4.1759e-02,
          -2.5158e-01,  8.5986e-01, -1.3847e+00, -8.7124e-01, -2.2337e-01,
           1.7174e+00,  3.1888e-01, -4.2452e-01,  3.0572e-01, -7.7459e-01,
          -1.5576e+00,  9.9564e-01, -8.7979e-01, -6.0114e-01, -1.2742e+00,
           2.1228e+00, -1.2347e+00, -4.8791e-01, -9.1382e-01, -6.5814e-01,
           7.8024e-02,  5.2581e-01, -4.8799e-01,  1.1914e+00, -8.1401e-01,
          -7.3599e-01, -1.4032e+00,  3.6004e-02, -6.3477e-02,  6.7561e-01,
          -9.7807e-02,  1.8446e+00, -1.1845e+00,  1.3835e+00,  1.4451e+00,
           8.5641e-01,  2

In [118]:
# Define the loss functions
reconstruction_loss_fn = nn.MSELoss()
classification_loss_fn = nn.BCELoss()

# Define the optimizer (both models' parameters are optimized jointly)
optimizer = optim.Adam(list(autoencoder.parameters()) + list(classifier.parameters()), lr=0.001)

# Number of epochs
epochs = 10

# Dummy target frames for demonstration (replace with actual frame data)
targets = [torch.randint(0, 2, (num_classes,)).float() for _ in range(len(sentences_embeddings))]

# Assuming sentences_embeddings is a tuple of (predicates_embeddings, ARG0_embeddings, ARG1_embeddings)
predicates_embeddings, ARG0_embeddings, ARG1_embeddings = sentences_embeddings

# Training loop
for epoch in range(epochs):
    for embedding_p, embedding_a0, embedding_a1, target in zip(predicates_embeddings, ARG0_embeddings, ARG1_embeddings, targets):
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass through the autoencoder
        reconstructed_p, reconstructed_a0, reconstructed_a1 = autoencoder(embedding_p, embedding_a0, embedding_a1)

        # Compute the reconstruction loss for each view
        reconstruction_loss_p = reconstruction_loss_fn(reconstructed_p, embedding_p)
        reconstruction_loss_a0 = reconstruction_loss_fn(reconstructed_a0, embedding_a0)
        reconstruction_loss_a1 = reconstruction_loss_fn(reconstructed_a1, embedding_a1)

        # Total reconstruction loss
        total_reconstruction_loss = reconstruction_loss_p + reconstruction_loss_a0 + reconstruction_loss_a1

        # Forward pass through the classifier (using the encoded embeddings of each view)
        encoded_p = autoencoder.encoder_p(embedding_p)
        encoded_a0 = autoencoder.encoder_a0(embedding_a0)
        encoded_a1 = autoencoder.encoder_a1(embedding_a1)

        # Combine the encoded embeddings (e.g., by averaging) before passing to the classifier
        combined_encoded_embedding = (encoded_p + encoded_a0 + encoded_a1) / 3.0
        frame_predictions = classifier(combined_encoded_embedding)

        # Compute the classification loss
        classification_loss = classification_loss_fn(frame_predictions, target)

        # Combine the losses
        combined_loss = total_reconstruction_loss + classification_loss

        # Backward pass and optimization
        combined_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Reconstruction Loss: {total_reconstruction_loss.item()}, Classification Loss: {classification_loss.item()}")


ValueError: too many values to unpack (expected 3)

In [104]:
def predict_frames(article, autoencoder, classifier):
    # Tokenize the article into sentences
    sentences = article.split('.')
    
    # Extract SRL embeddings for the sentences (use random embeddings for this demo)
    embeddings = [torch.randn(embedding_dim) for sentence in sentences]
    
    # List to store frame predictions for each sentence
    all_predictions = []
    
    with torch.no_grad():  # Ensure no gradients are computed during inference
        for embedding in embeddings:
            # Pass the embedding through the trained encoder
            encoded_embedding = autoencoder.encoder(embedding)
            
            # Pass the encoded embedding through the trained classifier
            frame_predictions = classifier(encoded_embedding)
            
            # Convert frame predictions to binary (0 or 1) using a threshold (e.g., 0.5)
            frame_predictions = (frame_predictions > 0.5).float()
            
            all_predictions.append(frame_predictions)
    
    # Aggregate sentence-level predictions to get document-level prediction (average in this case)
    avg_prediction = torch.mean(torch.stack(all_predictions), dim=0)
    document_prediction = (avg_prediction > 0.5).float()
    
    return document_prediction


In [109]:
# read article from data\en\dev-articles-subtask-2\article813452859.txt
with open("data/en/dev-articles-subtask-2/article813452859.txt", "r") as f:
    article = f.read()

# Predict frames for the article
predicted_frames = predict_frames(article, autoencoder, classifier)

# Convert the predicted frames to a list of frames
predicted_frames = [y.columns[i] for i, frame in enumerate(predicted_frames) if frame == 1]


# read the true frames from data\en\dev-labels-subtask-2.txt
with open("data/en/dev-labels-subtask-2.txt", "r") as f:
    true_frames = f.readlines()[0].split("\t")[1].split(",")

true_frames, predicted_frames


(['Political',
  'Fairness_and_equality',
  'Policy_prescription_and_evaluation',
  'Security_and_defense',
  'Economic',
  'Public_opinion\n'],
 ['Security_and_defense',
  'Legality_Constitutionality_and_jurisprudence',
  'Economic',
  'Crime_and_punishment',
  'Public_opinion',
  'Fairness_and_equality'])

In [107]:
# Compute the F1 score
def f1_score(predicted_frames, true_frames):
    tp = len(set(predicted_frames) & set(true_frames))
    fp = len(set(predicted_frames) - set(true_frames))
    fn = len(set(true_frames) - set(predicted_frames))
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

f1_score(predicted_frames, true_frames)

0.3636363636363636