In [1]:
import os

os.chdir('../data/')

labels_path = "data/en/dev-labels-subtask-3.txt"
articles_path = "data/en/dev-articles-subtask-3/"

In [2]:
import pandas as pd

# Read the dev-labels-subtask-2.txt file
labels_df = pd.read_csv(labels_path, sep='\t', header=None, names=["article_id", "sentence_id", "persuasion_technique"])

labels_df.head()

Unnamed: 0,article_id,sentence_id,persuasion_technique
0,813452859,1,
1,813452859,3,
2,813452859,4,
3,813452859,5,
4,813452859,6,


In [3]:
# Get unique article IDs from the dev-labels data
unique_article_ids = labels_df['article_id'].unique()

# Initialize an empty list to store results
results = []

# For each unique article ID, read the corresponding article file and join with the dev-labels data
for article_id in unique_article_ids:
    # Construct the file path for the article
    file_path = f"{articles_path}/article{article_id}.txt"
    
    try:
        # Load the article file
        with open(file_path, 'r') as file:
            # Read the article sentences into a list
            sentences = file.readlines()

        # Filter dev-labels data for the current article_id and where persuasion_technique is not NaN
        relevant_rows = labels_df[(labels_df['article_id'] == article_id) & (~labels_df['persuasion_technique'].isna())]

        # For each relevant row, get the corresponding sentence and persuasion technique and append to the results list
        for _, row in relevant_rows.iterrows():
            sentence = sentences[row['sentence_id'] - 1].strip()  # Subtracting 1 because list indexing starts from 0
            technique = row['persuasion_technique']
            results.append([article_id, row['sentence_id'], sentence, technique])
    
    except FileNotFoundError:
        # If the file for an article_id doesn't exist, continue to the next one
        continue

# Convert the results list to a dataframe
df = pd.DataFrame(results, columns=['article_id', 'sentence_id', 'sentence', 'persuasion_technique'])

df.head()


Unnamed: 0,article_id,sentence_id,sentence,persuasion_technique
0,813452859,7,Michael Swadling: I guess her only chance is i...,"False_Dilemma-No_Choice,Loaded_Language"
1,813452859,9,There is a chance; as unfortunately there are ...,"False_Dilemma-No_Choice,Loaded_Language,Name_C..."
2,813452859,11,Michael Swadling: The EU withdrawal act is in ...,Conversation_Killer
3,813452859,12,I often use the example of an iPhone to people...,"Conversation_Killer,Red_Herring"
4,813452859,15,Michael Swadling: The EU makes a profit on its...,Obfuscation-Vagueness-Confusion


In [4]:
# Split the frames column into a list of frames
df["persuasion_technique_list"] = df["persuasion_technique"].str.split(",")

# create for each frame a new column with the frame as name and 1 if the frame is present in the article and 0 if not
for frame in df["persuasion_technique_list"].explode().unique():
    df[frame] = df["persuasion_technique_list"].apply(lambda x: 1 if frame in x else 0)

df.head()

Unnamed: 0,article_id,sentence_id,sentence,persuasion_technique,persuasion_technique_list,False_Dilemma-No_Choice,Loaded_Language,Name_Calling-Labeling,Conversation_Killer,Red_Herring,...,Flag_Waving,Doubt,Whataboutism,Appeal_to_Fear-Prejudice,Causal_Oversimplification,Appeal_to_Hypocrisy,Appeal_to_Popularity,Appeal_to_Authority,Straw_Man,Guilt_by_Association
0,813452859,7,Michael Swadling: I guess her only chance is i...,"False_Dilemma-No_Choice,Loaded_Language","[False_Dilemma-No_Choice, Loaded_Language]",1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,813452859,9,There is a chance; as unfortunately there are ...,"False_Dilemma-No_Choice,Loaded_Language,Name_C...","[False_Dilemma-No_Choice, Loaded_Language, Nam...",1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,813452859,11,Michael Swadling: The EU withdrawal act is in ...,Conversation_Killer,[Conversation_Killer],0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,813452859,12,I often use the example of an iPhone to people...,"Conversation_Killer,Red_Herring","[Conversation_Killer, Red_Herring]",0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,813452859,15,Michael Swadling: The EU makes a profit on its...,Obfuscation-Vagueness-Confusion,[Obfuscation-Vagueness-Confusion],0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X = df["sentence"]
y = df.drop(columns=["article_id", "sentence_id", "sentence", "persuasion_technique", "persuasion_technique_list"])

In [6]:
X.head()

0    Michael Swadling: I guess her only chance is i...
1    There is a chance; as unfortunately there are ...
2    Michael Swadling: The EU withdrawal act is in ...
3    I often use the example of an iPhone to people...
4    Michael Swadling: The EU makes a profit on its...
Name: sentence, dtype: object

In [7]:
y.head()

Unnamed: 0,False_Dilemma-No_Choice,Loaded_Language,Name_Calling-Labeling,Conversation_Killer,Red_Herring,Obfuscation-Vagueness-Confusion,Exaggeration-Minimisation,Repetition,Slogans,Flag_Waving,Doubt,Whataboutism,Appeal_to_Fear-Prejudice,Causal_Oversimplification,Appeal_to_Hypocrisy,Appeal_to_Popularity,Appeal_to_Authority,Straw_Man,Guilt_by_Association
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
len(X), len(y)

(1120, 1120)

In [9]:
import torch
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
from allennlp.predictors.predictor import Predictor
from allennlp_models.structured_prediction.models import srl_bert

# Load the SRL predictor
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"Failed to load image Python extension: {e}")
error loading _jsonnet (this is expected on Windows), treating C:\Users\elias\AppData\Local\Temp\tmpt9i9kftq\config.json as plain json
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing

In [23]:
import random

def extract_srl_components(article, predictor):
    """
    Extract SRL components for an article.
    """
    srl = predictor.predict(sentence=article)
    
    extracted_data = []
    for verb_entry in srl['verbs']:
        predicate = verb_entry['verb']
        tags = verb_entry['tags']
        
        arg0_indices = [i for i, tag in enumerate(tags) if tag in ['B-ARG0', 'I-ARG0']]
        arg1_indices = [i for i, tag in enumerate(tags) if tag in ['B-ARG1', 'I-ARG1']]
        
        arg0 = [srl['words'][i] for i in arg0_indices] if arg0_indices else []
        arg1 = [srl['words'][i] for i in arg1_indices] if arg1_indices else []
        
        extracted_data.append({
            'predicate': [predicate],
            'ARG0': arg0,
            'ARG1': arg1
        })
        
    return extracted_data

In [24]:
extract_srl_components("The red horse simply turned around and fought off the fly with its tail.", predictor)

[{'predicate': ['turned'], 'ARG0': [], 'ARG1': ['The', 'red', 'horse']},
 {'predicate': ['fought'],
  'ARG0': ['The', 'red', 'horse'],
  'ARG1': ['the', 'fly']}]

In [38]:
def common_member(a, b):
    a_set = set(a)
    b_set = set(b)
    if a_set & b_set:
        return True
    else:
        return False


In [50]:
interested_tags_list = ["B-V", "B-ARGM-MOD", "B-ARGM-DIR", "B-ARGM-NEG", 'B-ARG0', 'B-ARG1', "B-ARG2", "B-ARG3",
                        "B-ARG4"]
from collections import defaultdict
def get_predicate_span_tags_dict(tags):
    # Initial Structure of span_tags_dict: {"B-V": [(30, 31)], "B-ARG2": [(28,28), (32,40),...]}
    predicate_span_tags_dict = defaultdict(list)
    i = 0
    while i < len(tags):
        if tags[i] in interested_tags_list:
            start_index = i
            i += 1
            while i < len(tags) and tags[i][0] == 'I':
                i += 1
            end_index = i - 1
            predicate_span_tags_dict[tags[start_index]].append((start_index, end_index))
            continue
        else:
            i += 1
    return predicate_span_tags_dict


In [51]:
import numpy as np


real_args_list = ['B-ARG0', 'B-ARG1', "B-ARG2", "B-ARG3", "B-ARG4"]

def process_srl_sents_for_one_doc(srl_doc):
    doc_words = list()

    doc_span_tags = list()
    for srl_sent in srl_doc:
        sent_span_tags = list()
        for srl_verb in srl_sent['verbs']:
            tags = srl_verb['tags']
            predicate_span_tags_dict = get_predicate_span_tags_dict(tags)

            # Pattern 1: The srl unit should have verb present
            if "B-V" not in predicate_span_tags_dict.keys():
                continue
            # Pattern 2: If there are more than one verb, choose the first one for now.
            if len(predicate_span_tags_dict['B-V']) > 1:
                predicate_span_tags_dict['B-V'] = [predicate_span_tags_dict['B-V'][0]]
            # Pattern 3: The srl unit should have at least one real arg
            if not common_member(list(predicate_span_tags_dict.keys()), real_args_list):
                continue
            # Pattern 4: If there are more than one particular type of arg, choose the one closest to the verb:
            # E.G. [ARG1: UNCLE SAM] [ARGM-MOD: CAN'T] [V: TURN] [ARGM-DIR: BACK] [ARG1: ON LEGAL U.S. IMMIGRANTS] .
            assert len(predicate_span_tags_dict['B-V']) == 1
            appro_verb_pos = int((predicate_span_tags_dict['B-V'][0][0] + predicate_span_tags_dict['B-V'][0][1]) / 2)
            for tag, poss in predicate_span_tags_dict.items():
                if len(poss) > 1:
                    appro_poss = [int((pos[0] + pos[1]) / 2) for pos in poss]
                    appro_diff_from_verb = [abs(appro_pos - appro_verb_pos) for appro_pos in appro_poss]
                    index = np.where(appro_diff_from_verb == np.amin(
                        appro_diff_from_verb))  # The returned index is a tuple (array([0, 2]),)
                    predicate_span_tags_dict[tag] = [predicate_span_tags_dict[tag][index[0][0]]]

            sent_span_tags.append(predicate_span_tags_dict)
        if len(sent_span_tags) > 0:
            doc_span_tags.append(sent_span_tags)
            sent_words = srl_sent['words']
            doc_words.append(sent_words)
    # for each doc
    return doc_words, doc_span_tags

In [55]:
srls = predictor.predict(sentence=X[0])
srls

{'verbs': [{'verb': 'guess',
   'description': 'Michael Swadling : [ARG0: I] [V: guess] [ARG1: her only chance is if Labour decides that they want to dishonour democracy and effectively keep us in the EU] .',
   'tags': ['O',
    'O',
    'O',
    'B-ARG0',
    'B-V',
    'B-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'O']},
  {'verb': 'is',
   'description': 'Michael Swadling : I guess [ARG1: her only chance] [V: is] [ARG2: if Labour decides that they want to dishonour democracy and effectively keep us in the EU] .',
   'tags': ['O',
    'O',
    'O',
    'O',
    'O',
    'B-ARG1',
    'I-ARG1',
    'I-ARG1',
    'B-V',
    'B-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2',
    'I-ARG2

In [54]:
process_srl_sents_for_one_doc([srls])

([['The',
   'red',
   'horse',
   'simply',
   'turned',
   'around',
   'and',
   'fought',
   'off',
   'the',
   'fly',
   'with',
   'its',
   'tail']],
 [[defaultdict(list,
               {'B-ARG1': [(0, 2)], 'B-V': [(4, 4)], 'B-ARGM-DIR': [(5, 5)]}),
   defaultdict(list,
               {'B-ARG0': [(0, 2)],
                'B-V': [(7, 7)],
                'B-ARG1': [(9, 10)],
                'B-ARG2': [(11, 13)]})]])

In [28]:
test_article = X[0]
test_article

'Michael Swadling: I guess her only chance is if Labour decides that they want to dishonour democracy and effectively keep us in the EU.'

In [30]:
len(extract_srl_components(test_article, predictor))

6

In [25]:
# Extract SRL embeddings for each article
srl_embeddings = [extract_srl_components(article, predictor) for article in X[:20]]

In [27]:
srl_embeddings[0]

[{'predicate': ['guess'],
  'ARG0': ['I'],
  'ARG1': ['her',
   'only',
   'chance',
   'is',
   'if',
   'Labour',
   'decides',
   'that',
   'they',
   'want',
   'to',
   'dishonour',
   'democracy',
   'and',
   'effectively',
   'keep',
   'us',
   'in',
   'the',
   'EU']},
 {'predicate': ['is'], 'ARG0': [], 'ARG1': ['her', 'only', 'chance']},
 {'predicate': ['decides'],
  'ARG0': ['Labour'],
  'ARG1': ['that',
   'they',
   'want',
   'to',
   'dishonour',
   'democracy',
   'and',
   'effectively',
   'keep',
   'us',
   'in',
   'the',
   'EU']},
 {'predicate': ['want'],
  'ARG0': ['they'],
  'ARG1': ['to',
   'dishonour',
   'democracy',
   'and',
   'effectively',
   'keep',
   'us',
   'in',
   'the',
   'EU']},
 {'predicate': ['dishonour'], 'ARG0': ['they'], 'ARG1': ['democracy']},
 {'predicate': ['keep'], 'ARG0': ['they'], 'ARG1': ['us']}]

In [26]:
# transform the list of dicts to 3 lists
predicates = [srl.get("predicate", "") for srl in srl_embeddings]
arg0s = [srl.get("ARG0", "") for srl in srl_embeddings]
arg1s = [srl.get('ARG1') for srl in srl_embeddings]

AttributeError: 'list' object has no attribute 'get'

In [15]:
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
def strings_to_wordwise_embeddings(strings_list):
    all_embeddings = []

    for s in strings_list:
        # If none, return empty embedding
        if s is None:
            all_embeddings.append([torch.zeros(768, device=device)])
            continue
        
        word_embeddings = []

        # Split the string into words
        words = s.split()

        # Tokenize and get embeddings for each word
        for word in words:
            inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            
            with torch.no_grad():
                outputs = model(**inputs)
            
            word_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().detach()
            word_embeddings.append(word_embedding)

        all_embeddings.append(word_embeddings)

    return all_embeddings

# Note: This refactored function assumes that the tokenizer and model are already defined and loaded in the global scope.


In [17]:
strings_to_wordwise_embeddings(["The red horse simply turned its tail."])[0]

[tensor([ 8.7686e-02, -1.7619e-01, -2.0995e-01,  6.3118e-02, -2.5592e-02,
         -1.2145e-01,  2.6319e-01, -1.0966e-01,  1.7178e-01, -3.7934e-01,
         -2.1645e-01,  6.7345e-02,  4.3943e-02, -5.4717e-03, -2.2551e-01,
         -2.2371e-01,  1.8952e-02,  3.3047e-02, -7.3294e-02,  1.3794e-01,
          1.1153e-01,  1.0966e-01,  1.2979e-01,  1.1445e-01,  7.3667e-02,
          3.7715e-01, -2.8807e-01, -1.8478e-01, -4.0824e-01,  1.0867e-01,
         -1.3929e-01, -2.7609e-01, -8.6871e-03,  3.9589e-01, -2.2685e-01,
         -1.4674e-01, -8.6000e-03, -9.9858e-02, -3.7821e-01, -1.4144e-01,
         -6.5044e-02, -8.4578e-02,  1.3387e-01, -9.6754e-02, -9.9444e-02,
          6.1538e-02, -3.3527e-01,  7.5753e-02, -1.0591e-01,  3.1784e-01,
         -2.7297e-01,  2.2933e-01,  2.7825e-02,  4.5593e-01,  6.9887e-02,
          8.5220e-05,  3.1045e-01,  1.2441e-01, -2.5211e-01, -2.7717e-01,
          2.7479e-01,  1.8580e-01, -4.0700e-02, -5.0711e-04,  2.2583e-01,
          4.4431e-02,  3.6134e-01, -2.

In [18]:
# Convert the lists of strings to lists of embeddings
predicates_embeddings = strings_to_wordwise_embeddings(predicates)
ARG0_embeddings = strings_to_wordwise_embeddings(arg0s)
ARG1_embeddings = strings_to_wordwise_embeddings(arg1s)

KeyboardInterrupt: 

In [154]:
len(ARG0_embeddings[2])

1

In [109]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random

In [110]:
# Setting up random seed for reproducibility
torch.manual_seed(42)
random.seed(42)

In [111]:
# Generate random embeddings for each sentence (in practice, replace with SRL embeddings)
embedding_dim = 100
sentences_embeddings = [torch.randn(embedding_dim) for _ in X]

sentences_embeddings[0]

tensor([ 1.9269e+00,  1.4873e+00,  9.0072e-01, -2.1055e+00,  6.7842e-01,
        -1.2345e+00, -4.3067e-02, -1.6047e+00, -7.5214e-01,  1.6487e+00,
        -3.9248e-01, -1.4036e+00, -7.2788e-01, -5.5943e-01, -7.6884e-01,
         7.6245e-01,  1.6423e+00, -1.5960e-01, -4.9740e-01,  4.3959e-01,
        -7.5813e-01,  1.0783e+00,  8.0080e-01,  1.6806e+00,  1.2791e+00,
         1.2964e+00,  6.1047e-01,  1.3347e+00, -2.3162e-01,  4.1759e-02,
        -2.5158e-01,  8.5986e-01, -1.3847e+00, -8.7124e-01, -2.2337e-01,
         1.7174e+00,  3.1888e-01, -4.2452e-01,  3.0572e-01, -7.7459e-01,
        -1.5576e+00,  9.9564e-01, -8.7979e-01, -6.0114e-01, -1.2742e+00,
         2.1228e+00, -1.2347e+00, -4.8791e-01, -9.1382e-01, -6.5814e-01,
         7.8024e-02,  5.2581e-01, -4.8799e-01,  1.1914e+00, -8.1401e-01,
        -7.3599e-01, -1.4032e+00,  3.6004e-02, -6.3477e-02,  6.7561e-01,
        -9.7807e-02,  1.8446e+00, -1.1845e+00,  1.3835e+00,  1.4451e+00,
         8.5641e-01,  2.2181e+00,  5.2317e-01,  3.4

In [112]:
import torch.nn as nn
import torch

class MultiViewAutoencoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, encoded_dim):
        super(MultiViewAutoencoder, self).__init__()
        
        # For predicates
        self.encoder_p = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim)
        )
        self.decoder_p = nn.Linear(encoded_dim, embedding_dim)
        
        # For ARG0
        self.encoder_a0 = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim)
        )
        self.decoder_a0 = nn.Linear(encoded_dim, embedding_dim)
        
        # For ARG1
        self.encoder_a1 = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim)
        )
        self.decoder_a1 = nn.Linear(encoded_dim, embedding_dim)
    
    def forward(self, x_p, x_a0, x_a1):
        # Encoding
        x_p = self.encoder_p(x_p)
        x_a0 = self.encoder_a0(x_a0)
        x_a1 = self.encoder_a1(x_a1)
        
        # Decoding
        x_p = self.decoder_p(x_p)
        x_a0 = self.decoder_a0(x_a0)
        x_a1 = self.decoder_a1(x_a1)
        
        return x_p, x_a0, x_a1


In [113]:
# Define the classifier
class Classifier(nn.Module):
    def __init__(self, encoded_dim, num_classes):
        super(Classifier, self).__init__()
        self.fc = nn.Linear(encoded_dim, num_classes)
    
    def forward(self, x):
        x = self.fc(x)
        return x.squeeze()  # Squeeze the tensor to remove singleton dimensions


In [114]:
# Define the dimensions
embedding_dim = 768  # This could be the size of your word embeddings, e.g., 768 for BERT
hidden_dim = 150     # This is an intermediate dimension, can be chosen based on model complexity
encoded_dim = 50     # This is the final encoded dimension

# Initialize the autoencoder
autoencoder = MultiViewAutoencoder(embedding_dim, hidden_dim, encoded_dim)

In [115]:
num_classes = len(y.columns)

classifier = Classifier(encoded_dim, num_classes)

In [116]:
autoencoder, classifier

(MultiViewAutoencoder(
   (encoder_p): Sequential(
     (0): Linear(in_features=768, out_features=150, bias=True)
     (1): ReLU()
     (2): Linear(in_features=150, out_features=50, bias=True)
   )
   (decoder_p): Linear(in_features=50, out_features=768, bias=True)
   (encoder_a0): Sequential(
     (0): Linear(in_features=768, out_features=150, bias=True)
     (1): ReLU()
     (2): Linear(in_features=150, out_features=50, bias=True)
   )
   (decoder_a0): Linear(in_features=50, out_features=768, bias=True)
   (encoder_a1): Sequential(
     (0): Linear(in_features=768, out_features=150, bias=True)
     (1): ReLU()
     (2): Linear(in_features=150, out_features=50, bias=True)
   )
   (decoder_a1): Linear(in_features=50, out_features=768, bias=True)
 ),
 Classifier(
   (fc): Linear(in_features=50, out_features=19, bias=True)
 ))

In [119]:
import torch.optim as optim
import torch.nn as nn

# Define the loss functions
reconstruction_loss_fn = nn.MSELoss()
classification_loss_fn = nn.BCEWithLogitsLoss()  # Updated loss function

# Define the optimizer (both models' parameters are optimized jointly)
optimizer = optim.Adam(list(autoencoder.parameters()) + list(classifier.parameters()), lr=0.001)

# Number of epochs
epochs = 10

# Convert y dataframe to a list of tensors
targets = [torch.tensor(y.iloc[i].values).float() for i in range(len(y))]

# Training loop
for epoch in range(epochs):
    for embedding_p, embedding_a0, embedding_a1, target in zip(predicates_embeddings, ARG0_embeddings, ARG1_embeddings, targets):
        
        # Move tensors to the same device as the model
        embedding_p = embedding_p.to(device)
        embedding_a0 = embedding_a0.to(device)
        embedding_a1 = embedding_a1.to(device)
        target = target.to(device)


        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass through the autoencoder
        reconstructed_p, reconstructed_a0, reconstructed_a1 = autoencoder(embedding_p, embedding_a0, embedding_a1)

        # Compute the reconstruction loss for each view
        reconstruction_loss_p = reconstruction_loss_fn(reconstructed_p, embedding_p)
        reconstruction_loss_a0 = reconstruction_loss_fn(reconstructed_a0, embedding_a0)
        reconstruction_loss_a1 = reconstruction_loss_fn(reconstructed_a1, embedding_a1)

        # Total reconstruction loss
        total_reconstruction_loss = reconstruction_loss_p + reconstruction_loss_a0 + reconstruction_loss_a1

        # Forward pass through the classifier (using the encoded embeddings of each view)
        encoded_p = autoencoder.encoder_p(embedding_p)
        encoded_a0 = autoencoder.encoder_a0(embedding_a0)
        encoded_a1 = autoencoder.encoder_a1(embedding_a1)

        # Combine the encoded embeddings (e.g., by averaging) before passing to the classifier
        combined_encoded_embedding = (encoded_p + encoded_a0 + encoded_a1) / 3.0
        frame_predictions = classifier(combined_encoded_embedding)

        # Compute the classification loss
        classification_loss = classification_loss_fn(frame_predictions, target)

        # Combine the losses
        combined_loss = total_reconstruction_loss + classification_loss

        # Backward pass and optimization
        combined_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Reconstruction Loss: {total_reconstruction_loss.item()}, Classification Loss: {classification_loss.item()}")


torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size([768])
torch.Size

KeyboardInterrupt: 

In [None]:
def predict_frames(article, autoencoder, classifier):
    # Tokenize the article into sentences
    sentences = article.split('.')
    
    # Extract SRL embeddings for the sentences (use random embeddings for this demo)
    embeddings = [torch.randn(embedding_dim) for sentence in sentences]
    
    # List to store frame predictions for each sentence
    all_predictions = []
    
    with torch.no_grad():  # Ensure no gradients are computed during inference
        for embedding in embeddings:
            # Pass the embedding through the trained encoder
            encoded_embedding = autoencoder.encoder(embedding)
            
            # Pass the encoded embedding through the trained classifier
            frame_predictions = classifier(encoded_embedding)
            
            # Convert frame predictions to binary (0 or 1) using a threshold (e.g., 0.5)
            frame_predictions = (frame_predictions > 0.5).float()
            
            all_predictions.append(frame_predictions)
    
    # Aggregate sentence-level predictions to get document-level prediction (average in this case)
    avg_prediction = torch.mean(torch.stack(all_predictions), dim=0)
    document_prediction = (avg_prediction > 0.5).float()
    
    return document_prediction


In [None]:
# read article from data\en\dev-articles-subtask-2\article813452859.txt
with open("data/en/dev-articles-subtask-2/article813452859.txt", "r") as f:
    article = f.read()

# Predict frames for the article
predicted_frames = predict_frames(article, autoencoder, classifier)

# Convert the predicted frames to a list of frames
predicted_frames = [y.columns[i] for i, frame in enumerate(predicted_frames) if frame == 1]


# read the true frames from data\en\dev-labels-subtask-2.txt
with open("data/en/dev-labels-subtask-2.txt", "r") as f:
    true_frames = f.readlines()[0].split("\t")[1].split(",")

true_frames, predicted_frames


AttributeError: 'MultiViewAutoencoder' object has no attribute 'encoder'

In [None]:
# Compute the F1 score
def f1_score(predicted_frames, true_frames):
    tp = len(set(predicted_frames) & set(true_frames))
    fp = len(set(predicted_frames) - set(true_frames))
    fn = len(set(true_frames) - set(predicted_frames))
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    f1 = 2 * (precision * recall) / (precision + recall)
    
    return f1

f1_score(predicted_frames, true_frames)

0.3636363636363636