# Load SRL

In [12]:
# load pkl from data\srls\mfc\FRISS_srl.pkl

import pickle

with open('../../data/srls/mfc/FRISS_srl.pkl', 'rb') as f:
    data = pickle.load(f)

print(data.keys())

RangeIndex(start=0, stop=67480, step=1)


In [13]:
# create statistics

num_preds = []

for k, v in data.items():
    num_preds.append(len(v))

print('Number of predicates per sentence')
print('Mean:', sum(num_preds) / len(num_preds))
print('Max:', max(num_preds))
print('Min:', min(num_preds))
print('Number of sentences:', len(num_preds))

Number of predicates per sentence
Mean: 3.1542679312388855
Max: 17
Min: 1
Number of sentences: 67480


In [14]:
data[0]

[{'predicate': 'need',
  'ARG0': 'IMM-10005 PRIMARY Immigrants without HOPE',
  'ARG1': 'help entering college Anxiety'},
 {'predicate': 'entering',
  'ARG0': 'IMM-10005 PRIMARY Immigrants without HOPE',
  'ARG1': 'college Anxiety Jose Alvarado'},
 {'predicate': 'gripped',
  'ARG0': 'IMM-10005 PRIMARY Immigrants without HOPE need help entering college Anxiety',
  'ARG1': 'Jose Alvarado'}]

# Load MRC preprocessed Data

In [60]:
# data\mfc\data_prepared.json
import pandas as pd

with open('../../data/mfc/data_prepared.json', 'r') as f:
    data_prepared = pd.read_json(f)

In [62]:
data_prepared.shape

(67480, 18)

In [5]:
# preprocess text

import re

def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('  ', ' ')
    text = text.strip()

    # some texts start with "IMM-XXXXX PRIMARY" remove
    text = re.sub(r'^IMM-\d+ PRIMARY', '', text)

    # remove leading and trailing whitespaces
    text = text.strip()

    return text

data_prepared['text'] = data_prepared['text'].apply(preprocess_text)

In [6]:
data_prepared.head()

Unnamed: 0,article_id,text,document_frame,Capacity and Resources,Crime and Punishment,Cultural Identity,Economic,External Regulation and Reputation,Fairness and Equality,Health and Safety,"Legality, Constitutionality, Jurisdiction",Morality,Other,Policy Prescription and Evaluation,Political,Public Sentiment,Quality of Life,Security and Defense
0,Immigration1.0-10005,Immigrants without HOPE need help entering col...,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Immigration1.0-10005,It mounted as students went around the room te...,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Immigration1.0-10005,Georgia Tech.,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,Immigration1.0-10005,University of Georgia.,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,Immigration1.0-10005,"""All I could say was, 'I'm planning to see if ...",Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [7]:
# stats for text length

text_lengths = data_prepared['text'].apply(len)

In [8]:
text_lengths.describe()

count    67480.000000
mean       139.985803
std         78.341919
min          1.000000
25%         81.000000
50%        130.000000
75%        188.000000
max       1102.000000
Name: text, dtype: float64

In [9]:
df = data_prepared.copy()

In [56]:
# save to json
df.to_json('../../data/mfc/data_prepared_cleaned.json', orient='records')

In [5]:
import pandas as pd

# load
with open('../../data/mfc/data_prepared_cleaned.json', 'r') as f:
    df = pd.read_json(f)

df.shape

(67480, 18)

# Load frameaxis

In [2]:
import pickle

# load data\frameaxis\mfc\frameaxis_contextualized_mft.pkl

with open('../../data/frameaxis/mfc/frameaxis_contextualized_mft.pkl', 'rb') as f:
    data_fa = pickle.load(f)

data_fa.shape

(67480, 11)

In [14]:
data_fa

Unnamed: 0,article_id,care_bias,care_intensity,loyalty_bias,loyalty_intensity,authority_bias,authority_intensity,fairness_bias,fairness_intensity,sanctity_bias,sanctity_intensity
0,Immigration1.0-10005,0.046806,0.001429,-0.072163,0.004429,-0.120223,0.005293,-0.089829,0.003872,-0.100013,0.001405
1,Immigration1.0-10005,0.033659,0.001912,-0.096215,0.002419,-0.124081,0.001463,-0.134707,0.006858,-0.106475,0.000591
2,Immigration1.0-10005,-0.013455,0.005673,-0.187340,0.014931,-0.031740,0.006546,-0.003253,0.005017,-0.068036,0.002993
3,Immigration1.0-10005,0.030489,0.000623,-0.134004,0.009003,-0.061839,0.004271,-0.006635,0.004705,-0.039451,0.005877
4,Immigration1.0-10005,0.050968,0.001929,-0.141105,0.011773,-0.163692,0.007566,-0.144491,0.011913,-0.112075,0.001787
...,...,...,...,...,...,...,...,...,...,...,...
67475,Immigration1.0-9998,0.062358,0.001915,-0.104728,0.004193,-0.113421,0.002445,-0.092194,0.003707,-0.111625,0.002539
67476,Immigration1.0-9998,0.024687,0.001381,-0.204733,0.022664,-0.237031,0.020564,-0.206741,0.018972,-0.133000,0.001048
67477,Immigration1.0-9998,0.073612,0.003475,-0.116507,0.007126,-0.176678,0.008782,-0.110437,0.005405,-0.070875,0.003934
67478,Immigration1.0-9998,0.035231,0.003923,-0.125450,0.005931,-0.125360,0.004460,-0.104501,0.005406,-0.111736,0.001030


In [23]:
# join data_fa with df on row-wise

df_fa = pd.concat([df, data_fa], axis=1)

df_fa.shape

(67480, 29)

In [24]:
# keep 'article_id', 'text', 'care_bias', 'care_intensity', 'loyalty_bias', 'loyalty_intensity', 'authority_bias', 'authority_intensity', 'fairness_bias', 'fairness_intensity', 'sanctity_bias', 'sanctity_intensity'

df_fa = df_fa[['article_id', 'text', 'care_bias', 'care_intensity', 'loyalty_bias', 'loyalty_intensity', 'authority_bias', 'authority_intensity', 'fairness_bias', 'fairness_intensity', 'sanctity_bias', 'sanctity_intensity']]

df_fa.head()

Unnamed: 0,article_id,article_id.1,text,care_bias,care_intensity,loyalty_bias,loyalty_intensity,authority_bias,authority_intensity,fairness_bias,fairness_intensity,sanctity_bias,sanctity_intensity
0,Immigration1.0-10005,Immigration1.0-10005,Immigrants without HOPE need help entering col...,0.046806,0.001429,-0.072163,0.004429,-0.120223,0.005293,-0.089829,0.003872,-0.100013,0.001405
1,Immigration1.0-10005,Immigration1.0-10005,It mounted as students went around the room te...,0.033659,0.001912,-0.096215,0.002419,-0.124081,0.001463,-0.134707,0.006858,-0.106475,0.000591
2,Immigration1.0-10005,Immigration1.0-10005,Georgia Tech.,-0.013455,0.005673,-0.18734,0.014931,-0.03174,0.006546,-0.003253,0.005017,-0.068036,0.002993
3,Immigration1.0-10005,Immigration1.0-10005,University of Georgia.,0.030489,0.000623,-0.134004,0.009003,-0.061839,0.004271,-0.006635,0.004705,-0.039451,0.005877
4,Immigration1.0-10005,Immigration1.0-10005,"""All I could say was, 'I'm planning to see if ...",0.050968,0.001929,-0.141105,0.011773,-0.163692,0.007566,-0.144491,0.011913,-0.112075,0.001787


# Load SRL

In [26]:
# open ../../data/srls/mfc/mfc_labeled.pkl

with open('../../data/srls/mfc/mfc_labeled.pkl', 'rb') as f:
    data_srl = pickle.load(f)

data_srl.shape

(67480, 3)

In [27]:
data_srl

Unnamed: 0,article_id,text,srls
0,Immigration1.0-10005,Immigrants without HOPE need help entering col...,"[{'predicate': 'need', 'ARG0': 'Immigrants wit..."
1,Immigration1.0-10005,It mounted as students went around the room te...,"[{'predicate': 'mounted', 'ARG0': '', 'ARG1': ..."
2,Immigration1.0-10005,Georgia Tech.,"[{'predicate': '', 'ARG0': '', 'ARG1': ''}]"
3,Immigration1.0-10005,University of Georgia.,"[{'predicate': '', 'ARG0': '', 'ARG1': ''}]"
4,Immigration1.0-10005,"""All I could say was, 'I'm planning to see if ...","[{'predicate': 'say', 'ARG0': 'I', 'ARG1': 'Al..."
...,...,...,...
67475,Immigration1.0-9998,"Sue Brown, spokeswoman for the INS, said it's ...","[{'predicate': 'said', 'ARG0': 'Sue Brown , sp..."
67476,Immigration1.0-9998,"""They love it,"" she said.","[{'predicate': 'love', 'ARG0': 'They', 'ARG1':..."
67477,Immigration1.0-9998,"""They use these units to interview the people,...","[{'predicate': 'use', 'ARG0': 'They', 'ARG1': ..."
67478,Immigration1.0-9998,"""We do about 15 interviews a day,"" Brown said.","[{'predicate': 'do', 'ARG0': 'We', 'ARG1': 'ab..."


# Article

In [28]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class ArticleDataset(Dataset):
    def __init__(
        self,
        X,
        X_srl,
        X_frameaxis,
        tokenizer,
        labels=None,
        max_sentences_per_article=32,
        max_sentence_length=32,
        max_args_per_sentence=10,
        max_arg_length=16,
        frameaxis_dim=20,
    ):
        self.X = X
        self.X_srl = X_srl
        self.X_frameaxis = X_frameaxis
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_sentences_per_article = max_sentences_per_article
        self.max_sentence_length = max_sentence_length
        self.max_args_per_sentence = max_args_per_sentence
        self.max_arg_length = max_arg_length

        self.frameaxis_dim = frameaxis_dim

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sentences = self.X.iloc[idx]
        srl_data = self.X_srl.iloc[idx]
        frameaxis_data = self.X_frameaxis.iloc[idx]

        # labels
        labels = self.labels.iloc[idx]

        # Tokenize sentences and get attention masks
        sentence_ids, sentence_attention_masks = [], []
        for sentence in sentences:
            encoded = self.tokenizer(
                sentence,
                add_special_tokens=True,
                max_length=self.max_sentence_length,
                truncation=True,
                padding="max_length",
                return_attention_mask=True,
            )
            sentence_ids.append(encoded["input_ids"])
            sentence_attention_masks.append(encoded["attention_mask"])

        # Padding for sentences if necessary
        while len(sentence_ids) < self.max_sentences_per_article:
            sentence_ids.append([0] * self.max_sentence_length)
            sentence_attention_masks.append([0] * self.max_sentence_length)

        sentence_ids = sentence_ids[: self.max_sentences_per_article]
        sentence_attention_masks = sentence_attention_masks[
            : self.max_sentences_per_article
        ]

        # frameaxis
        while len(frameaxis_data) < self.max_sentences_per_article:
            frameaxis_data.append([0] * self.frameaxis_dim)

        frameaxis_data = frameaxis_data[: self.max_sentences_per_article]

        # replace nan values in frameaxis with 0
        frameaxis_data = pd.DataFrame(frameaxis_data).fillna(0).values.tolist()

        # Process SRL data
        predicates, arg0s, arg1s = [], [], []
        predicate_attention_masks, arg0_attention_masks, arg1_attention_masks = (
            [],
            [],
            [],
        )
        for srl_items in srl_data:
            sentence_predicates, sentence_arg0s, sentence_arg1s = [], [], []
            sentence_predicate_masks, sentence_arg0_masks, sentence_arg1_masks = (
                [],
                [],
                [],
            )

            if not isinstance(srl_items, list):
                srl_items = [srl_items]

            for item in srl_items:
                encoded_predicate = self.tokenizer(
                    item["predicate"],
                    add_special_tokens=True,
                    max_length=self.max_arg_length,
                    truncation=True,
                    padding="max_length",
                    return_attention_mask=True,
                )
                encoded_arg0 = self.tokenizer(
                    item["ARG0"],
                    add_special_tokens=True,
                    max_length=self.max_arg_length,
                    truncation=True,
                    padding="max_length",
                    return_attention_mask=True,
                )
                encoded_arg1 = self.tokenizer(
                    item["ARG1"],
                    add_special_tokens=True,
                    max_length=self.max_arg_length,
                    truncation=True,
                    padding="max_length",
                    return_attention_mask=True,
                )

                sentence_predicates.append(encoded_predicate["input_ids"])
                sentence_arg0s.append(encoded_arg0["input_ids"])
                sentence_arg1s.append(encoded_arg1["input_ids"])

                sentence_predicate_masks.append(encoded_predicate["attention_mask"])
                sentence_arg0_masks.append(encoded_arg0["attention_mask"])
                sentence_arg1_masks.append(encoded_arg1["attention_mask"])

            # Padding for SRL elements
            for _ in range(self.max_args_per_sentence):
                sentence_predicates.append([0] * self.max_arg_length)
                sentence_arg0s.append([0] * self.max_arg_length)
                sentence_arg1s.append([0] * self.max_arg_length)

                sentence_predicate_masks.append([0] * self.max_arg_length)
                sentence_arg0_masks.append([0] * self.max_arg_length)
                sentence_arg1_masks.append([0] * self.max_arg_length)

            sentence_predicates = sentence_predicates[: self.max_args_per_sentence]
            sentence_arg0s = sentence_arg0s[: self.max_args_per_sentence]
            sentence_arg1s = sentence_arg1s[: self.max_args_per_sentence]

            sentence_predicate_masks = sentence_predicate_masks[
                : self.max_args_per_sentence
            ]
            sentence_arg0_masks = sentence_arg0_masks[: self.max_args_per_sentence]
            sentence_arg1_masks = sentence_arg1_masks[: self.max_args_per_sentence]

            predicates.append(sentence_predicates)
            arg0s.append(sentence_arg0s)
            arg1s.append(sentence_arg1s)

            predicate_attention_masks.append(sentence_predicate_masks)
            arg0_attention_masks.append(sentence_arg0_masks)
            arg1_attention_masks.append(sentence_arg1_masks)

        # Padding for SRL data
        srl_padding = [[0] * self.max_arg_length] * self.max_args_per_sentence
        mask_padding = [[0] * self.max_arg_length] * self.max_args_per_sentence

        predicates = (predicates + [srl_padding] * self.max_sentences_per_article)[
            : self.max_sentences_per_article
        ]
        arg0s = (arg0s + [srl_padding] * self.max_sentences_per_article)[
            : self.max_sentences_per_article
        ]
        arg1s = (arg1s + [srl_padding] * self.max_sentences_per_article)[
            : self.max_sentences_per_article
        ]

        predicate_attention_masks = (
            predicate_attention_masks + [mask_padding] * self.max_sentences_per_article
        )[: self.max_sentences_per_article]
        arg0_attention_masks = (
            arg0_attention_masks + [mask_padding] * self.max_sentences_per_article
        )[: self.max_sentences_per_article]
        arg1_attention_masks = (
            arg1_attention_masks + [mask_padding] * self.max_sentences_per_article
        )[: self.max_sentences_per_article]

        data = {
            "sentence_ids": torch.tensor(sentence_ids, dtype=torch.long),
            "sentence_attention_masks": torch.tensor(
                sentence_attention_masks, dtype=torch.long
            ),
            "predicate_ids": torch.tensor(predicates, dtype=torch.long),
            "predicate_attention_masks": torch.tensor(
                predicate_attention_masks, dtype=torch.long
            ),
            "arg0_ids": torch.tensor(arg0s, dtype=torch.long),
            "arg0_attention_masks": torch.tensor(
                arg0_attention_masks, dtype=torch.long
            ),
            "arg1_ids": torch.tensor(arg1s, dtype=torch.long),
            "arg1_attention_masks": torch.tensor(
                arg1_attention_masks, dtype=torch.long
            ),
            "frameaxis": torch.tensor(frameaxis_data, dtype=torch.float),
            "labels": torch.tensor(labels[0], dtype=torch.long),
        }

        return data


def custom_collate_fn(batch):
    # Extract individual lists from the batch
    sentence_ids = [item["sentence_ids"] for item in batch]
    sentence_attention_masks = [item["sentence_attention_masks"] for item in batch]
    predicate_ids = [item["predicate_ids"] for item in batch]
    predicate_attention_masks = [item["predicate_attention_masks"] for item in batch]
    arg0_ids = [item["arg0_ids"] for item in batch]
    arg0_attention_masks = [item["arg0_attention_masks"] for item in batch]
    arg1_ids = [item["arg1_ids"] for item in batch]
    arg1_attention_masks = [item["arg1_attention_masks"] for item in batch]
    frameaxis = [item["frameaxis"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad each list
    sentence_ids = torch.nn.utils.rnn.pad_sequence(
        sentence_ids, batch_first=True, padding_value=0
    )
    sentence_attention_masks = torch.nn.utils.rnn.pad_sequence(
        sentence_attention_masks, batch_first=True, padding_value=0
    )
    predicate_ids = torch.nn.utils.rnn.pad_sequence(
        predicate_ids, batch_first=True, padding_value=0
    )
    predicate_attention_masks = torch.nn.utils.rnn.pad_sequence(
        predicate_attention_masks, batch_first=True, padding_value=0
    )
    arg0_ids = torch.nn.utils.rnn.pad_sequence(
        arg0_ids, batch_first=True, padding_value=0
    )
    arg0_attention_masks = torch.nn.utils.rnn.pad_sequence(
        arg0_attention_masks, batch_first=True, padding_value=0
    )
    arg1_ids = torch.nn.utils.rnn.pad_sequence(
        arg1_ids, batch_first=True, padding_value=0
    )
    arg1_attention_masks = torch.nn.utils.rnn.pad_sequence(
        arg1_attention_masks, batch_first=True, padding_value=0
    )
    frameaxis = torch.nn.utils.rnn.pad_sequence(
        frameaxis, batch_first=True, padding_value=0
    )
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=0)

    # Create the output dictionary
    output_dict = {
        "sentence_ids": sentence_ids,
        "sentence_attention_masks": sentence_attention_masks,
        "predicate_ids": predicate_ids,
        "predicate_attention_masks": predicate_attention_masks,
        "arg0_ids": arg0_ids,
        "arg0_attention_masks": arg0_attention_masks,
        "arg1_ids": arg1_ids,
        "arg1_attention_masks": arg1_attention_masks,
        "frameaxis": frameaxis,
        "labels": labels,
    }

    return output_dict


In [None]:
# Sample df, data_srl, df_fa

df_sample = df_fa.head(10)
df_srl_sample = data_srl.head(10)
df_fa_sample = data_fa.head(10)

