## FRISS with MFC

Implementation of the FRISS using the Media Frames Corpus (MFC) from Card et al. (2015). 

In [1]:
!pip install nltk

[0m

In [2]:
import nltk
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [3]:
import os

os.listdir(os.getcwd())

['FRISS_srl.pkl',
 'README.md',
 'logs',
 'used_labels_a1.npy',
 'notebooks',
 'FRISS_SRL_unlabeled.pkl',
 'chunks.pkl',
 'grid_search_metrics.csv',
 'predicted_labels.npy',
 '.git',
 'used_labels_p.npy',
 'results',
 'assets',
 'friss',
 'models',
 'used_labels_a0.npy',
 '.ipynb_checkpoints',
 'data',
 '.gitignore',
 'frameaxis']

In [4]:
labeled_path = "data/mfc/immigration_labeled.json"
unlabeld_path = "data/mfc/immigration_unlabeled.json"
codes_path = "data/mfc/codes.json"

In [5]:
# load data from path 
import json

with open(labeled_path) as f:
    labeled = json.load(f)

with open(unlabeld_path) as f:
    unlabeld = json.load(f)

with open(codes_path) as f:
    codes = json.load(f)

In [6]:
import pandas as pd
from nltk.tokenize import sent_tokenize

In [7]:
def get_labeled_data(labeled, codes):
    # articles list
    articles_list = []

    # Iterate through the data to fill the DataFrame
    for article_id, article_data in labeled.items():
        annotations_data = article_data['annotations']

        irrelevant_dict = annotations_data['irrelevant']

        text = article_data['text']
        irrelevant = article_data['irrelevant']

        # if primary_frame is none set to 15.0
        if article_data['primary_frame'] is not None:
            primary_frame = str(article_data['primary_frame']).split(".")[0] + ".0"
        else:
            primary_frame = "15.0"

        # get primary frame from code
        primary_frame = str(codes[primary_frame])

        # split text into sentences using nltk library
        sentences = sent_tokenize(text)

        # iterate through sentences
        for sentence in sentences:
            article = {
                'article_id': article_id,
                'irrelevant': irrelevant,
                'text': sentence,
                'document_frame': primary_frame
            }

            articles_list.append(article)

    # Create a DataFrame to store the results
    df = pd.DataFrame(articles_list, columns=['article_id', 'irrelevant', 'text', 'document_frame'])

    return df

In [8]:
def get_unlabeled_data(unlabeled):
    # articles list
    articles_list = []

    for idx, article in enumerate(unlabeled):
        article_id = f"unlabeled_{idx}" 
        text = article['text']

        # split text into sentences using nltk library
        sentences = sent_tokenize(text)

        # iterate through sentences
        for sentence in sentences:
            article = {
                'article_id': article_id,
                'text': sentence
            }

            articles_list.append(article)

    # Create a DataFrame to store the results
    df = pd.DataFrame(articles_list, columns=['article_id', 'text'])

    return df

## Get labeled and unlabeled data

In [9]:
df_labeled = get_labeled_data(labeled, codes)
df_unlabeled = get_unlabeled_data(unlabeld)

print("Labeled Count: ", len(df_labeled))
print("Unlabeled Count: ", len(df_unlabeled))

Labeled Count:  74468
Unlabeled Count:  460535


In [10]:
def preprocess_labeled_df(df):
    df = df[df["irrelevant"] == False][["article_id", "text", "document_frame"]]

    # create for each code a col and fill with 1 if code is in code col
    df = pd.concat([df, pd.get_dummies(df['document_frame'])], axis=1)  

    return df

In [11]:
df_labeled = preprocess_labeled_df(df_labeled)

In [12]:
df_labeled.head()

Unnamed: 0,article_id,text,document_frame,Capacity and Resources,Crime and Punishment,Cultural Identity,Economic,External Regulation and Reputation,Fairness and Equality,Health and Safety,"Legality, Constitutionality, Jurisdiction",Morality,Other,Policy Prescription and Evaluation,Political,Public Sentiment,Quality of Life,Security and Defense
0,Immigration1.0-10005,IMM-10005\n\nPRIMARY\n\nImmigrants without HOP...,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Immigration1.0-10005,It mounted as students went around the room te...,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,Immigration1.0-10005,Georgia Tech.,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,Immigration1.0-10005,University of Georgia.,Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,Immigration1.0-10005,"""All I could say was, 'I'm planning to see if ...",Quality of Life,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [13]:
df_unlabeled.head()

Unnamed: 0,article_id,text
0,unlabeled_0,IMM-10000\n\nPRIMARY\n\nMetro Briefing New Yor...
1,unlabeled_0,"As part of the scheme, Ms. Holzer convinced Sp..."
2,unlabeled_0,Katherine E. Finkelstein (NYT)
3,unlabeled_1,IMM-10003\n\nPRIMARY\n\nAmnesty Works for Amer...
4,unlabeled_1,All working families would benefit from immigr...


In [14]:
df_labeled.shape, df_unlabeled.shape

((67480, 18), (460535, 2))

# Fine tune

In [15]:
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [16]:
# Custom dataset class
class ArticlesDataset(Dataset):
    def __init__(self, articles, tokenizer):
        # Tokenize all articles (this may take some time depending on the size of your dataset)
        # This will give you a list of encodings
        self.encodings = tokenizer(articles, max_length=512, truncation=True, padding='max_length', return_tensors='pt')

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


In [17]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# create train and test dataset
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_unlabeled["text"].tolist(), test_size=0.2, random_state=42)

train_dataset = ArticlesDataset(train_df, tokenizer)
test_dataset = ArticlesDataset(test_df, tokenizer)


In [18]:
from torch.utils.data import DataLoader
from transformers import BertTokenizer, DataCollatorForLanguageModeling

# Data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)


In [19]:
# Load model
model = RobertaForMaskedLM.from_pretrained('roberta-base')

In [20]:
from transformers import AdamW

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [21]:
from tqdm.notebook import tqdm

# Detect if we have a GPU available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Send the model to the device (GPU/CPU)
model.to(device)

# Set the model to training mode
model.train()

epochs = 2

# Initialize the progress bar for the epochs
epoch_pbar = tqdm(range(epochs), desc='Epochs', unit='epoch')

# Training loop with tqdm progress bars
for epoch in epoch_pbar:
    # Initialize the progress bar for the batches
    batch_pbar = tqdm(train_loader, desc='Batches', leave=False)

    # Store the total loss for the epoch
    total_loss = 0

    for batch in tqdm(train_loader, desc='Training', leave=False):
        # Each batch is a dictionary with 'input_ids', 'attention_mask', and 'labels'
        # Send all tensors to the same device as the model
        inputs = {k: v.to(device) for k, v in batch.items()}

        # Zero the gradients before performing the backward pass
        optimizer.zero_grad()

        # Perform a forward pass. The model will return the loss.
        outputs = model(**inputs)
        loss = outputs.loss

        # Perform a backward pass to calculate gradients
        loss.backward()

        # Update weights
        optimizer.step()

        # Update the total loss
        total_loss += loss.item()

        # Update the progress bar for batches
        batch_pbar.set_postfix({'Batch loss': loss.item()})

    # Start of evaluation
    model.eval()  # Set the model to evaluation mode
    total_eval_loss = 0
    with torch.no_grad():  # Turn off gradients for evaluation
        for batch in tqdm(test_loader, desc='Evaluating', leave=False):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            total_eval_loss += loss.item()

    # Update the progress bar for epochs
    avg_epoch_loss = total_loss / len(test_loader)
    epoch_pbar.set_postfix({'Average Epoch loss': avg_epoch_loss})
    print(f"Epoch {epoch} finished, Average loss: {avg_epoch_loss}")

Epochs:   0%|          | 0/2 [00:00<?, ?epoch/s]

Batches:   0%|          | 0/46054 [00:00<?, ?it/s]

Training:   0%|          | 0/46054 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


KeyboardInterrupt: 

In [None]:
# Save the model to the specified directory
model_save_path = 'models/mlm_roberta_base'

# Create directory if it doesn't exist
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

model.save_pretrained(model_save_path)

***** Running training *****
  Num examples = 368428
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 69081
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/wandb_init.py", line 1040, in init
    wi.setup(kwargs)
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/wandb_init.py", line 259, in setup
    wand

Exception: problem