In [1]:
import os

# Detect whether the notebook is running on Google Colab.
# Only a missing `google.colab` package means "not on Colab"; any other
# failure (for example a Drive mount error) is allowed to propagate instead
# of silently falling back to the local directory layout.
try:
  import google.colab

  from google.colab import drive
except ImportError:
  IN_COLAB = False
else:
  # Mount Google Drive so the project checkout below becomes reachable.
  drive.mount('/content/drive')
  IN_COLAB = True

# Move to the directory that the relative data paths below are resolved from.
# NOTE: chdir is relative to the current directory, so re-running this cell
# without restarting the kernel moves the working directory again.
if IN_COLAB:
  os.chdir('drive/MyDrive/Git/MasterThesis/data')
else:
  os.chdir('../')

# Tab-separated labels file and the folder holding the article text files.
labels_path = "data/data/en/train-labels-subtask-2.txt"
articles_path = "data/data/en/train-articles-subtask-2/"

In [2]:
import pandas as pd

# Load the tab-separated labels file found at `labels_path` and give its two
# columns short names for easier processing.
# NOTE(review): read_csv is called without `header=None`, so the first line of
# the file is consumed as a header row before the columns are renamed. Confirm
# the file really starts with a header line; otherwise its first record is lost.
labels_df = pd.read_csv(labels_path, sep="\t").set_axis(
    ["article_id", "frames"], axis=1
)

# Preview the first rows.
labels_df.head()

Unnamed: 0,article_id,frames
0,832959523,"Morality,Security_and_defense,Policy_prescript..."
1,833039623,"Political,Crime_and_punishment,External_regula..."
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq..."
3,814777937,"Political,Morality,Fairness_and_equality,Exter..."
4,821744708,"Policy_prescription_and_evaluation,Political,L..."


In [3]:
def get_article_content(article_id, articles_dir=None):
    """Return the text of the article with the given ID, or None if it is missing.

    Args:
        article_id: Identifier used to build the file name ``article<ID>.txt``.
        articles_dir: Directory that holds the article files. When omitted,
            the notebook-level ``articles_path`` is used.

    Returns:
        The file content as a string, or ``None`` when no such file exists.
    """
    directory = articles_path if articles_dir is None else articles_dir
    try:
        # Decode explicitly as UTF-8: the articles contain non-ASCII
        # punctuation (curly quotes, dashes), and the platform default
        # encoding is not UTF-8 everywhere.
        with open(f"{directory}/article{article_id}.txt", "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None

# Build the working frame as a NEW DataFrame rather than aliasing and mutating
# `labels_df` in place: the labels frame shown by the previous cell stays
# intact, and this cell gives the same result however often it is re-run.
df = (
    labels_df
    # Attach the article text; the lookup yields None for a missing file.
    .assign(content=labels_df["article_id"].apply(get_article_content))
    # Drop rows where content could not be found.
    .dropna(subset=["content"])
)

df.head()


Unnamed: 0,article_id,frames,content
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...


In [4]:
# Article texts only (a pandas Series); used as the corpus for the MLM step.
X = df.loc[:, "content"]

# Set up the DataLoader for Masked Language Modeling

In [9]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, DataCollatorForLanguageModeling
import torch

# `X` is the Series of article texts; the tokenizer needs a plain list of strings.
articles = X.tolist()

# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every article in one call (this may take a while on large corpora).
# Each article is truncated or padded to exactly 512 token positions, and the
# result is returned as PyTorch tensors.
encodings = tokenizer(
    articles,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

class ArticlesDataset(Dataset):
    """Map-style dataset exposing tokenizer encodings one example at a time.

    Args:
        encodings: Mapping from field name (it must contain ``"input_ids"``)
            to a per-example indexable container: either tensors, as produced
            with ``return_tensors='pt'``, or nested Python lists.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return a dict of tensors holding every field of example ``idx``."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # Copy an existing tensor with clone().detach(); wrapping it in
                # torch.tensor() again emits PyTorch's copy-construct UserWarning.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Return the number of encoded examples."""
        # Key access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings["input_ids"])

# Wrap the tokenized articles so they can be served example by example.
dataset = ArticlesDataset(encodings)

# Collator that prepares masked-language-modeling batches, with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Shuffled batches of 32 examples, assembled by the MLM collator.
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

# Fine-tune BERT using Masked Language Modeling (MLM)

In [10]:
from transformers import BertForMaskedLM

# Load BERT with a masked-language-modeling head from the 'bert-base-uncased'
# checkpoint; this is the model that gets fine-tuned below.
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
from transformers import AdamW

# Define the optimizer.
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of torch.optim.AdamW. `eps` and `weight_decay` are set
# explicitly to the defaults of the transformers implementation, because
# torch's defaults differ (eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [17]:
from tqdm.notebook import tqdm

# Train on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model's parameters to the chosen device and enable training mode.
model.to(device)
model.train()

epochs = 10

# Outer progress bar: one tick per epoch.
epoch_pbar = tqdm(range(epochs), desc='Epochs', unit='epoch')

for epoch in epoch_pbar:
    # Inner progress bar over the batches; it is removed once the epoch ends.
    batch_pbar = tqdm(dataloader, desc='Batches', leave=False)

    # Running sum of the per-batch losses for this epoch.
    total_loss = 0

    for batch in batch_pbar:
        # The collated batch is a dict of tensors; move each tensor to the
        # device the model lives on.
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass; the loss is read from the model output.
        outputs = model(**inputs)
        loss = outputs.loss

        # Backward pass, then the parameter update.
        loss.backward()
        optimizer.step()

        # Read the scalar loss once, then reuse it for the running total and
        # for the progress-bar display.
        batch_loss = loss.item()
        total_loss = total_loss + batch_loss
        batch_pbar.set_postfix({'Batch loss': batch_loss})

    # Mean loss over all batches of the epoch (the printed epoch index is 0-based).
    avg_epoch_loss = total_loss / len(dataloader)
    epoch_pbar.set_postfix({'Average Epoch loss': avg_epoch_loss})
    print(f"Epoch {epoch} finished, Average loss: {avg_epoch_loss}")

Epochs:   0%|          | 0/10 [00:00<?, ?epoch/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 0 finished, Average loss: 1.0336125152451652


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 1 finished, Average loss: 1.0124803866658891


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 2 finished, Average loss: 0.9912666593279157


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 3 finished, Average loss: 0.9901557394436428


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 4 finished, Average loss: 0.9864671145166669


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 5 finished, Average loss: 0.9664472384112222


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 6 finished, Average loss: 0.9804157274109977


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 7 finished, Average loss: 0.9802415583814893


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 8 finished, Average loss: 0.986514972788947


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch 9 finished, Average loss: 0.953081590788705


In [18]:
# Persist the fine-tuned model. The path is relative, so it is resolved against
# the working directory selected at the top of the notebook.
model_save_path = '../notebooks/models/fine-tuned-model'
model.save_pretrained(save_directory=model_save_path)