## Classifier - Try 1

Classify if a article has the Morality Frame or not using just the article as input.

In [1]:
import os

os.chdir('../../data/')

labels_path = "data/en/train-labels-subtask-2.txt"
articles_path = "data/en/train-articles-subtask-2/"

In [2]:
import pandas as pd

# Read the dev-labels-subtask-2.txt file
labels_df = pd.read_csv(labels_path, sep="\t")

# Rename the columns for easier processing
labels_df.columns = ["article_id", "frames"]


labels_df.head()

Unnamed: 0,article_id,frames
0,832959523,"Morality,Security_and_defense,Policy_prescript..."
1,833039623,"Political,Crime_and_punishment,External_regula..."
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq..."
3,814777937,"Political,Morality,Fairness_and_equality,Exter..."
4,821744708,"Policy_prescription_and_evaluation,Political,L..."


In [3]:
# A function to read the article text given its ID
def get_article_content(article_id):
    try:
        with open(f"{articles_path}/article{article_id}.txt", "r") as f:
            return f.read()
    except FileNotFoundError:
        return None

df = labels_df

# Apply the function to get the article content
df["content"] = df["article_id"].apply(get_article_content)

# Drop rows where content could not be found
df.dropna(subset=["content"], inplace=True)

df.head()


Unnamed: 0,article_id,frames,content
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...


In [4]:
# Split the frames column into a list of frames
df["frames_list"] = df["frames"].str.split(",")

# create for each frame a new column with the frame as name and 1 if the frame is present in the article and 0 if not
for frame in df["frames_list"].explode().unique():
    df[frame] = df["frames_list"].apply(lambda x: 1 if frame in x else 0)

df.head()

Unnamed: 0,article_id,frames,content,frames_list,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,832959523,"Morality,Security_and_defense,Policy_prescript...",How Theresa May Botched\n\nThose were the time...,"[Morality, Security_and_defense, Policy_prescr...",1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,833039623,"Political,Crime_and_punishment,External_regula...",Robert Mueller III Rests His Case—Dems NEVER W...,"[Political, Crime_and_punishment, External_reg...",0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,833032367,"Political,Crime_and_punishment,Fairness_and_eq...",Robert Mueller Not Recommending Any More Indic...,"[Political, Crime_and_punishment, Fairness_and...",0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,814777937,"Political,Morality,Fairness_and_equality,Exter...",The Far Right Is Trying to Co-opt the Yellow V...,"[Political, Morality, Fairness_and_equality, E...",1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,821744708,"Policy_prescription_and_evaluation,Political,L...",‘Special place in hell’ for those who promoted...,"[Policy_prescription_and_evaluation, Political...",0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [5]:
X = df["content"]
y = df.drop(columns=["article_id", "frames", "frames_list", "content"])

In [6]:
X.head()

0    How Theresa May Botched\n\nThose were the time...
1    Robert Mueller III Rests His Case—Dems NEVER W...
2    Robert Mueller Not Recommending Any More Indic...
3    The Far Right Is Trying to Co-opt the Yellow V...
4    ‘Special place in hell’ for those who promoted...
Name: content, dtype: object

In [7]:
y.head()

Unnamed: 0,Morality,Security_and_defense,Policy_prescription_and_evaluation,Legality_Constitutionality_and_jurisprudence,Economic,Political,Crime_and_punishment,External_regulation_and_reputation,Public_opinion,Fairness_and_equality,Capacity_and_resources,Quality_of_life,Cultural_identity,Health_and_safety
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,1,1,1,1,0,0,0,0,0
2,0,0,0,1,0,1,1,1,0,1,0,0,0,0
3,1,1,0,0,1,1,0,1,1,1,0,0,0,0
4,0,0,1,1,0,1,0,1,0,0,0,0,0,0


In [8]:
# modify y to binary classification morality or Security_and_defense
y = y[["Morality"]]
y.head()

Unnamed: 0,Morality
0,1
1,0
2,0
3,1
4,0


In [29]:
y.value_counts()

Morality
0           230
1           202
Name: count, dtype: int64

In [9]:
len(X), len(y)

(432, 432)

### Create the PyTorch Model

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Tokenize
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [24]:
class ArticleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [25]:
# Create DataLoader
BATCH_SIZE = 16
train_dataset = ArticleDataset(X, y, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [26]:
# try train_dataset
train_dataset[0]

  'label': torch.tensor(label, dtype=torch.long)


{'input_ids': tensor([  101,  2129, 14781,  2089, 28516,  7690,  2216,  2020,  1996,  2335,
          1529,  1996,  2335,  3931,  1015,  2003,  1997,  2254,  2324,  1010,
          2418,  1012,  7776,  2090,  1996,  2142,  2983,  1998,  1996,  2647,
          2586,  2055,  7987, 10288,  4183,  2020,  2074,  2927,  1012,  1996,
          1523,  2017,  1521,  2222,  2022, 10560,  1524, 24416,  1999,  1996,
         17653,  2839, 10057,  1996,  7729,  1996,  2329,  2231,  2104,  2089,
          7645,  2076,  1996,  7566,  1012,  3728,  2008,  7729,  2038,  5399,
          2904,  1012,  2023, 12117, 12326,  2001,  2579,  2055,  2019,  3178,
          3283,  1024,  1996,  4035,  7009,  1024,  2114,  1996,  2110,  1024,
          2019,  1012,  1012,  1012,  2222,  7974,  5349,  6038,  1044,  1012,
         25235,  3781,  1012,  2190,  3976,  1024,  1002,  2260,  1012,  5585,
          4965,  2047,  1002,  1023,  1012,  5709,  1006,  2004,  1997,  2184,
          1024,  2423,  3968,  2102,  1

### 2. Model Definition

In [27]:
# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the device
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### 3. Training

In [30]:
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss().to(device)

# Training loop
for epoch in range(EPOCHS):
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {loss.item()}")

  'label': torch.tensor(label, dtype=torch.long)
