In [38]:
import os

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load Data

In [39]:
# Read the tab-separated label file: one row per article, with the article id
# in the first column and a comma-separated list of labels in the second.
df_labels = pd.read_csv("../data/data/en/dev-labels-subtask-2.txt", sep='\t', header=None)
df_labels.columns = ['article_id', 'label']

# split labels into list
df_labels['label'] = df_labels['label'].str.split(",")

# binary encode the labels:
#   explode()      -> one row per (article, label), keeping the article's row index
#   get_dummies()  -> one indicator column per distinct label
#   groupby(level=0).sum() -> collapse back to one row per article
# (`.sum(level=0)` is deprecated and was removed in pandas 2.0.)
label_indicators = pd.get_dummies(df_labels['label'].explode()).groupby(level=0).sum()
df_labels = pd.concat([df_labels, label_indicators], axis=1)

df_labels.head()

  df_labels = pd.concat([df_labels, pd.get_dummies(df_labels['label'].apply(pd.Series).stack()).sum(level=0)], axis=1)


Unnamed: 0,article_id,label,Capacity_and_resources,Crime_and_punishment,Cultural_identity,Economic,External_regulation_and_reputation,Fairness_and_equality,Health_and_safety,Legality_Constitutionality_and_jurisprudence,Morality,Policy_prescription_and_evaluation,Political,Public_opinion,Quality_of_life,Security_and_defense
0,820791520,"[Political, Fairness_and_equality, Policy_pres...",0,0,0,1,0,1,0,0,0,1,1,1,0,1
1,821040551,"[Political, Capacity_and_resources, Policy_pre...",1,0,0,0,0,0,1,1,0,1,1,1,0,1
2,813552066,"[Public_opinion, Policy_prescription_and_evalu...",0,0,0,0,1,0,0,0,0,1,1,1,0,0
3,817176202,"[Political, External_regulation_and_reputation...",0,0,0,1,1,0,0,1,0,1,1,1,0,0
4,820419869,"[Public_opinion, Political, External_regulatio...",1,0,0,0,1,0,0,1,0,1,1,1,0,0


In [40]:
# Directory that holds one text file per article.
articles_dir = "../data/data/en/dev-articles-subtask-2"

data = []

# get list of file names from data\data\en\dev-articles-subtask-2\*
# os.listdir returns entries in arbitrary order; sort so the resulting
# DataFrame has the same row order on every run and every machine.
articles = sorted(os.listdir(articles_dir))

for article in articles:
    # The id is whatever remains of the file name (up to the first '.')
    # once the literal 'article' is removed.
    article_id = article.split('.')[0].replace('article', '')

    with open(os.path.join(articles_dir, article), 'r', encoding='utf-8') as f:
        # read line 3 to n (the first two lines are skipped —
        # presumably a title and a separator line; TODO confirm)
        lines = f.readlines()[2:]

    data.append([article_id, ''.join(lines)])

# create pandas dataframe
df_articles = pd.DataFrame(data, columns=['article_id', 'article'])

# article_id to int, so it can be joined against the integer ids in the label file
df_articles['article_id'] = df_articles['article_id'].astype(int)


In [41]:
# Combine article text and label columns into one frame, matching rows on
# article_id (default inner join: only ids present in both frames are kept).
df = df_articles.merge(df_labels, on='article_id')

df.head()

Unnamed: 0,article_id,article,label,Capacity_and_resources,Crime_and_punishment,Cultural_identity,Economic,External_regulation_and_reputation,Fairness_and_equality,Health_and_safety,Legality_Constitutionality_and_jurisprudence,Morality,Policy_prescription_and_evaluation,Political,Public_opinion,Quality_of_life,Security_and_defense
0,813452859,With the Parliamentary vote on British Prime M...,"[Political, External_regulation_and_reputation...",0,0,0,1,1,0,0,1,0,1,1,0,0,0
1,813494037,The spectacular fireworks that lit up the Lond...,"[Political, Crime_and_punishment, External_reg...",0,1,0,0,1,0,0,1,0,1,1,1,0,0
2,813547724,Post-Brexit Britain should use its power to de...,"[Political, Quality_of_life, External_regulati...",0,0,0,1,1,0,0,0,0,1,1,1,1,0
3,813552066,With three months until Britain leaves the Eur...,"[Public_opinion, Policy_prescription_and_evalu...",0,0,0,0,1,0,0,0,0,1,1,1,0,0
4,813601978,Fireworks exploded overhead and couples kissed...,"[Political, Morality, Cultural_identity, Crime...",0,1,1,0,1,0,1,0,1,0,1,0,1,1


In [42]:
# Model input: the raw article text.
X = df['article']
# Targets: everything except the id, the text and the raw label list,
# i.e. the per-label indicator columns.
y = df.drop(columns=['article_id', 'article', 'label'])

In [43]:
# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _tokenize_article(text):
    """Tokenize a single article with padding and truncation enabled."""
    return tokenizer(text, padding=True, truncation=True)


# One tokenizer output per article, in the same order/index as X.
X_tokenized = X.apply(_tokenize_article)

In [44]:
# Preview the first few tokenized articles.
X_tokenized.head()

0    [input_ids, token_type_ids, attention_mask]
1    [input_ids, token_type_ids, attention_mask]
2    [input_ids, token_type_ids, attention_mask]
3    [input_ids, token_type_ids, attention_mask]
4    [input_ids, token_type_ids, attention_mask]
Name: article, dtype: object

In [45]:
# Convert the tokenized data into a format that BERT can understand
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenized articles with their label rows.

    Parameters
    ----------
    X : pandas Series or other integer-indexable sequence
        One tokenizer output per article; every item must expose
        ``"input_ids"`` and ``"attention_mask"``.
    y : pandas DataFrame or Series
        Labels aligned positionally with ``X`` (row ``i`` belongs to item ``i``).
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return sample ``idx`` (a position, not an index label) as a dict of tensors.

        Returns
        -------
        dict
            ``"input_ids"`` and ``"attention_mask"`` as ``torch.long`` tensors and
            ``"labels"`` as a ``torch.float`` tensor.
        """
        # DataLoader samplers hand out positions 0..len-1. Plain `series[idx]`
        # is a *label* lookup and fails as soon as the Series index is not a
        # default RangeIndex (e.g. after filtering or a train/test split), so
        # use .iloc whenever it is available.
        encoding = self.X.iloc[idx] if hasattr(self.X, "iloc") else self.X[idx]

        # Convert the label row to a NumPy array first rather than relying on
        # torch to parse a pandas Series as a generic Python sequence.
        target = self.y.iloc[idx]
        if hasattr(target, "to_numpy"):
            target = target.to_numpy(dtype="float32")

        return {
            "input_ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoding["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(target, dtype=torch.float),
        }

In [46]:
def collate_fn(batch):
    """Merge a list of per-sample dicts into one batch of tensors.

    ``input_ids`` and ``attention_mask`` are padded (pad_sequence's default
    padding value) to the longest sequence in the batch, batch dimension
    first; ``labels`` are stacked along a new leading dimension.
    """
    def _pad(field):
        # Gather one field across the batch and pad to a common length.
        return pad_sequence([sample[field] for sample in batch], batch_first=True)

    collated = {field: _pad(field) for field in ("input_ids", "attention_mask")}
    collated["labels"] = torch.stack([sample["labels"] for sample in batch])
    return collated

In [47]:
# Wrap the tokenized articles and the label matrix in a Dataset, then batch
# them 16 at a time; collate_fn pads each batch to a common length.
dataset = CustomDataset(X=X_tokenized, y=y)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=16,
    collate_fn=collate_fn,
)

In [48]:
# Display the label indicator matrix (one column per label, one row per article).
y

Unnamed: 0,Capacity_and_resources,Crime_and_punishment,Cultural_identity,Economic,External_regulation_and_reputation,Fairness_and_equality,Health_and_safety,Legality_Constitutionality_and_jurisprudence,Morality,Policy_prescription_and_evaluation,Political,Public_opinion,Quality_of_life,Security_and_defense
0,0,0,0,1,1,0,0,1,0,1,1,0,0,0
1,0,1,0,0,1,0,0,1,0,1,1,1,0,0
2,0,0,0,1,1,0,0,0,0,1,1,1,1,0
3,0,0,0,0,1,0,0,0,0,1,1,1,0,0
4,0,1,1,0,1,0,1,0,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,0,0,0,0,1,0,0,1,0,1,1,1,0,0
79,0,0,0,0,1,0,0,0,0,1,1,1,1,0
80,0,0,0,0,1,0,0,1,0,1,1,1,0,0
81,0,0,0,0,0,0,0,0,0,1,1,1,0,0


In [49]:
# Fine-tune a pre-trained BERT model on the tokenized data using a multi-label classification loss function.
# One output logit per label column of y. `problem_type` is set explicitly so
# the model's built-in loss is the multi-label one regardless of the dtype of
# the labels passed in, instead of being inferred on the first forward pass.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(y.columns),
    problem_type="multi_label_classification",
)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()

# Three passes over the data; one optimizer step per batch.
for epoch in range(3):
    for batch in dataloader:
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()
        # Passing `labels` makes the model compute its loss internally.
        outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()