In [226]:
!pip install -U scikit-learn transformers


[notice] A new release of pip is available: 23.0.1 -> 23.2.1
[notice] To update, run: C:\Users\elias\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip




In [None]:
!tar -xvf /content/semeval2023task3bundle-v4.tgz

In [5]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
import pandas as pd
import numpy as np
import os

# Load Data

In [6]:
df_labels = pd.read_csv(f"../data/data/en/dev-labels-subtask-2.txt", sep='\t', header=None)
df_labels.columns = ['article_id', 'label']

# split labels into list
df_labels['label'] = df_labels['label'].apply(lambda x: x.split(","))

# binary encode the labels
df_labels = pd.concat([df_labels, pd.get_dummies(df_labels['label'].apply(pd.Series).stack()).sum(level=0)], axis=1)

df_labels.head()

  df_labels = pd.concat([df_labels, pd.get_dummies(df_labels['label'].apply(pd.Series).stack()).sum(level=0)], axis=1)


Unnamed: 0,article_id,label,Capacity_and_resources,Crime_and_punishment,Cultural_identity,Economic,External_regulation_and_reputation,Fairness_and_equality,Health_and_safety,Legality_Constitutionality_and_jurisprudence,Morality,Policy_prescription_and_evaluation,Political,Public_opinion,Quality_of_life,Security_and_defense
0,820791520,"[Political, Fairness_and_equality, Policy_pres...",0,0,0,1,0,1,0,0,0,1,1,1,0,1
1,821040551,"[Political, Capacity_and_resources, Policy_pre...",1,0,0,0,0,0,1,1,0,1,1,1,0,1
2,813552066,"[Public_opinion, Policy_prescription_and_evalu...",0,0,0,0,1,0,0,0,0,1,1,1,0,0
3,817176202,"[Political, External_regulation_and_reputation...",0,0,0,1,1,0,0,1,0,1,1,1,0,0
4,820419869,"[Public_opinion, Political, External_regulatio...",1,0,0,0,1,0,0,1,0,1,1,1,0,0


In [7]:
data = []

# get list of file path from data\data\en\dev-articles-subtask-2\*
articles = os.listdir(f"../data/data/en/dev-articles-subtask-2")

for article in articles:
    with open(f"../data/data/en/dev-articles-subtask-2/{article}", 'r', encoding='utf-8') as f:
        obj = []

        article_id = article.split('.')[0].replace('article', '')

        obj.append(article_id)
        
        # read line 3 to n
        lines = f.readlines()[2:]
        obj.append(''.join(lines))
        
        data.append(obj)

# create pandas dataframe
df_articles = pd.DataFrame(data, columns=['article_id', 'article'])

# article_id to int
df_articles['article_id'] = df_articles['article_id'].astype(int)


In [8]:
# join labels and articles
df = pd.merge(df_articles, df_labels, on='article_id')

texts = df["article"].tolist()
labels = df.drop(['article_id', 'article', 'label'], axis=1).to_numpy().tolist()

In [9]:
# Define hyperparameters
max_seq_length = 128
batch_size = 32
learning_rate = 2e-5
num_epochs = 10

In [10]:
# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=14)  # Adjust num_labels according to your task.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [233]:
input_ids = []
attention_masks = []
binary_labels = torch.tensor(labels, dtype=torch.float32)  # Convert labels to float32 for BCEWithLogitsLoss

In [234]:
for text in texts:
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_seq_length,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




In [235]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

dataset = TensorDataset(input_ids, attention_masks, binary_labels)
sampler = RandomSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

In [236]:
# Set up GPU/CPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [237]:
# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * num_epochs)



In [238]:
# Fine-tuning loop
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [239]:
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Optional: Gradient clipping
        optimizer.step()
        scheduler.step()

    # Print the average loss for this epoch
    avg_epoch_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Avg. Loss: {avg_epoch_loss:.4f}")

Epoch 1/3, Avg. Loss: 0.7242
Epoch 2/3, Avg. Loss: 0.6981
Epoch 3/3, Avg. Loss: 0.6777


In [240]:
# store model in ../models
model.save_pretrained("../models")

In [11]:
import torch
from transformers import BertTokenizer, BertModel


def get_word_embeddings(text, model, tokenizer, max_seq_length=128):
    # Tokenize the input text
    if isinstance(text, str):
        text = [text]  # Convert single sentence to a list for processing
    
    word_embeddings_list = []

    for sentence in text:
        for word in sentence.split():
            encoded_dict = tokenizer.encode_plus(
                word,
                add_special_tokens=True,
                max_length=max_seq_length,
                pad_to_max_length=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            
            input_ids = encoded_dict['input_ids'].to(model.device)
            attention_mask = encoded_dict['attention_mask'].to(model.device)

            # Get word embeddings from model's tokenizer
            with torch.no_grad():
                model.eval()
                outputs = model.base_model(input_ids, attention_mask=attention_mask)
                hidden_states = outputs.last_hidden_state

            # Average the hidden states to get word embeddings for each word
            word_embeddings = torch.mean(hidden_states, dim=1).tolist()
            word_embeddings_list.append(word_embeddings)

    return word_embeddings_list

sentence = "This is an example sentence."
word_embeddings = get_word_embeddings(sentence, model, tokenizer)

# For multiple sentences:
sentences = ["This is sentence 1.", "Another example sentence."]
word_embeddings_batch = get_word_embeddings(sentences, model, tokenizer)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
word_embeddings2 = get_word_embeddings("Test", model, tokenizer)

In [13]:
len(word_embeddings2), len(word_embeddings2[0]), len(word_embeddings2[0][0])

(1, 1, 768)

In [14]:
len(word_embeddings), len(word_embeddings[0]), len(word_embeddings[0][0])

(5, 1, 768)

In [278]:
assert get_word_embeddings("Test", model, tokenizer) == get_word_embeddings("Test", model, tokenizer)

In [2]:
# load bert model
from transformers import BertModel
model = BertModel.from_pretrained("../models")

  from .autonotebook import tqdm as notebook_tqdm
  t = torch.tensor([], dtype=storage.dtype, device=storage.untyped().device)


In [3]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          