# Main imports and code

In [1]:
# check which gpu we're using
!nvidia-smi

Thu Feb 27 20:55:41 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:06.0 Off |                    0 |
| N/A   29C    P8              10W /  70W |      2MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install simpletransformers
!pip install tensorboardx



In [34]:
from urllib import request
import pandas as pd
import logging
import torch
from torch import nn 
from collections import Counter
from ast import literal_eval
from googletrans import Translator # NOTE: Run in your venv: `pip install googletrans==4.0.0-rc1``
from tqdm import tqdm

In [4]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [None]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

2025-02-27 20:57:25.705215: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740689847.745016    5602 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740689848.433008    5602 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-27 20:57:34.960618: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found GPU at: /device:GPU:0


I0000 00:00:1740689956.547959    5602 gpu_device.cc:2022] Created device /device:GPU:0 with 13775 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:06.0, compute capability: 7.5


# Fetch Don't Patronize Me! data manager module

In [6]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [7]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [8]:
from dont_patronize_me import DontPatronizeMe

In [9]:
dpm = DontPatronizeMe('data/train/', '.')

In [10]:
dpm.load_task1()

# Load paragraph IDs

In [11]:
trids = pd.read_csv('data/train/train_semeval_parids-labels.csv')
teids = pd.read_csv('data/dev/dev_semeval_parids-labels.csv')

In [12]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [13]:
data=dpm.train_task1_df

In [14]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [None]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = str(trids.par_id[idx])
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

  trdf1 = pd.DataFrame(rows)

In [18]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


## Data sampling

In [27]:
print("positive samples: ", len(trdf1[trdf1.label == 1]))
print("negative samples: ", len(trdf1[trdf1.label == 0]))

positive samples:  794
negative samples:  7581


In [28]:
# Downsample negative instances
# this is to handle the class imbalance, and ensures negative to positive ratio is 2:1
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [29]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


In [46]:
print("After downsampling")
print("positive samples: ", len(training_set1[training_set1.label == 1]))
print("negative samples: ", len(training_set1[training_set1.label == 0]))
print("total samples: ", len(training_set1))

After downsampling
positive samples:  794
negative samples:  1588
total samples:  2382


## Data augmentation the training set
### 1. Translating into another language, and back to english
NOTE: I've commented this out because the augmentation only has to be done once and I've kept the augmented (translated) training data in `data/train/translated_data.csv`
Also note: We are only augmenting the training data (which includes the validation data, but not test data)

In [None]:
# import time
# import os
# from deep_translator import GoogleTranslator
# import numpy as np

# def back_translate_text(text, src_lang='en', intermediate_lang='fr'):
#     # Translate to intermediate language
#     translated = GoogleTranslator(source=src_lang, target=intermediate_lang).translate(text)
#     # Translate back to source language
#     back_translated = GoogleTranslator(source=intermediate_lang, target=src_lang).translate(translated)
#     return back_translated

# # Apply back-translation to a percentage of the DataFrame
# def back_translate_dataframe(df, text_column='text', src_lang='en', intermediate_lang='fr', augment_frac=0.5, batch_size=50, delay=2):
#     # Create a copy of the DataFrame to avoid modifying the original
#     augmented_df = df.copy()
    
#     # Randomly select a fraction of the samples to augment
#     num_samples = len(augmented_df)
#     num_augment = int(num_samples * augment_frac)
#     augment_indices = np.random.choice(num_samples, size=num_augment, replace=False)
    
#     # Sort the indices for efficient memory access
#     augment_indices = np.sort(augment_indices)
    
#     # Process the selected samples in batches
#     for i in tqdm(range(0, len(augment_indices), batch_size)):
#         batch_indices = augment_indices[i:i + batch_size]
#         batch = augmented_df.iloc[batch_indices]
#         augmented_df.iloc[batch_indices, augmented_df.columns.get_loc(text_column)] = batch[text_column].apply(
#             lambda x: back_translate_text(x, src_lang=src_lang, intermediate_lang=intermediate_lang)
#         )
#         print(f"Processed {i + batch_size} rows...")
#         time.sleep(delay)  # Add delay between batches
    
#     return augmented_df

# # Example usage
# # Check if augmented data already exists
# augmented_translate_df = back_translate_dataframe(training_set1[2000:], text_column='text', augment_frac=0.5, batch_size=10, delay=2)
# # augmented_translate_df.to_csv('translated_data.csv', index=False)
# augmented_translate_df.to_csv('translated_data.csv', mode='a', header=False, index=False)

# print(training_set1.head())
# print(augmented_translate_df.head())

  0%|          | 0/20 [00:00<?, ?it/s]

Processed 10 rows...


  5%|▌         | 1/20 [00:07<02:20,  7.41s/it]

Processed 20 rows...


 10%|█         | 2/20 [00:12<01:52,  6.26s/it]

Processed 30 rows...


 15%|█▌        | 3/20 [00:20<01:56,  6.83s/it]

Processed 40 rows...


 20%|██        | 4/20 [00:31<02:14,  8.39s/it]

Processed 50 rows...


 25%|██▌       | 5/20 [00:42<02:19,  9.32s/it]

Processed 60 rows...


 30%|███       | 6/20 [00:53<02:21, 10.11s/it]

Processed 70 rows...


 35%|███▌      | 7/20 [01:02<02:06,  9.71s/it]

Processed 80 rows...


 40%|████      | 8/20 [01:14<02:04, 10.36s/it]

Processed 90 rows...


 45%|████▌     | 9/20 [01:25<01:57, 10.65s/it]

Processed 100 rows...


 50%|█████     | 10/20 [01:39<01:57, 11.77s/it]

Processed 110 rows...


 55%|█████▌    | 11/20 [01:51<01:44, 11.57s/it]

Processed 120 rows...


 60%|██████    | 12/20 [01:58<01:23, 10.40s/it]

Processed 130 rows...


 65%|██████▌   | 13/20 [02:10<01:15, 10.78s/it]

Processed 140 rows...


 70%|███████   | 14/20 [02:26<01:13, 12.26s/it]

Processed 150 rows...


 75%|███████▌  | 15/20 [02:40<01:03, 12.78s/it]

Processed 160 rows...


 80%|████████  | 16/20 [02:55<00:54, 13.54s/it]

Processed 170 rows...


 85%|████████▌ | 17/20 [03:08<00:40, 13.40s/it]

Processed 180 rows...


 90%|█████████ | 18/20 [03:19<00:25, 12.75s/it]

Processed 190 rows...


 95%|█████████▌| 19/20 [03:32<00:12, 12.88s/it]

Processed 200 rows...


100%|██████████| 20/20 [03:35<00:00, 10.79s/it]

  par_id      community                                               text  \
0   4341  poor-families  The scheme saw an estimated 150,000 children f...   
1   4136       homeless  Durban 's homeless communities reconciliation ...   
2  10352  poor-families  The next immediate problem that cropped up was...   
3   8279     vulnerable  Far more important than the implications for t...   
4   1164  poor-families  To strengthen child-sensitive social protectio...   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  
     par_id   community                                               text  \
2000   1353     migrant  An eyewitness who asked for anonymity said tha...   
2001   1355  vulnerable  Pharmacies recorded an increase in the number ...   
2002   1356    hopeless  The whispers were here and there. There have b...   
2003   1357    disabled  """ I like my life as a disabled person , "" h...   
2004   1359   immigrant  South Africa has about two million documented ... 




### 2. Sentence shuffling

In [None]:
# TODO: Implement sentence shuffling

### Data augmentation experiment

In [None]:
# TODO:
# 1. Train and evaluate the model on the original data
# 2. Train and evaluate the model on the translated data
# 3. Train and evaluate the model on the sentence shuffled data
# 4. Train and evaluate the model on the translated + sentence shuffled data
# 5. Compare performances

# Rebuild test set (Task 1)

In [18]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = str(teids.par_id[idx])
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [19]:
len(rows)

2094

In [20]:
tedf1 = pd.DataFrame(rows)

In [21]:
print(tedf1)

     par_id   community                                               text  \
0      4046    hopeless  We also know that they can benefit by receivin...   
1      1279     refugee  Pope Francis washed and kissed the feet of Mus...   
2      8330     refugee  Many refugees do n't want to be resettled anyw...   
3      4063     in-need  "Budding chefs , like "" Fred "" , "" Winston ...   
4      4089    homeless  "In a 90-degree view of his constituency , one...   
...     ...         ...                                                ...   
2089  10462    homeless  The sad spectacle , which occurred on Saturday...   
2090  10463     refugee  """ The Pakistani police came to our house and...   
2091  10464    disabled  "When Marie O'Donoghue went looking for a spec...   
2092  10465       women  "Sri Lankan norms and culture inhibit women fr...   
2093  10466  vulnerable  He added that the AFP will continue to bank on...   

      label  
0         1  
1         1  
2         1  
3      

In [22]:
tedf1.sample(frac=1)

Unnamed: 0,par_id,community,text,label
1470,9784,immigrant,People are threatening once again to boycott a...,0
1473,9787,hopeless,The traditional logic seemed unassailable : wh...,0
417,8621,homeless,"Gulzar 's relatives , who inherited the palace...",0
993,9253,poor-families,The problem is most cocoa is produced by poor ...,0
1233,9513,immigrant,It is clear the US is feeling the heat over tr...,0
...,...,...,...,...
1518,9836,immigrant,"Ozil , who is of Turkish descent , chose to re...",0
992,9252,refugee,In an act of defiance against Hungarian author...,0
1225,9504,in-need,""""""" There are four rental blocks around here a...",0
619,8844,in-need,He said they are not always able to transfer m...,0


# Ensemble learning

## Set Up

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np

In [26]:
# Define a PyTorch Dataset class
class TextDataset(Dataset):
    def __init__(self, tokenizer, texts, labels):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=256, return_tensors="pt")
        inputs = {key: value.squeeze(0) for key, value in inputs.items()}
        return inputs, torch.tensor(label, dtype=torch.long)

In [27]:
# Hyperparameters
num_epochs = 3
batch_size = 32
learning_rate = 2e-5
k_folds = 5

data_texts = training_set1['text'].tolist()
data_labels = training_set1['label'].tolist()

kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
def load_model_and_tokenizer(model_name):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return model, tokenizer

In [None]:
def train_and_validate_model(model, tokenizer):
    overall_f1_scores = []
    all_predictions = []
    all_true_labels = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(data_texts)):
        print(f"Training fold {fold+1}/{k_folds}")
        
        train_texts = [data_texts[i] for i in train_idx]
        train_labels = [data_labels[i] for i in train_idx]
        val_texts = [data_texts[i] for i in val_idx]
        val_labels = [data_labels[i] for i in val_idx]
        
        train_dataset = TextDataset(tokenizer, train_texts, train_labels)
        val_dataset = TextDataset(tokenizer, val_texts, val_labels)
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
        
        model.train()
        for epoch in range(num_epochs):
            total_loss = 0
            for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                inputs = {key: value.to(device) for key, value in inputs.items()}
                labels = labels.to(device)
                
                optimizer.zero_grad()
                outputs = model(**inputs)
                loss = criterion(outputs.logits, labels)
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
            scheduler.step()
            print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")
        
        # Validation phase
        model.eval()
        predictions, true_labels = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = {key: value.to(device) for key, value in inputs.items()}
                labels = labels.to(device)
                
                outputs = model(**inputs)
                preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                predictions.extend(preds)
                true_labels.extend(labels.cpu().numpy())
        
        f1 = f1_score(true_labels, predictions, average='binary')
        overall_f1_scores.append(f1)
        print(f"Fold {fold+1} F1 Score: {f1:.4f}")

        # Store predictions and true labels for this fold
        all_predictions.extend(predictions)
        all_true_labels.extend(true_labels)

    print(f"Mean F1 Score across {k_folds} folds: {np.mean(overall_f1_scores):.4f}")

    return all_predictions, all_true_labels

## Base model 1: RoBERTa-base

In [31]:
# Train baseline model 1: roberta-base
model1, tokenizer1 = load_model_and_tokenizer("FacebookAI/roberta-base")
predictions1, true_labels1 = train_and_validate_model(model1, tokenizer1)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold 1/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.5695


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.3952


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 3 Loss: 0.3694
Fold 1 F1 Score: 0.7317
Training fold 2/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.3974


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.2756


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 3 Loss: 0.2483
Fold 2 F1 Score: 0.7791
Training fold 3/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 1 Loss: 0.3170


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 2 Loss: 0.1915


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 3 Loss: 0.1586
Fold 3 F1 Score: 0.8765
Training fold 4/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.2459


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.1279


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 3 Loss: 0.1045
Fold 4 F1 Score: 0.9470
Training fold 5/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 1 Loss: 0.1824


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.0746


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 3 Loss: 0.0648
Fold 5 F1 Score: 0.9689
Mean F1 Score across 5 folds: 0.8607


## Base model 2: DistilBERT

In [32]:
# Train baseline model 2: DistilBERT
model2, tokenizer2 = load_model_and_tokenizer("distilbert/distilbert-base-uncased")
predictions2, true_labels2 = train_and_validate_model(model2, tokenizer2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold 1/5


Epoch 1: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 1 Loss: 0.5790


Epoch 2: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 2 Loss: 0.4330


Epoch 3: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 3 Loss: 0.4163
Fold 1 F1 Score: 0.6789
Training fold 2/5


Epoch 1: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 1 Loss: 0.4298


Epoch 2: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 2 Loss: 0.3296


Epoch 3: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 3 Loss: 0.3136
Fold 2 F1 Score: 0.7556
Training fold 3/5


Epoch 1: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 1 Loss: 0.3414


Epoch 2: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 2 Loss: 0.2335


Epoch 3: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 3 Loss: 0.2121
Fold 3 F1 Score: 0.8424
Training fold 4/5


Epoch 1: 100%|██████████| 60/60 [00:40<00:00,  1.47it/s]


Epoch 1 Loss: 0.2485


Epoch 2: 100%|██████████| 60/60 [00:40<00:00,  1.47it/s]


Epoch 2 Loss: 0.1339


Epoch 3: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 3 Loss: 0.1200
Fold 4 F1 Score: 0.8970
Training fold 5/5


Epoch 1: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 1 Loss: 0.1726


Epoch 2: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 2 Loss: 0.0784


Epoch 3: 100%|██████████| 60/60 [00:41<00:00,  1.46it/s]


Epoch 3 Loss: 0.0649
Fold 5 F1 Score: 0.9586
Mean F1 Score across 5 folds: 0.8265


## Base model 3: BERT base uncased

In [30]:
# # Train baseline model 3: ALBERT base v2
model3, tokenizer3 = load_model_and_tokenizer("bert-base-uncased")
predictions3, true_labels3 = train_and_validate_model(model3, tokenizer3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold 1/5


Epoch 1: 100%|██████████| 60/60 [01:20<00:00,  1.35s/it]


Epoch 1 Loss: 0.5743


Epoch 2: 100%|██████████| 60/60 [01:20<00:00,  1.35s/it]


Epoch 2 Loss: 0.4202


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 3 Loss: 0.3946
Fold 1 F1 Score: 0.6828
Training fold 2/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.4127


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.2885


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 3 Loss: 0.2628
Fold 2 F1 Score: 0.7531
Training fold 3/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.3221


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.1966


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 3 Loss: 0.1728
Fold 3 F1 Score: 0.8812
Training fold 4/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.2473


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.1185


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.36s/it]


Epoch 3 Loss: 0.1009
Fold 4 F1 Score: 0.9477
Training fold 5/5


Epoch 1: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 1 Loss: 0.1587


Epoch 2: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 2 Loss: 0.0741


Epoch 3: 100%|██████████| 60/60 [01:21<00:00,  1.35s/it]


Epoch 3 Loss: 0.0627
Fold 5 F1 Score: 0.9763
Mean F1 Score across 5 folds: 0.8482


## Stacking (Meta-Learning)

In [33]:
from sklearn.linear_model import LogisticRegression

# Stack the predictions from all 3 models
X_train_meta = np.column_stack([predictions1, predictions2, predictions3])

assert np.array_equal(true_labels1, true_labels2)
assert np.array_equal(true_labels1, true_labels3)
y_train_meta = true_labels1

# Train the meta-model (stacker)
meta_model = LogisticRegression()
meta_model.fit(X_train_meta, y_train_meta)

# Prepare deliverables

In [34]:
from torch.amp import autocast

def get_predictions(model, tokenizer, df, batch_size=16):
    model.eval()

    texts = df['text'].tolist()
    
    all_preds = []

    with torch.no_grad():
        # Process texts in smaller batches to save memory
        for start_idx in range(0, len(texts), batch_size):
            batch_texts = texts[start_idx:start_idx + batch_size]
 
            encodings = tokenizer(batch_texts, truncation=True, padding=True, return_tensors="pt", max_length=512)

            inputs = {key: value.to(device) for key, value in encodings.items()}

            # Use mixed precision to reduce memory usage
            with autocast('cuda'):
                outputs = model(**inputs)
                preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                
            all_preds.extend(preds)

    return all_preds

## Predict on the official dev set

In [36]:
# Generate predictions for the official dev set from all 3 models
predictions_dev1 = get_predictions(model1, tokenizer1, tedf1)
predictions_dev2 = get_predictions(model2, tokenizer2, tedf1)
predictions_dev3 = get_predictions(model3, tokenizer3, tedf1)

X_dev_meta = np.column_stack([predictions_dev1, predictions_dev2, predictions_dev3])

final_predictions_dev = meta_model.predict(X_dev_meta)

In [37]:
# Calculate F1 score for the stacked model
true_labels_dev = tedf1['label'].tolist()
f1 = f1_score(true_labels_dev, final_predictions_dev, average='binary')
print(f"Final F1 Score for Stacked Model: {f1:.4f}")

Final F1 Score for Stacked Model: 0.5473


In [None]:
# Write predictions into a .txt file
with open('outputs/dev.txt', 'w') as file:
    for prediction in final_predictions_dev:
        file.write(f"{prediction}\n")

## Predict on the official test set

In [None]:
df = pd.read_csv('data/test/task4_test.tsv', sep='\t')
df.columns = ['par_id', 'art_id', 'keyword', 'country_code', 'text']
df.par_id = df.par_id.astype(str)
print(df.head())

In [None]:
rows = [] # will contain par_id and text
for idx in range(len(df)):
  parid = str(df.par_id[idx])
  data = df.loc[df.par_id == parid]
  # select row from original dataset to retrieve `text`
  keyword = data.loc[df.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
  })

In [None]:
len(rows)

In [None]:
offical_test_df = pd.DataFrame(rows)

In [None]:
print(offical_test_df)

In [None]:
# Generate predictions for the official test set from all 3 models
predictions_test1 = get_predictions(model1, tokenizer1, offical_test_df)
predictions_test2 = get_predictions(model2, tokenizer2, offical_test_df)
# predictions_test3 = get_predictions(model3, tokenizer3, tedf1)
# X_test_meta = np.column_stack([predictions_test1, predictions_test2, predictions_test3])
X_test_meta = np.column_stack([predictions_test1, predictions_test2])

final_predictions_test = meta_model.predict(X_test_meta)

In [None]:
# Write predictions into a .txt file
with open('outputs/test.txt', 'w') as file:
    for prediction in final_predictions_test:
        file.write(f"{prediction}\n")