# Main imports and code

In [4]:
# check which gpu we're using
!nvidia-smi

Mon Feb 17 22:57:43 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:09.0 Off |                    0 |
| N/A   43C    P8               9W /  70W |      2MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!pip install simpletransformers
!pip install tensorboardx



In [30]:
from urllib import request
import pandas as pd
import logging
import torch
from torch import nn 
from collections import Counter
from ast import literal_eval

In [6]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [7]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


I0000 00:00:1739833126.579906 2269067 gpu_device.cc:2022] Created device /device:GPU:0 with 13775 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:09.0, compute capability: 7.5


# Fetch Don't Patronize Me! data manager module

In [8]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [9]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [10]:
from dont_patronize_me import DontPatronizeMe

In [11]:
dpm = DontPatronizeMe('.', '.')

In [12]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [13]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [14]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [15]:
data=dpm.train_task1_df

In [16]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [17]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = str(trids.par_id[idx])
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

In [18]:
import random

In [19]:
trdf1 = pd.DataFrame(rows)

In [20]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


# Rebuild test set (Task 1)

In [21]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = str(teids.par_id[idx])
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [22]:
len(rows)

2094

In [23]:
tedf1 = pd.DataFrame(rows)

In [24]:
print(tedf1)

     par_id   community                                               text  \
0      4046    hopeless  We also know that they can benefit by receivin...   
1      1279     refugee  Pope Francis washed and kissed the feet of Mus...   
2      8330     refugee  Many refugees do n't want to be resettled anyw...   
3      4063     in-need  "Budding chefs , like "" Fred "" , "" Winston ...   
4      4089    homeless  "In a 90-degree view of his constituency , one...   
...     ...         ...                                                ...   
2089  10462    homeless  The sad spectacle , which occurred on Saturday...   
2090  10463     refugee  """ The Pakistani police came to our house and...   
2091  10464    disabled  "When Marie O'Donoghue went looking for a spec...   
2092  10465       women  "Sri Lankan norms and culture inhibit women fr...   
2093  10466  vulnerable  He added that the AFP will continue to bank on...   

      label  
0         1  
1         1  
2         1  
3      

In [25]:
tedf1.sample(frac=1)

Unnamed: 0,par_id,community,text,label
393,8594,hopeless,"But if that all sounds a bit hopeless , fear n...",0
787,9030,migrant,"""The United States """" does n't have the luxury...",0
1871,10219,migrant,The US Department of Homeland Security cited i...,0
388,8589,homeless,The year 2017 is approaching its end with some...,0
2072,10443,in-need,"Uganda : over 900,000 South Sudanese refugees ...",0
...,...,...,...,...
914,9170,migrant,"This May 5 , 2018 , photo , released by the Ro...",0
989,9248,hopeless,"In other words , if you want to show how hopel...",0
1616,9944,women,A Filipino boxer can become among an additiona...,0
690,8923,women,Multiple births are increasingly the result of...,0


In [26]:
# downsample negative instances
# this is to handle the class imbalance, and ensures negative to positive ratio is 2:1
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [27]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


# Ensemble learning

## Base model 1: RoBERTa-base

In [56]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a PyTorch Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=256, return_tensors="pt")
        inputs = {key: value.squeeze(0) for key, value in inputs.items()}
        return inputs, torch.tensor(label, dtype=torch.long)

# Hyperparameters
num_epochs = 3
batch_size = 32
learning_rate = 2e-5
k_folds = 5

data_texts = training_set1['text'].tolist()
data_labels = training_set1['label'].tolist()

kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

overall_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(data_texts)):
    print(f"Training fold {fold+1}/{k_folds}")
    
    train_texts = [data_texts[i] for i in train_idx]
    train_labels = [data_labels[i] for i in train_idx]
    val_texts = [data_texts[i] for i in val_idx]
    val_labels = [data_labels[i] for i in val_idx]
    
    train_dataset = TextDataset(train_texts, train_labels)
    val_dataset = TextDataset(val_texts, val_labels)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
    
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        scheduler.step()
        print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")
    
    # Validation phase
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = {key: value.to(device) for key, value in inputs.items()}
            labels = labels.to(device)
            
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    f1 = f1_score(true_labels, predictions, average='binary')
    overall_f1_scores.append(f1)
    print(f"Fold {fold+1} F1 Score: {f1:.4f}")

print(f"Mean F1 Score across {k_folds} folds: {np.mean(overall_f1_scores):.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold 1/5


Epoch 1: 100%|██████████| 60/60 [01:10<00:00,  1.17s/it]


Epoch 1 Loss: 0.5844


Epoch 2: 100%|██████████| 60/60 [01:10<00:00,  1.17s/it]


Epoch 2 Loss: 0.3941


Epoch 3: 100%|██████████| 60/60 [01:10<00:00,  1.18s/it]


Epoch 3 Loss: 0.3571
Fold 1 F1 Score: 0.7478
Training fold 2/5


Epoch 1: 100%|██████████| 60/60 [01:11<00:00,  1.18s/it]


Epoch 1 Loss: 0.3955


Epoch 2: 100%|██████████| 60/60 [01:11<00:00,  1.18s/it]


Epoch 2 Loss: 0.2817


Epoch 3: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 3 Loss: 0.2407
Fold 2 F1 Score: 0.7892
Training fold 3/5


Epoch 1: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 1 Loss: 0.3100


Epoch 2: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 2 Loss: 0.1828


Epoch 3: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 3 Loss: 0.1493
Fold 3 F1 Score: 0.8434
Training fold 4/5


Epoch 1: 100%|██████████| 60/60 [01:11<00:00,  1.18s/it]


Epoch 1 Loss: 0.2426


Epoch 2: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 2 Loss: 0.1258


Epoch 3: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 3 Loss: 0.1086
Fold 4 F1 Score: 0.9383
Training fold 5/5


Epoch 1: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 1 Loss: 0.1760


Epoch 2: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 2 Loss: 0.0889


Epoch 3: 100%|██████████| 60/60 [01:11<00:00,  1.19s/it]


Epoch 3 Loss: 0.0681
Fold 5 F1 Score: 0.9695
Mean F1 Score across 5 folds: 0.8576


In [None]:
from collections import Counter

prediction_list = [int(x) for x in predictions]
print(prediction_list)
Counter(prediction_list)

In [46]:
# labels2file(prediction_list, 'task1.txt')
with open("task1.txt", "w") as file:
    for pred in predictions:
        file.write(f"{pred}\n")

## Prepare submission

In [None]:
!cat task1.txt | head -n 10

1
1
0
1
0
0
1
1
0
1


In [None]:
!zip submission.zip task1.txt

  adding: task1.txt (deflated 92%)
  adding: task2.txt (deflated 97%)
