<a href="https://colab.research.google.com/github/chekwubeutomi/nlp-polarization-project/blob/main/subtask_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np

import torch

from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

In [None]:
import wandb

# Start Weights & Biases in "disabled" mode for this session.
# NOTE(review): presumably this keeps the Trainer below from uploading run data — confirm against the wandb docs.
wandb.init(mode="disabled")

  | |_| | '_ \/ _` / _` |  _/ -_)


## Data import

In [None]:
# Training split: one row per text with its `id` and `polarization` label.
train = pd.read_csv("sample_data/eng.csv")
train.head()

Unnamed: 0,id,text,polarization
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0


In [None]:
# Model input (raw text) and target (polarization label), both as pandas Series.
X, y = train["text"], train["polarization"]

In [None]:
# Hold out 20% of the rows for validation. Stratifying on the label keeps the
# class proportions the same in both splits, so the macro-F1 computed on the
# validation set is not distorted by a lucky/unlucky draw of the minority class.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-number every split from 0 so that lookups by integer position line up
# with the Series index after shuffling.
X_train, X_val, y_train, y_val = (
    part.reset_index(drop=True) for part in (X_train, X_val, y_train, y_val)
)

## Dataset preprocessing

In [None]:
class PolarizationDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per lookup and attaches its label.

    Args:
        texts: Sequence of raw strings (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class ids, aligned position-by-position
            with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding=False, max_length=..., return_tensors='pt')``; it must
            return a mapping whose values are tensors with a leading batch
            axis of size 1.
        max_length: Maximum number of tokens kept per text (longer inputs are
            truncated by the tokenizer).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so that ``idx`` in
        # ``__getitem__`` is always *positional*. Indexing a pandas Series with
        # an integer is label-based and raises KeyError when the Series does
        # not carry a 0..n-1 index (e.g. straight out of a shuffle/split).
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict mapping every key produced by the tokenizer to a tensor with
            the batch axis removed, plus ``'labels'`` as a ``torch.long``
            scalar tensor. No padding is applied here.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a length-1 sequence into a 0-d tensor.
        item = {key: encoding[key].squeeze(0) for key in encoding.keys()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [None]:
# Tokenizer for the roberta-base checkpoint
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Wrap the training and validation splits so examples are tokenized on access
train_dataset = PolarizationDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
val_dataset = PolarizationDataset(texts=X_val, labels=y_val, tokenizer=tokenizer)

## Training

In [None]:
# roberta-base encoder with a sequence-classification head producing 2 logits.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Metric callback passed to the Trainer
def compute_metrics(p):
    """Compute macro-averaged F1 from an evaluation result.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
           example) and ``label_ids`` (the reference labels).

    Returns:
        dict with the single key ``'f1_macro'``.
    """
    # The predicted class is the column with the highest score in each row.
    predicted_classes = np.argmax(p.predictions, axis=1)
    macro_f1 = f1_score(p.label_ids, predicted_classes, average='macro')
    return {'f1_macro': macro_f1}

# Fine-tuning configuration: evaluate at the end of every epoch, never save
# checkpoints, and write any Trainer output to the current directory.
training_args = TrainingArguments(
    output_dir="./",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False,
)

In [None]:
# Examples are tokenized without padding, so the collator pads each batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Fine-tune, then run one more evaluation pass on the validation split.
trainer.train()

eval_results = trainer.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")




Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.496222,0.666045
2,No log,0.450239,0.721697
3,0.495100,0.447868,0.743238




Macro F1 score on validation set: 0.743238162637946


In [None]:
# File to predict on; it has the same columns as the training data.
test_df = pd.read_csv("sample_data/eng_val.csv")
test_df.head()

Unnamed: 0,id,text,polarization
0,eng_f66ca14d60851371f9720aaf4ccd9b58,God is with Ukraine and Zelensky,
1,eng_3a489aa7fed9726aa8d3d4fe74c57efb,"4 Dems, 2 Republicans Luzerne County Council s...",
2,eng_95770ff547ea5e48b0be00f385986483,Abuse Survivor Recounts Her Struggles at YWCA ...,
3,eng_2048ae6f9aa261c48e6d777bcc5b38bf,"After Rwanda, another deportation camp disaster",
4,eng_07781aa88e61e7c0a996abd1e5ea3a20,Another plea in Trump election interference probe,


In [None]:
# Raw texts to classify
X_test = test_df["text"]

In [None]:
# The dataset class requires labels, so pass an all-zero placeholder list.
X_test_dataset = PolarizationDataset(
    texts=X_test,
    labels=[0] * len(X_test),
    tokenizer=tokenizer,
)

In [None]:
# Run inference with the fine-tuned model over the test dataset.
predictions = trainer.predict(test_dataset=X_test_dataset)



In [None]:
# Predicted class per row = index of the largest score along the class axis.
prediction_labels = predictions.predictions.argmax(axis=1)
print(prediction_labels)

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0
 1 0 0 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 1 0
 1 1 0 0 0 0 1 0 1 1 0 0]


In [None]:
# Overwrite the polarization column with the model's predictions.
test_df["polarization"] = prediction_labels

In [None]:
# Spot-check ten randomly drawn rows.
test_df.sample(n=10)

Unnamed: 0,id,text,polarization
105,eng_995661141a6abbbdef6770db6e1bd8f5,Not all show. Rejected iron dome funding and p...,1
108,eng_ef3116c56645f94bbe998051aaa49e40,The real dangers of this bogus populism,1
141,eng_d9a5c1cc8ce24ec15a6b432204a34a9d,I apologize on behalf of all southerners. We a...,0
55,eng_d21174080515b0ad8dc1d00ee4368d4f,Thanks to President Biden for obtaining 1.5 bi...,0
94,eng_3857e2c131a5b5fef5bbf2e9794bcd5e,I dont remember voting to have open borders ei...,0
29,eng_3c4dc44df877cfb232d222462dab6543,"Israeli strikes kill at least 37 Palestinians,...",0
101,eng_5606f46a55984e9bf7db7034d451a1f6,NATO countries doesnt have balls to interfere ...,0
51,eng_3235c4f9ae1a74593f5cc6cff9277a85,St. Johnsbury Hires New Police Chief,0
100,eng_310cb1f24f3545a53f33e005798ece95,Modern day genocide and ethnic cleansing with ...,1
142,eng_e71b8aa8338a1deaae7328a73a122cca,If Israel hadnt committed ethnic cleansing the...,1


In [None]:
# Save the full frame (texts + predictions); the row index is written as the first column.
test_df.to_csv("eng_result1.csv", index=True)

In [None]:
# Submission file: only the id and predicted label columns, without the row index.
submission = test_df.loc[:, ["id", "polarization"]]
submission.to_csv("submission.csv", index=False)