<a href="https://colab.research.google.com/github/czarodziejszyn/ssne/blob/main/projekt6/hate_classificator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Colab-only step: mount Google Drive at /content/drive so that the data files
# read later from /content/drive/MyDrive/ are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [40]:
# Run configuration: everything a reader might want to tune lives here.
MODEL_NAME = "allegro/herbert-base-cased"  # checkpoint id given to from_pretrained
EPOCHS = 10       # number of full passes over the training loader
BATCH_SIZE = 16   # batch size used by both DataLoaders
MAX_LEN = 128     # max_length handed to the tokenizer (longer inputs are truncated)
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [41]:
# Load the labelled training set; the "sentence" column holds the text and the
# "label" column holds the class id.
df = pd.read_csv("/content/drive/MyDrive/hate_train.csv")

# Hold out 20% of the rows for validation.
# - stratify: the classes are heavily imbalanced (the validation report shows
#   roughly 10 negatives per positive), so keep the same class ratio in both
#   splits instead of leaving the minority share to chance.
# - random_state: fix the shuffle so Restart & Run All reproduces the same split
#   and validation scores are comparable between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["sentence"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

In [42]:
# Tokenizer matching the MODEL_NAME checkpoint. It is a module-level global:
# HateDataset.__init__ and the test-set prediction loop both read it by name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class HateDataset(Dataset):
    """Map-style dataset of pre-tokenized sentences with their class labels.

    The whole corpus is tokenized once, up front, with the module-level
    ``tokenizer`` (truncated to ``MAX_LEN``, padded); individual samples are
    converted to tensors lazily in ``__getitem__``.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and store ``labels``.

        Both arguments must expose ``.tolist()`` (e.g. pandas Series); they are
        expected to be aligned position by position.
        """
        self.encodings = tokenizer(
            texts.tolist(),
            truncation=True,
            padding=True,
            max_length=MAX_LEN,
        )
        self.labels = labels.tolist()

    def __len__(self):
        """Return the number of samples (one per stored label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per tokenizer output field plus a "labels"
        entry, which is the key layout the training and evaluation loops unpack
        into ``model(**batch)``.
        """
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Training split: tokenized once, then reshuffled by the loader every epoch.
train_dataset = HateDataset(train_texts, train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation split: iterated in its stored order (no shuffling).
val_dataset = HateDataset(val_texts, val_labels)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [43]:
# Load the pretrained encoder with a fresh 2-class classification head and move
# it to the selected device (.to() on a module returns the module itself).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
).to(DEVICE)

# Optimise every model parameter with AdamW at a fixed learning rate of 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
# Fine-tune the classifier for EPOCHS passes over the training loader.
for epoch in range(EPOCHS):
    # Re-enter training mode at the start of every epoch so the loop is still
    # correct when this cell is re-run after the evaluation cell has switched
    # the model to eval mode.
    model.train()
    running_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(loop, start=1):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        # Clear gradients *before* the forward/backward pass: if a previous run
        # of this cell was interrupted between backward() and step(), stale
        # gradients would otherwise be added to the first update.
        optimizer.zero_grad()
        outputs = model(**batch)  # the batch includes "labels", so .loss is populated
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        running_loss += batch_loss
        # A single mini-batch loss is noisy; show the running epoch mean too so
        # the progress bar reflects the trend rather than the last batch only.
        loop.set_postfix(loss=batch_loss, mean_loss=running_loss / step)

Epoch 1: 100%|██████████| 502/502 [02:44<00:00,  3.06it/s, loss=0.0773]
Epoch 2: 100%|██████████| 502/502 [02:44<00:00,  3.05it/s, loss=0.0822]
Epoch 3: 100%|██████████| 502/502 [02:44<00:00,  3.06it/s, loss=0.168]
Epoch 4: 100%|██████████| 502/502 [02:44<00:00,  3.06it/s, loss=0.333]
Epoch 5: 100%|██████████| 502/502 [02:44<00:00,  3.06it/s, loss=0.00389]
Epoch 6: 100%|██████████| 502/502 [02:44<00:00,  3.06it/s, loss=0.00195]
Epoch 7: 100%|██████████| 502/502 [02:44<00:00,  3.05it/s, loss=0.00149]
Epoch 8: 100%|██████████| 502/502 [02:44<00:00,  3.06it/s, loss=0.000192]
Epoch 9: 100%|██████████| 502/502 [02:44<00:00,  3.05it/s, loss=0.00142]
Epoch 10: 100%|██████████| 502/502 [02:44<00:00,  3.05it/s, loss=0.000166]


In [45]:
# Score the fine-tuned model on the held-out validation split.
model.eval()  # inference mode for layers such as dropout
all_preds, all_labels = [], []

with torch.no_grad():  # no gradients are needed for evaluation
    for batch in val_loader:
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # Predicted class = index of the largest logit in each row.
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

print("Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds))

Accuracy: 0.93827775012444
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1827
           1       0.67      0.62      0.64       182

    accuracy                           0.94      2009
   macro avg       0.82      0.79      0.80      2009
weighted avg       0.94      0.94      0.94      2009



In [46]:
# Read the unlabeled test sentences: one per line, surrounding whitespace
# stripped, blank lines dropped.
with open("/content/drive/MyDrive/hate_test_data.txt", "r", encoding="utf-8") as f:
    sentences = [text for text in (line.strip() for line in f) if text]

# Do not rely on an earlier cell having left the model in eval mode: with
# dropout active the predictions would be non-deterministic.
model.eval()

predictions = []
with torch.no_grad():
    # Classify BATCH_SIZE sentences per forward pass instead of one at a time.
    # padding=True pads each chunk to its longest sentence and the attention
    # mask produced by the tokenizer hides the padded positions from the model.
    for start in range(0, len(sentences), BATCH_SIZE):
        chunk = sentences[start:start + BATCH_SIZE]
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LEN)
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        logits = model(**inputs).logits
        predictions.extend(str(pred) for pred in torch.argmax(logits, dim=1).tolist())

# One predicted class id per line, in the same order as `sentences`
# (no header row, no trailing newline).
with open("pred.csv", "w", encoding="utf-8") as f:
    f.write("\n".join(predictions))