In [1]:
!pip install transformers
!pip install pytorch-lightning

import pandas as pd



In [2]:
data_path = "./dataset/processed/train.csv"

In [3]:
ds = pd.read_csv(data_path, dtype={"id": str})
ds.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,score
0,0,59848,"this is so cool. it is like, 'would you want y...",0
1,1,59849,thank you!! this would make my life a lot less...,0
2,2,59852,this is such an urgent design problem; kudos t...,0
3,3,59855,is this something i will be able to install on...,0
4,4,59856,haha you guys are a bunch of losers.,9


In [4]:
# max_token_len = ds["comment_text"].str.len().max() + 2
max_token_len = 512

In [5]:
X = ds["comment_text"]
X

0          this is so cool. it is like, 'would you want y...
1          thank you!! this would make my life a lot less...
2          this is such an urgent design problem; kudos t...
3          is this something i will be able to install on...
4                       haha you guys are a bunch of losers.
                                 ...                        
1964286    ":::::and for the second time of asking, when ...
1964287    you should be ashamed of yourself \n\nthat is ...
1964288    spitzer \n\numm, there is no actual article fo...
1964289    and it looks like it was actually you who put ...
1964290    "\nand ... i really do not think you understan...
Name: comment_text, Length: 1964291, dtype: object

In [6]:
y = ds["score"]
y

0          0
1          0
2          0
3          0
4          9
          ..
1964286    0
1964287    0
1964288    0
1964289    0
1964290    0
Name: score, Length: 1964291, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=103)

In [8]:
train_ds = pd.DataFrame({"comment_text": X_train, "score": y_train})
train_ds

Unnamed: 0,comment_text,score
519999,interesting how the democrat liberals turn the...,8
1892924,ban\nit was only for six months that ban is ov...,0
1115773,i do not know of too many men who are publicly...,0
759297,i wonder if you called for the imprisonment of...,2
1745627,pam smith has been a sycophantic party gadfly ...,0
...,...,...
783558,and because politicians have houses\nthey can ...,0
194361,mr. trump has exceeded his first amendment pro...,0
1662299,"""it is a unique forum that newspapers offer. i...",2
1662739,"i think we too are the people who, on the one ...",3


In [9]:
val_ds = pd.DataFrame({"comment_text": X_test, "score": y_test})
val_ds

Unnamed: 0,comment_text,score
1539379,"now that we got that out of the way, can we st...",0
1143896,the only fraud in this picture is trump himself.,0
1268050,liberals have to come to grips with the fact t...,0
84155,lying? it was a facebook post?,0
1885782,"thanks, delicious carbuncle. this admission, h...",0
...,...,...
1423548,[right could have ironically benefited today f...,0
496621,the comments by these doctors that there are m...,0
989732,the answer to this question is self evident. \...,0
809035,"hi judyrae, well, it is been 7 years now, and ...",0


In [10]:
from torch.utils.data import Dataset
import torch

In [11]:
class ToxicityDataset(Dataset):
    def __init__(self, data, tokenizer, max_token_len, sample = 700_000):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = int(max_token_len)
        self.sample = sample
        self.__prepare_data()

    def __prepare_data(self):
        if self.sample is not None:
            toxic = self.data.loc[self.data["score"] > 0]
            not_toxic = self.data.loc[self.data["score"] == 0]
            self.data = pd.concat([toxic, not_toxic.sample(self.sample, random_state=53)])
            print(self.data.head())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if index > len(self.data):
            raise IndexError("Index out of bound: ", f"index: {index}, len: {len(self.data)}")
        item = self.data.iloc[index]
        comment = str(item.comment_text)
        score = torch.tensor(item.iloc[1], dtype=torch.float32)
        # score = torch.FloatTensor(item[["score"]])
        
        tokens = self.tokenizer.encode_plus(comment,
                                           add_special_tokens=True,
                                           return_tensors="pt",
                                           truncation=True,
                                           max_length=self.max_token_len,
                                           padding="max_length",
                                           return_attention_mask=True)

        return {"input_ids": tokens.input_ids.flatten(), "attention_mask": tokens.attention_mask.flatten(), "label": score}

In [12]:
from transformers import AutoTokenizer
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = ToxicityDataset(train_ds, tokenizer, max_token_len)
val_dataset = ToxicityDataset(val_ds, tokenizer, max_token_len, sample=None)

                                              comment_text  score
519999   interesting how the democrat liberals turn the...      8
759297   i wonder if you called for the imprisonment of...      2
477418   we all have the ability to hate, just look at ...      2
1588926  terrorist attack on canadian soil...no comment...      2
1031047  subjective guilt has never been relevant to th...      5


In [13]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

In [14]:
class ToxicityDataModule(pl.LightningDataModule):
    def __init__(self, train_data, val_data, max_token_len, model_name, batch_size):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage = "fit"):
        if stage == "fit":
            self.train_ds = ToxicityDataset(self.train_data, self.tokenizer, max_token_len)
            self.val_ds = ToxicityDataset(self.val_data, self.tokenizer, max_token_len, sample=None)
        if stage == "predict":
            self.val_ds = ToxicityDataset(self.val_data, self.tokenizer, max_token_len, sample=None)

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size = self.batch_size, num_workers=7, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size = self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.val_ds, batch_size = self.batch_size, num_workers=4, shuffle=False)

In [15]:
from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F

In [16]:
class ToxicityClassifier(pl.LightningModule):
    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config["model_name"])
        # hidden layer
        self.hidden = nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        # classification layer
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, self.config["n_labels"])
        torch.nn.init.xavier_uniform_(self.hidden.weight)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.CrossEntropyLoss(reduction="mean")
        # dropout layer
        self.dropout = nn.Dropout()

    def forward(self, input_ids, attention_mask, label=None):
        # roberta model
        output = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1)
        # nerual network classfication layer
        pooled_output = self.hidden(pooled_output)
        # activation function
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # calculate loss
        loss = 0

        if label is not None:
            loss = self.loss_func(logits.view(-1, self.config["n_labels"]), label.view(-1, self.config['n_labels']))
            return loss, logits

    def training_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("train loss", loss, prog_bar = True, logger = True)
        return { "loss": loss, "predictions": logits, "label": batch["label"]}

    def validation_step(self, batch, batch_index):
        loss, logits = self(**batch)
        self.log("validation loss", loss, prog_bar = True, logger = True)
        return { "val_loss": loss, "predictions": logits, "label": batch["label"]}

    def prediction_step(self, batch, batch_index):
        _, logits = self(**batch)
        return logits

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.config["lr"], weight_decay=self.config["w_decay"])
        # optimizer = AdamW(self.parameters(), lr=self.config["lr"], weight_decay=self.config["w_decay"])
        total_steps = self.config["train_size"] / self.config["batch_size"]
        warmup_steps = math.floor(total_steps + self.config["warmup"])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer], [scheduler]
        

In [17]:
data_module = ToxicityDataModule(train_ds, val_ds, max_token_len, model_name = "roberta-base", batch_size=128)
data_module.setup()

                                              comment_text  score
519999   interesting how the democrat liberals turn the...      8
759297   i wonder if you called for the imprisonment of...      2
477418   we all have the ability to hate, just look at ...      2
1588926  terrorist attack on canadian soil...no comment...      2
1031047  subjective guilt has never been relevant to th...      5


In [18]:
config = {
    "model_name": "distilroberta-base",
    "n_labels": 1,
    "batch_size": 128,
    "lr": 1.5e-6,
    "warmup": 0.2,
    "train_size": len(data_module.train_dataloader()) ,
    "w_decay": 0.001,
    "n_epochs": 1
}

In [19]:
data_module = ToxicityDataModule(train_ds, val_ds, max_token_len, model_name = config["model_name"], batch_size = config["batch_size"])
data_module.setup()

                                              comment_text  score
519999   interesting how the democrat liberals turn the...      8
759297   i wonder if you called for the imprisonment of...      2
477418   we all have the ability to hate, just look at ...      2
1588926  terrorist attack on canadian soil...no comment...      2
1031047  subjective guilt has never been relevant to th...      5


In [21]:
model = ToxicityClassifier(config)

In [22]:
idx=0
input_ids = train_dataset.__getitem__(idx)['input_ids']
attention_mask = train_dataset.__getitem__(idx)['attention_mask']
label = train_dataset.__getitem__(idx)['label']
model.cpu()
print(label)
loss, output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0), label.unsqueeze(dim=0))
print(label.shape, output.shape, output)

tensor(8.)
torch.Size([]) torch.Size([1, 1]) tensor([[1.0777]], grad_fn=<AddmmBackward0>)


In [None]:
if __name__ == "__main__":
    trainer = pl.Trainer(max_epochs=config["n_epochs"], num_sanity_val_steps=2, logger = True, enable_progress_bar = True, num_nodes = 1)
    trainer.fit(model, data_module)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs






  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | pretrained_model | RobertaModel     | 82.1 M | eval 
1 | hidden           | Linear           | 590 K  | train
2 | classifier       | Linear           | 769    | train
3 | loss_func        | CrossEntropyLoss | 0      | train
4 | dropout          | Dropout          | 0      | train
--------------------------------------------------------------
82.7 M    Trainable params
0         Non-trainable params
82.7 M    Total params
330.839   Total estimated model params size (MB)


                                              comment_text  score
519999   interesting how the democrat liberals turn the...      8
759297   i wonder if you called for the imprisonment of...      2
477418   we all have the ability to hate, just look at ...      2
1588926  terrorist attack on canadian soil...no comment...      2
1031047  subjective guilt has never been relevant to th...      5


Sanity Checking: |                                                                           | 0/? [00:00<?, ?…

C:\Users\Long\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
