In [50]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoConfig, AutoTokenizer, AutoModel, pipeline, BertForSequenceClassification
from transformers import logging as hflogging
from transformers import pipeline
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

Define a custom classifier object

In [11]:
class TransformerBinaryClassifier(torch.nn.Module):
    """Binary classifier made of a pretrained transformer encoder and a small head.

    The encoder's token vectors are mean-pooled into one vector per text, which
    is passed through dropout, a linear layer with a single output, and a
    sigmoid, so ``forward`` returns one probability per input text.

    Attributes set in ``__init__``:
        lmconfig: configuration loaded for ``plm_name``.
        lmtokenizer: tokenizer loaded for ``plm_name``.
        lm: the pretrained encoder.
        emb_dim: the encoder's hidden size (input width of the head).
        output_size: number of outputs of the head (1).
        classifier: dropout -> linear -> sigmoid head.
        loss_fn: mean-reduced binary cross-entropy on probabilities.
    """

    def __init__(self, plm_name: str):
        """Load config, tokenizer and encoder for ``plm_name`` and build the head."""
        super().__init__()
        self.lmconfig = AutoConfig.from_pretrained(plm_name)
        self.lmtokenizer = AutoTokenizer.from_pretrained(plm_name)
        self.lm = AutoModel.from_pretrained(plm_name, output_attentions=False)
        self.emb_dim = self.lmconfig.hidden_size
        self.output_size = 1
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(self.emb_dim, self.output_size),
            torch.nn.Sigmoid(),
        )
        self.loss_fn = torch.nn.BCELoss(reduction='mean')

    def forward(self, x):
        """Return one probability per text in the batch.

        Args:
            x: mapping with ``'input_ids'`` and ``'attention_mask'`` entries,
                both indexed as (batch, tokens).

        Returns:
            Tensor with the trailing size-1 dimension of the head output
            squeezed away (one value per text).
        """
        attention_mask = x['attention_mask']
        # Keyword arguments: the position of `attention_mask` in the encoder's
        # signature differs between architectures.
        hidden = self.lm(
            input_ids=x['input_ids'], attention_mask=attention_mask
        ).last_hidden_state  # N,T,d

        # Mean-pool over real tokens only. A plain `.mean(dim=1)` would also
        # average the vectors of padding positions, making a text's score
        # depend on how much padding its batch happened to need.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)  # N,T,1
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0 on an all-padding row
        global_vects = (hidden * mask).sum(dim=1) / token_counts  # N,d

        probs = self.classifier(global_vects)
        return probs.squeeze(-1)

    def compute_loss(self, predictions, target):
        """Binary cross-entropy between predicted probabilities and 0/1 targets.

        Targets are cast to the dtype of ``predictions`` because ``BCELoss``
        rejects integer targets.
        """
        return self.loss_fn(predictions, target.to(predictions.dtype))
    
# Build the classifier on top of the pretrained "bert-base-uncased" checkpoint.
model = TransformerBinaryClassifier("bert-base-uncased")

Load the data and format it as a HuggingFace `Dataset`.

In [5]:
# Read the raw pairs and keep a single row per `original_id`.
pairs = (
    pd.read_csv("Data/pairs_with_ratings.tsv", sep="\t")
    .drop_duplicates(["original_id"])
)

# Stack both title columns into one text column: label 1 for `original_title`,
# label 0 for `title`. `ignore_index=True` gives every row a unique index label
# (a plain concat would repeat each label twice).
data = pd.DataFrame({
    "text": pd.concat([pairs["original_title"], pairs["title"]], axis=0, ignore_index=True),
    "label": [1] * len(pairs) + [0] * len(pairs),
})

# Split dataset into train and test (fixed seed so the split is reproducible)
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Convert pandas DataFrame to HuggingFace Dataset. `preserve_index=False` stops
# the shuffled pandas index from being stored as an extra `__index_level_0__`
# column that would otherwise be collated into every batch.
ds_train = Dataset.from_pandas(train_df, preserve_index=False)
ds_test = Dataset.from_pandas(test_df, preserve_index=False)

Transform text into tokens

In [21]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the model's tokenizer.

    The tokenizer is called with ``truncation=True``; its result is returned
    unchanged.
    """
    tokenizer = model.lmtokenizer
    encoded = tokenizer(examples["text"], truncation=True)
    return encoded


def _prepare_split(ds):
    """Tokenize one split and leave only model inputs plus a ``labels`` column.

    Drops the raw ``text`` column and, when present, the ``__index_level_0__``
    column that ``Dataset.from_pandas`` adds for a non-default pandas index;
    then renames ``label`` to ``labels``.
    """
    tokenized = ds.map(tokenize_function, batched=True)
    unused = [c for c in ("text", "__index_level_0__") if c in tokenized.column_names]
    return tokenized.remove_columns(unused).rename_column("label", "labels")


# tokenize datasets
tok_ds_train = _prepare_split(ds_train)
tok_ds_test = _prepare_split(ds_test)

Map: 100%|██████████| 1905/1905 [00:00<00:00, 47913.78 examples/s]
Map: 100%|██████████| 477/477 [00:00<00:00, 44192.50 examples/s]


Now we will create the data loaders for the train and test sets. As a collate function (i.e. the function used by a data loader to build batches from instances sampled from a dataset), we use the `DataCollatorWithPadding` class from the `transformers` library. It automatically performs dynamic (batch-wise) padding. It can do this because it recognizes the relevant "keys" that encode the input texts, i.e. the dataset column names `input_ids` and `attention_mask`:

In [45]:
# Collate function: pads the examples of a batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=model.lmtokenizer,
    padding=True,
    return_tensors='pt',
)

# Batch iterators over the tokenized splits; only the training split is shuffled.
train_dataloader = DataLoader(
    tok_ds_train,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tok_ds_test,
    batch_size=8,
    collate_fn=data_collator,
)


In [48]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# One scheduler step per training batch, over all epochs.
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

lr_scheduler

<torch.optim.lr_scheduler.LambdaLR at 0x1ba1899ee40>

In [51]:
model.train()

# The context manager closes the progress bar even if training is interrupted.
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            predictions = model(batch)
            # Use the model's own loss entry point; BCELoss needs float targets.
            loss = model.compute_loss(predictions, batch['labels'].float())
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Show the running loss so a diverging run is visible early.
            progress_bar.set_postfix(epoch=epoch + 1, loss=f"{loss.item():.4f}")
            progress_bar.update(1)



100%|██████████| 717/717 [10:36<00:00,  1.27it/s]

In [52]:
def predict(model, texts, threshold=0.5):
    """Label each text as ``"positive"`` or ``"negative"`` using ``model``.

    Args:
        model: classifier exposing ``lmtokenizer``, ``eval()``, ``parameters()``
            and returning one probability per text when called on the
            tokenizer output.
        texts: a single string or an iterable of strings.
        threshold: probabilities strictly above this value are labelled
            ``"positive"``; everything else is ``"negative"``.

    Returns:
        A list of ``(text, label)`` tuples in input order; ``[]`` for empty input.

    Side effect: the model is switched to evaluation mode.
    """
    # A bare string would otherwise be zipped character by character below.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    model.eval()

    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    encoded_texts = model.lmtokenizer(
        texts,
        truncation=True,
        padding=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        output = model(encoded_texts.to(target_device)).tolist()

    pred_labels = ["positive" if p > threshold else "negative" for p in output]
    return list(zip(texts, pred_labels))

In [53]:
# Two ad-hoc sentences for a quick smoke test of `predict`.
# NOTE(review): the training labels in this notebook mark `original_title` (1)
# versus `title` (0), so the "positive"/"negative" strings returned here are
# presumably not sentiment judgements - confirm the intended reading.
texts = ["That was a horrible movie!", "I really liked it! Nice acting."]

predict(model, texts)

[('That was a horrible movie!', 'negative'),
 ('I really liked it! Nice acting.', 'negative')]