In [21]:
import csv

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Choose the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Checkpoint to fine-tune (author's note: "apparently faster?").
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 requests a binary classification head. That head is newly initialised
# (see the warning printed under this cell), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
).to(device)  # place the weights on the selected device (GPU when one is available)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# The LIAR splits are read as header-less, tab-separated files; the cells below use
# column 1 as the label and column 2 as the statement text.
#
# quoting=csv.QUOTE_NONE treats double quotes as ordinary characters. With pandas'
# default quoting, a statement containing an unbalanced '"' makes the parser swallow
# the following tabs/newlines, silently merging several rows into one text field.
# NOTE(review): presumably the statements contain such literal quotes -- after this
# change, confirm the row counts against the dataset's published split sizes.
train_data = pd.read_csv('./LIAR/train.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)  # train data
test_data = pd.read_csv('./LIAR/test.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)  # test data

# if a label is not completely true, assume it's false
def changeToBinaryCls(label):
    """Collapse a truthfulness label into a binary target.

    Args:
        label: Raw label value. Expected to be a string; any other value
            (for example the float NaN pandas uses for a missing cell) is
            treated as "not true".

    Returns:
        bool: True only when the label equals "true" after stripping
        surrounding whitespace and ignoring case; False for everything else.
    """
    if not isinstance(label, str):
        # Non-string labels have no .strip(); a missing label is not "true".
        return False
    return label.strip().lower() == 'true'

# Overwrite the label column (column 1) of each split with its binary version.
for split_frame in (train_data, test_data):
    split_frame[1] = split_frame[1].apply(changeToBinaryCls)

In [4]:
# Pull the statement text (column 2) and the binary label (column 1) out of each
# split as NumPy arrays.
train_text, train_label = train_data[2].values, train_data[1].values
test_text, test_label = test_data[2].values, test_data[1].values

In [25]:
# Class balance of the training labels: the distinct values and how often each occurs.
np.unique(train_label, return_counts=True)

(array([False,  True]), array([8564, 1676]))

In [5]:
#tokenize text
def preprocess(texts):
    """Tokenize a batch of texts with the notebook-level ``tokenizer``.

    Args:
        texts: The texts to encode (the notebook passes a list of strings).

    Returns:
        The tokenizer's output, requested as PyTorch tensors with padding
        and truncation enabled.
    """
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    return encoded


In [6]:
# Encode each split in a single call; the NumPy arrays are converted to plain lists first.
tokenized_train_text, tokenized_test_text = (
    preprocess(split_text.tolist()) for split_text in (train_text, test_text)
)

In [7]:
from torch.utils.data import Dataset
import torch

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# putting data into datasets
# One dataset per split, each pairing the tokenized text with its binary labels.
train_dataset = TextDataset(encodings=tokenized_train_text, labels=train_label)
test_dataset = TextDataset(encodings=tokenized_test_text, labels=test_label)

In [8]:
from torch.utils.data import DataLoader #iterator
from transformers import AdamW
import numpy as np 
from transformers import get_scheduler

#put data into dataloader for iteration
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# upstream in favour of the PyTorch implementation (the `from transformers import
# AdamW` line above is then unused).
# NOTE(review): eps and weight_decay are pinned to what I believe were the old
# transformers defaults so the configured hyper-parameters stay the same; the
# update rule is close to, but not bit-identical with, the old optimizer -- confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3
# One optimizer step per batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
# Linear learning-rate schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)



1920


In [9]:
#show training progress
from tqdm.auto import tqdm

# One tick of the bar per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()  # training mode
for epoch in range(num_epochs):
    for cpu_batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}

        # The batch includes 'labels', so the model output carries the loss.
        loss = model(**batch).loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear the gradients
        # so they do not accumulate into the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
100%|██████████| 1920/1920 [17:02<00:00,  1.88it/s]

In [12]:
#evaluate the model
import evaluate

# The GLUE/MRPC metric reports accuracy and F1 (see the cell output).
metric = evaluate.load("glue", "mrpc")
model.eval()  # evaluation mode
for cpu_batch in test_dataloader:
    batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}
    with torch.no_grad():  # no gradients are needed for scoring
        logits = model(**batch).logits

    # Predicted class = index of the largest logit.
    predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'accuracy': 0.7884767166535123, 'f1': 0.20710059171597633}