In [15]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

In [16]:
data = pd.read_csv('IMDB_Dataset.csv')

In [17]:
print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [19]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [20]:
dataset = Dataset.from_pandas(data)

In [21]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [22]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def tokenize_function(examples):
    return tokenizer(examples['review'], truncation=True, padding='max_length', max_length=512)

In [24]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [25]:
train_dataset = train_dataset.rename_column("sentiment", "labels")
test_dataset = test_dataset.rename_column("sentiment", "labels")

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [35]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1057,0.285214
2,0.105,0.270796
3,0.0361,0.28379


TrainOutput(global_step=3750, training_loss=0.07621046259452899, metrics={'train_runtime': 1532.4896, 'train_samples_per_second': 78.304, 'train_steps_per_second': 2.447, 'total_flos': 3.15733266432e+16, 'train_loss': 0.07621046259452899, 'epoch': 3.0})

In [37]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 0.27079567313194275, 'eval_runtime': 40.3313, 'eval_samples_per_second': 247.946, 'eval_steps_per_second': 7.761, 'epoch': 3.0}


In [38]:
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
accuracy = accuracy_score(test_dataset['labels'], pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(test_dataset['labels'], pred_labels, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

def compute_metrics(p):
    pred, labels = p
    pred = pred.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
    acc = accuracy_score(labels, pred)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

Accuracy: 0.933
Precision: 0.9482867353373975
Recall: 0.9140830800405269
F1 Score: 0.9308708212959141
