In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

<br>
<br>
<br>

### Data Collection

In [None]:
# Load the review dataset from a CSV file in the current working directory
data = pd.read_csv("./data.csv")
# Preview the first three rows
data.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


In [None]:
# Inspect the (rows, columns) shape of the freshly loaded frame
data.shape

(50000, 2)

In [None]:
# Remove exact duplicate rows. drop=True discards the old index instead of
# inserting it as an extra "index" column, so the frame keeps only its
# original columns after de-duplication.
data = data.drop_duplicates().reset_index(drop=True)
data.shape

(49582, 3)

<br>
<br>
<br>

### Data Preparation

In [None]:
# Separate the feature column: the raw review text
X = data.review
X.head(3)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
Name: review, dtype: object

In [None]:
# Separate the target column: the sentiment label string
y = data.sentiment
y.head(3)

0    positive
1    positive
2    positive
Name: sentiment, dtype: object

In [None]:
# Hold out 10% of the rows as a test split, stratified on the label so both
# splits keep the same class balance. The fixed random_state makes the split
# identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, stratify=y, test_size=0.1, random_state=42
)

In [None]:
# Check that features and labels have matching lengths within each split
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(44623,)
(4959,)
(44623,)
(4959,)


<br>

In [None]:
# remove html tags from text
cleaner_regex = re.compile('<.*?>')
# Runs of whitespace left behind once tags are replaced by spaces
_whitespace_regex = re.compile(r'\s+')

def remove_tags(text):
    """Strip HTML-like tags from ``text`` without gluing neighbouring words.

    Each tag is replaced by a space rather than an empty string, so markup
    such as ``"production.<br /><br />The"`` becomes ``"production. The"``
    instead of ``"production.The"``. Whitespace runs are then collapsed to a
    single space and leading/trailing whitespace is trimmed.

    Parameters
    ----------
    text : str
        Raw text that may contain tags like ``<br />``.

    Returns
    -------
    str
        The text with tags removed and whitespace normalised.
    """
    without_tags = cleaner_regex.sub(' ', text)
    return _whitespace_regex.sub(' ', without_tags).strip()

In [None]:
# Strip HTML markup from every training review
x_train_cleaned = x_train.apply(remove_tags)
x_train_cleaned.head(3)

34455    In fact, parts of it I liked a lot. It had som...
3811     This is primarily about love in WWII, yet we m...
32781    From the writer of "What Ever Happened to Baby...
Name: review, dtype: object

In [None]:
# Strip HTML markup from every test review
x_test_cleaned = x_test.apply(remove_tags)
x_test_cleaned.head(3)

6653     No awards show can please all the people. Clea...
44594    This movie was so poorly acted. What was with ...
25957    WHEN I first saw this film, in London, in 1958...
Name: review, dtype: object

In [None]:
# Verify that cleaning kept the number of rows in each split
print(x_train_cleaned.shape)
print(x_test_cleaned.shape)

(44623,)
(4959,)


<br>

In [None]:
# Encode training labels as integers: 'positive' -> 1, anything else -> 0
y_train_cleaned = y_train.eq('positive').astype('int64')
y_train_cleaned.head(3)

34455    0
3811     1
32781    1
Name: sentiment, dtype: int64

In [None]:
# Encode test labels as integers: 'positive' -> 1, anything else -> 0
y_test_cleaned = y_test.eq('positive').astype('int64')
y_test_cleaned.head(3)

6653     1
44594    0
25957    1
Name: sentiment, dtype: int64

In [None]:
# Verify that encoding kept the number of labels in each split
print(y_train_cleaned.shape)
print(y_test_cleaned.shape)

(44623,)
(4959,)


<br>
<br>
<br>

### Fine-Tuning

In [None]:
# Hugging Face Hub id of the pretrained checkpoint that gets fine-tuned below
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

In [None]:
# creating tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# creating model with 2 labels
# The checkpoint ships a 3-class classification head; asking for 2 labels with
# ignore_mismatched_sizes=True drops it and randomly initialises a new head.
# Seed torch first so that this initialisation is the same on every run.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2, ignore_mismatched_sizes=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

In [None]:
# Use the GPU when one is available; otherwise stay on the CPU so the cell
# does not raise on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [None]:
# Tokenize two sample sentences to inspect what the tokenizer returns

sample_data = ["I am eating","I am playing "]
# padding=True pads the shorter sentence up to the longest one in this list
# (visible as a 0 in its attention_mask); truncation caps length at max_length
tokenizer(sample_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[0, 100, 524, 4441, 2, 1], [0, 100, 524, 816, 1437, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1]]}

In [None]:
# Tokenize all cleaned training reviews in one call; sequences are padded to a
# common length and truncated to at most 512 tokens
train_tokens = tokenizer(list(x_train_cleaned), padding=True, truncation=True, max_length=512)

In [None]:
# Tokenize all cleaned test reviews with the same settings as the training set
test_tokens = tokenizer(list(x_test_cleaned), padding=True, truncation=True, max_length=512)

In [None]:
# Verify which fields the tokenizer produced for each split
print(train_tokens.keys())
print(test_tokens.keys())

dict_keys(['input_ids', 'attention_mask'])
dict_keys(['input_ids', 'attention_mask'])


In [None]:
# create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset wrapping tokenizer encodings and optional labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name (must include ``"input_ids"``) to a sequence
        holding one entry per example.
    labels : sequence, optional
        One label per example, indexable by integer position. May be a list,
        a numpy array or any other sized sequence. When omitted (or empty),
        items are returned without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when labels were supplied and are non-empty."""
        # An explicit None/length check is used instead of bool(labels):
        # the truth value of a multi-element numpy array or pandas Series is
        # ambiguous and raises ValueError.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return the features (and label, if present) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized reviews and their integer labels in torch datasets;
# labels are converted to plain lists so they are indexed by position

train_dataset = Dataset(train_tokens, list(y_train_cleaned))
test_dataset = Dataset(test_tokens, list(y_test_cleaned))

<br>
<br>
<br>

### Modelling

In [None]:
# compute model metrics
def compute_metrics(p):
    """Score predictions against reference labels.

    ``p`` unpacks into ``(scores, labels)``; the predicted class of each row
    is the index of its highest score. Returns a dict with the keys
    ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)

    # Metric name -> sklearn scoring function, in the order they are reported
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=references, y_pred=predicted)
        for name, scorer in scorers.items()
    }

In [None]:
# Training configuration: outputs are written under "output"; the model is
# trained for one epoch with 8 examples per device per step
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=1,
    per_device_train_batch_size=8

)

# The Trainer ties together the model, the training arguments, both datasets
# and the metric function used when evaluating
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Fine-tune the model on the training dataset (long-running cell)
trainer.train()

***** Running training *****
  Num examples = 44623
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5578
  Number of trainable parameters = 124647170


Step,Training Loss
500,0.3573
1000,0.3196
1500,0.3153
2000,0.2818
2500,0.2607
3000,0.2411
3500,0.2452
4000,0.2166
4500,0.2351
5000,0.202


Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to output/checkpoint-2500
Configuration saved in output/checkpoint-2500/config.json
Model weights saved in output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to output/checkpoint-3000
Configuration saved in output/checkpoint-3000/config.json
Model weights saved in output/check

TrainOutput(global_step=5578, training_loss=0.25987752956829435, metrics={'train_runtime': 4295.7445, 'train_samples_per_second': 10.388, 'train_steps_per_second': 1.298, 'total_flos': 1.174080462332928e+16, 'train_loss': 0.25987752956829435, 'epoch': 1.0})

In [None]:
# Evaluate model performance on the eval dataset passed to the Trainer
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 4959
  Batch size = 8


<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.2237410545349121,
 'eval_accuracy': 0.941318814277072,
 'eval_precision': 0.9364575059571089,
 'eval_recall': 0.9473684210526315,
 'eval_f1': 0.9418813660874775,
 'eval_runtime': 149.1099,
 'eval_samples_per_second': 33.257,
 'eval_steps_per_second': 4.158,
 'epoch': 1.0}

In [None]:
# making predictions

text = "The movie was not that good. But I loved the actors"
# Put the inputs on the model's own device so the cell works on GPU and CPU.
# An explicit max_length makes truncation=True effective and avoids the
# "no maximum length is provided" tokenizer warning.
inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(model.device)
# Inference only: disable dropout and skip building the autograd graph
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
# Softmax turns logits into class probabilities:
# column 0 = negative, column 1 = positive (matches the label encoding)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().numpy()
predictions

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


SequenceClassifierOutput(loss=None, logits=tensor([[-2.7686,  2.1023]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[0.0076, 0.9924]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


array([[0.00760796, 0.9923921 ]], dtype=float32)

<br>
<br>
<br>

### Saving & Loading

In [None]:
# Save the fine-tuned weights and config, plus the tokenizer files. The
# Trainer was created without a tokenizer, so save_model() alone writes only
# the model; saving the tokenizer as well makes the directory self-contained.
MODEL_DIR = '../models/movie_review_model'
trainer.save_model(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR);

Saving model checkpoint to movie_review_model
Configuration saved in movie_review_model/config.json
Model weights saved in movie_review_model/pytorch_model.bin


In [None]:
# Reload the fine-tuned model from disk and move it to the GPU when one is
# available, falling back to the CPU otherwise
model_2 = AutoModelForSequenceClassification.from_pretrained("../models/movie_review_model")
model_2 = model_2.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# making prediction on loaded model

text = "Not going to watch it again"
# Put the inputs on the loaded model's device; an explicit max_length makes
# truncation=True effective
inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt').to(model_2.device)
# Inference only: disable dropout and skip building the autograd graph
model_2.eval()
with torch.no_grad():
    outputs = model_2(**inputs)
# Softmax turns logits into class probabilities:
# column 0 = negative, column 1 = positive (matches the label encoding)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().numpy()
predictions

array([[0.99806505, 0.00193497]], dtype=float32)