## Import data and format

In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings;
# consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')

In [3]:
!pip install datasets

import pandas as pd
from datasets import Dataset

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [4]:
# Load the IMDB review CSV from GitHub into a DataFrame; the preview below shows
# a `review` text column and a `sentiment` column.
data = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv')
# Preview the first rows.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Wrap the DataFrame in a Hugging Face Dataset so it can be mapped/tokenized and fed to Trainer.
dataset = Dataset.from_pandas(data)
# Hold out 30% of the rows as the `test` split. The seed is fixed so that the split
# (and therefore every metric reported further down) is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [6]:
# Show the column names of each split.
dataset.column_names

{'train': ['review', 'sentiment'], 'test': ['review', 'sentiment']}

In [7]:
# Count rows per sentiment value to check class balance before training.
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [8]:
# Lookup tables between the sentiment strings and integer class ids.
label2id = {'negative': 0, 'positive': 1}
# Inverse mapping, derived so the two tables cannot drift apart.
id2label = {class_id: name for name, class_id in label2id.items()}
# Add an integer `label` column to every example by looking its `sentiment` string up in label2id.
dataset = dataset.map(lambda row: {'label': label2id[row['sentiment']]})


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [9]:
# Inspect the first training example after the `label` column was added.
dataset['train'][0]

{'review': 'I have given this film an elevated rating of 2 stars as I personally appear in minutes 42 and 43 of the film....the road side bar scene in Russia. In this scene the director of the movie offered me the immortal line - "50 Dollars..you Drink and Talk", but I felt that my Polish counterpart could speak in a more convincing Russian accent than I could, so I declined to take this speaking part on. I was slightly starstruck as this was my first Film experience....and who knows... these lines could have ended up there with lines such as "I\'ll be Back" and "Quite Frankly My Dear, I Don\'t Give a Damn". Had I spoken that one line then my name would appear in the credits of Rancid Aluminium as \'Heavy 1\' instead of the name of Ryszard Janikowski. <br /><br />As time goes on, I am counting myself lucky that my name is in no way connected to this film.<br /><br />Even though I spent a whole day on the set, in South Wales hot-spot Barry Island, no one could tell me what the actual st

## Tokenize the inputs

In [10]:
from transformers import AutoTokenizer
import torch

In [11]:
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hub checkpoint shared by the tokenizer here and the model loaded later.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [12]:
# Print the raw first training example, then the tokenizer output for its review text
# (called here without padding/truncation arguments).
print(dataset['train'][0])
print(tokenizer(dataset['train'][0]['review']))

{'review': 'I have given this film an elevated rating of 2 stars as I personally appear in minutes 42 and 43 of the film....the road side bar scene in Russia. In this scene the director of the movie offered me the immortal line - "50 Dollars..you Drink and Talk", but I felt that my Polish counterpart could speak in a more convincing Russian accent than I could, so I declined to take this speaking part on. I was slightly starstruck as this was my first Film experience....and who knows... these lines could have ended up there with lines such as "I\'ll be Back" and "Quite Frankly My Dear, I Don\'t Give a Damn". Had I spoken that one line then my name would appear in the credits of Rancid Aluminium as \'Heavy 1\' instead of the name of Ryszard Janikowski. <br /><br />As time goes on, I am counting myself lucky that my name is in no way connected to this film.<br /><br />Even though I spent a whole day on the set, in South Wales hot-spot Barry Island, no one could tell me what the actual st

In [13]:
def tokenize(batch):
    """Tokenize the ``review`` entries of ``batch`` with the module-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'review'`` key holding the text(s) to encode.

    Returns:
        The tokenizer's output for those texts, requested with padding enabled
        and truncation to at most 300 tokens.
    """
    return tokenizer(
        batch['review'],
        padding=True,
        truncation=True,
        max_length=300,
    )

# Apply `tokenize` to both splits in batched mode. batch_size=None asks `map` to hand
# each split over as one batch (see the `datasets.Dataset.map` docs).
# NOTE(review): together with padding=True inside `tokenize`, this presumably pads every
# example of a split to a common length — confirm this is intended.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [14]:
# Confirm the tokenizer's columns now sit alongside the original ones.
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

## Model training and evaluation

In [15]:
!pip install evaluate

import evaluate
import numpy as np
import sklearn

# Load the "accuracy" metric once; `compute_metrics` below reuses this object.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. The predictions are reduced
            with an argmax over axis 1 to pick one class id per row.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head sized to the label set.
# The label maps are passed along so the model config carries the readable label names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Hyper-parameters and run configuration for fine-tuning.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation at the end of every epoch.
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy`;
    # confirm against the installed version before changing it.
    evaluation_strategy='epoch',
    # Disable all logging integrations (wandb etc.) here, at construction time.
    # Setting the WANDB_DISABLED environment variable in a later cell comes too late:
    # the recorded training output still shows the wandb banner.
    report_to='none',
)

# Wire the model, arguments, data and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    # The held-out `test` split doubles as the evaluation set used during training.
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    # presumably passed so the tokenizer is saved with the model (the later pipeline
    # cell loads from the saved directory only) — TODO confirm
    tokenizer=tokenizer
)

In [18]:
import os
# Ask the wandb integration to stay disabled via its environment switch.
# NOTE(review): this cell runs after TrainingArguments/Trainer were already built, and the
# training output below still prints a wandb banner, so it appears to be too late here —
# move it above the TrainingArguments cell or disable reporting in the arguments instead.
os.environ["WANDB_DISABLED"] = "true"

In [19]:
# Fine-tune for the configured number of epochs; the per-epoch loss/accuracy table is shown below.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3521,0.330054,0.8618
2,0.2908,0.299191,0.872267
3,0.2562,0.291453,0.880067


TrainOutput(global_step=3282, training_loss=0.3169583094548627, metrics={'train_runtime': 668.7852, 'train_samples_per_second': 157.001, 'train_steps_per_second': 4.907, 'total_flos': 882184338000000.0, 'train_loss': 0.3169583094548627, 'epoch': 3.0})

In [20]:
# Evaluate the trained model on the eval dataset given to the Trainer (`dataset['test']`).
trainer.evaluate()

{'eval_loss': 0.2914528548717499,
 'eval_accuracy': 0.8800666666666667,
 'eval_runtime': 29.7977,
 'eval_samples_per_second': 503.394,
 'eval_steps_per_second': 15.739,
 'epoch': 3.0}

In [21]:
# Save the fine-tuned model under ./tinybert-sentiment-analysis; the inference
# pipeline cell later loads from this directory name.
trainer.save_model('tinybert-sentiment-analysis')

In [22]:
# Hand-written sample reviews for a quick check of the saved model.
# NOTE: this rebinds `data`, which earlier in the notebook held the IMDB DataFrame.
data = [
    'this movie was horrible, the plot was really boring. acting was okay',
    'the movie is really sucked. there is not plot and acting was bad',
    'what a beautiful movie. great plot. acting was good. will see it again',
]

In [23]:
from transformers import pipeline
import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build an inference pipeline from the directory written by `trainer.save_model`.
classifier = pipeline(
    'text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

# Classify the sample reviews; the result is displayed as the cell output.
classifier(data)

[{'label': 'negative', 'score': 0.9912823438644409},
 {'label': 'negative', 'score': 0.9912341237068176},
 {'label': 'positive', 'score': 0.9893152713775635}]