## Import data and format 

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

In [3]:
# IMDB reviews CSV: one row per review, with `review` and `sentiment` columns.
IMDB_CSV_URL = (
    'https://raw.githubusercontent.com/laxmimerit/'
    'All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv'
)
data = pd.read_csv(IMDB_CSV_URL)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Wrap the DataFrame in a Hugging Face Dataset and hold out 30% of the rows
# for evaluation. The fixed seed makes the shuffle, and therefore the split,
# identical on every Restart & Run All.
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
dataset.column_names

{'train': ['review', 'sentiment'], 'test': ['review', 'sentiment']}

In [6]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
# Sentiment string -> integer class id, plus the inverse mapping.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}

# Add an integer `label` column derived from the `sentiment` string.
dataset = dataset.map(
    lambda example: {'label': label2id[example['sentiment']]}
)


Map: 100%|██████████| 35000/35000 [00:02<00:00, 12647.57 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 10991.77 examples/s]


In [8]:
dataset['train'][0]

{'review': 'I just saw "Checking Out" at the Philadelphia Film Festival. What a terrific combination of a heartwarming storyline and a great cast. Director Jeff Hare has done an outstanding job of inviting the audience into the disjointed, yet hilarious world of Morris Applebaum and family. The family life is presented in such a way that we enjoy the crazy antics yet feel the real pain and concern they have for one another.<br /><br />Typically I am not a Peter Falk fan, but he IS Morris Applebaum and plays the role with great humor and humanity.<br /><br />I hope that everyone gets to see this wonderful movie and enjoy it as I did.<br /><br />Hats off to the Director, Cast, and Crew for a job well done!',
 'sentiment': 'positive',
 'label': 1}

## Tokenize the inputs

In [9]:
from transformers import AutoTokenizer
import torch

In [10]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained checkpoint identifier; the fast tokenizer is loaded from the same checkpoint.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [11]:
print(dataset['train'][0])
print(tokenizer(dataset['train'][0]['review']))

{'review': 'I just saw "Checking Out" at the Philadelphia Film Festival. What a terrific combination of a heartwarming storyline and a great cast. Director Jeff Hare has done an outstanding job of inviting the audience into the disjointed, yet hilarious world of Morris Applebaum and family. The family life is presented in such a way that we enjoy the crazy antics yet feel the real pain and concern they have for one another.<br /><br />Typically I am not a Peter Falk fan, but he IS Morris Applebaum and plays the role with great humor and humanity.<br /><br />I hope that everyone gets to see this wonderful movie and enjoy it as I did.<br /><br />Hats off to the Director, Cast, and Crew for a job well done!', 'sentiment': 'positive', 'label': 1}
{'input_ids': [101, 1045, 2074, 2387, 1000, 9361, 2041, 1000, 2012, 1996, 4407, 2143, 2782, 1012, 2054, 1037, 27547, 5257, 1997, 1037, 2540, 9028, 6562, 9994, 1998, 1037, 2307, 3459, 1012, 2472, 5076, 14263, 2038, 2589, 2019, 5151, 3105, 1997, 150

In [12]:
def tokenize(batch, max_length=300):
    """Tokenize the ``review`` texts of a batch.

    Args:
        batch: Mapping with a ``'review'`` entry holding the review texts,
            as passed by ``Dataset.map(..., batched=True)``.
        max_length: Maximum number of tokens kept per review; longer
            reviews are truncated. Defaults to 300.

    Returns:
        The tokenizer output for the batch, truncated but not padded.
    """
    # Padding is deliberately not applied here. This function is mapped with
    # `batch_size=None`, i.e. over a whole split at once, so `padding=True`
    # would pad every review to the longest one in the split. The Trainer is
    # handed the tokenizer and pads each mini-batch to its own longest
    # sequence at collation time, which is cheaper.
    return tokenizer(batch['review'], truncation=True, max_length=max_length)

# batch_size=None hands each split to `tokenize` as one single batch.
dataset = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map: 100%|██████████| 35000/35000 [00:23<00:00, 1471.09 examples/s]
Map: 100%|██████████| 15000/15000 [00:08<00:00, 1766.17 examples/s]


In [13]:
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

## Model Training and Evaluation

In [14]:
import evaluate
import numpy as np
import sklearn

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids against
        the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model built from the pretrained checkpoint, with one
# output per sentiment class and the label maps passed along to it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Fine-tuning configuration; outputs go to `train_dir`.
# NOTE(review): recent transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version before renaming.
args = TrainingArguments(
    # Output location
    output_dir='train_dir',
    overwrite_output_dir=True,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation cadence
    evaluation_strategy='epoch',
)

# Train on the 'train' split and score the 'test' split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

In [17]:
# NOTE: open this notebook in Colab and run it on a GPU (~150 samples/sec, i.e. roughly 15 minutes of training); on a CPU the run takes hours.
trainer.train()

  8%|▊         | 252/3282 [1:52:23<4:09:48,  4.95s/it]   

KeyboardInterrupt: 

## Save and Load Model for Prediction

In [None]:
trainer.save_model('tinybert-sentiment-analysis')

In [None]:
data = ['this movie was horrible, the plot was really boring. acting was okay',
        'the movie is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

In [None]:
from transformers import pipeline
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned model from the directory written by `trainer.save_model(...)`.
classifier = pipeline(
    'text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

classifier(data)