## Import data and format 

In [1]:
import warnings
# Suppress all warnings for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

In [3]:
# Remote CSV holding the IMDB reviews and their sentiment labels.
IMDB_CSV_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv'

# Read the CSV straight into a DataFrame, then preview the first rows.
data = pd.read_csv(IMDB_CSV_URL)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Wrap the DataFrame in a Hugging Face Dataset and hold out 30% of the rows for
# evaluation. The split shuffles the rows, so a fixed seed is passed to make the
# train/test membership reproducible across kernel restarts; without it, every
# run trains and evaluates on a different partition.
SPLIT_SEED = 42
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
# Show the column names of each split to confirm both expose the same fields.
dataset.column_names

{'train': ['review', 'sentiment'], 'test': ['review', 'sentiment']}

In [6]:
# Class balance check: count the reviews per sentiment value in the raw frame.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
# Sentiment string -> integer class id.
label2id = dict(negative=0, positive=1)
# Reverse lookup (class id -> sentiment string), derived from label2id so the
# two mappings stay in sync.
id2label = {class_id: name for name, class_id in label2id.items()}
def _attach_label(example):
    """Return the `label` value for one row, looked up from its `sentiment` string."""
    return {'label': label2id[example['sentiment']]}


# Add an integer `label` column to every row of each split.
dataset = dataset.map(_attach_label)


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map: 100%|██████████| 35000/35000 [00:04<00:00, 7744.94 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 7888.07 examples/s]


In [8]:
# Inspect the first training example after the label mapping step.
dataset['train'][0]

{'review': 'First of all, let me start by saying that I have been a devoted follower of C Thomas Howell\'s career ever since "The Outsiders" and "The Hitcher". He was an up and coming star in the 1980s - with hits such as "Soul Man" also. The future was bright for this young actor and he had the potential to go on from there and really assert himself in Hollywood. Put it this way - Tom Cruise had a bit part in "The Outsiders" while Howell had the lead. Look at Cruise today !!! But picking material like this drivel will only denigrate Howell\'s career even more - if that was possible. Why does he pick stuff like this? A small part in a major movie would be of more benefit to him than this rubbish.<br /><br />Essentially the story here takes place in a post-apocalyptic world where everybody lives underground where chaos reigns. Howell is a Shepherd - protecting the flock of various religious leaders by killing off any undesirables. He\'s a hit-man in other words.<br /><br />The sets are 

## Tokenize the inputs

In [9]:
from transformers import AutoTokenizer
import torch

In [10]:
# Pick the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hub identifier of the pretrained checkpoint; the tokenizer is loaded from it
# (fast implementation requested).
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [11]:
# Print one raw training example, followed by the tokenizer's encoding of its review text.
first_example = dataset['train'][0]
print(first_example)
print(tokenizer(first_example['review']))

{'review': 'First of all, let me start by saying that I have been a devoted follower of C Thomas Howell\'s career ever since "The Outsiders" and "The Hitcher". He was an up and coming star in the 1980s - with hits such as "Soul Man" also. The future was bright for this young actor and he had the potential to go on from there and really assert himself in Hollywood. Put it this way - Tom Cruise had a bit part in "The Outsiders" while Howell had the lead. Look at Cruise today !!! But picking material like this drivel will only denigrate Howell\'s career even more - if that was possible. Why does he pick stuff like this? A small part in a major movie would be of more benefit to him than this rubbish.<br /><br />Essentially the story here takes place in a post-apocalyptic world where everybody lives underground where chaos reigns. Howell is a Shepherd - protecting the flock of various religious leaders by killing off any undesirables. He\'s a hit-man in other words.<br /><br />The sets are 

In [12]:
def tokenize(batch):
    """Tokenize a batch of reviews, truncating each to at most 300 tokens.

    Args:
        batch: Mapping with a ``'review'`` entry holding the texts to encode
            (this is what ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch. Sequences are deliberately left
        unpadded: padding here would stretch every example to the longest
        sequence in the batch and store those pad tokens in the dataset.
        Padding is instead applied per mini-batch at training time.
        NOTE(review): this relies on the Trainer being constructed with
        ``tokenizer=tokenizer``, which makes it default to a padding data
        collator — confirm against the installed transformers version.
    """
    return tokenizer(batch['review'], truncation=True, max_length=300)


# With no padding there is no need to push the whole split through as a single
# batch, so the default batched chunking is used, which keeps peak memory lower.
dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 35000/35000 [00:29<00:00, 1199.38 examples/s]
Map: 100%|██████████| 15000/15000 [00:10<00:00, 1385.11 examples/s]


In [13]:
# List the fields on a tokenized example; the tokenizer outputs sit alongside the original columns.
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

## Model Training and Evaluation

In [14]:
import evaluate
import numpy as np
import sklearn

# Load the accuracy metric once at module level so compute_metrics can reuse it.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (model outputs, reference labels).

    Returns:
        The result of ``accuracy.compute`` on the predicted class ids, where the
        predicted class is the index of the largest output along axis 1.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Number of classes plus the two label mappings, forwarded to from_pretrained.
label_config = {
    'num_labels': len(label2id),
    'label2id': label2id,
    'id2label': id2label,
}
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, **label_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# Training configuration: 3 epochs at learning rate 2e-5, batches of 32 per
# device for both training and evaluation, and an evaluation pass every epoch.
# Outputs are written under 'train_dir'; overwrite_output_dir=True allows
# reruns to reuse that directory.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch'
)

# Bundle the model, both data splits and the metric function into a Trainer.
# The held-out 'test' split is used as the evaluation set.
# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [20]:
# Start fine-tuning with the Trainer configured above.
# NOTE: the saved output for this cell shows the run was interrupted manually
# (KeyboardInterrupt) at about 4% progress, so this session produced no
# finished model or evaluation metrics.
trainer.train()

  4%|▍         | 132/3282 [27:55<3:36:31,  4.12s/it]  

KeyboardInterrupt: 