In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torchvision.transforms as transforms
import torch

In [2]:
test_data = load_dataset('imdb',split=['test'])[0]

In [3]:
train_data = load_dataset('imdb',split=['train'])[0]

In [4]:
#alternative way

In [5]:
splits = ['train','test']

In [6]:
ds = {split:ds for split,ds in zip(splits,load_dataset('imdb',split=splits))}

In [7]:
ds

{'train': Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 'test': Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 })}

In [8]:
ds_train = ds['train'].shuffle(seed=42).select(range(500))

In [9]:
ds_train

Dataset({
    features: ['text', 'label'],
    num_rows: 500
})

In [10]:
token = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [11]:
check = token(ds_train[:5]['text'])

Token indices sequence length is longer than the specified maximum sequence length for this model (936 > 512). Running this sequence through the model will result in indexing errors


In [12]:
token(ds_train[0]['text'],truncation=True,padding='max_length')

{'input_ids': [101, 2045, 2003, 2053, 7189, 2012, 2035, 2090, 3481, 3771, 1998, 6337, 2099, 2021, 1996, 2755, 2008, 2119, 2024, 2610, 2186, 2055, 6355, 6997, 1012, 6337, 2099, 3504, 15594, 2100, 1010, 3481, 3771, 3504, 4438, 1012, 6337, 2099, 14811, 2024, 3243, 3722, 1012, 3481, 3771, 1005, 1055, 5436, 2024, 2521, 2062, 8552, 1012, 1012, 1012, 3481, 3771, 3504, 2062, 2066, 3539, 8343, 1010, 2065, 2057, 2031, 2000, 3962, 12319, 1012, 1012, 1012, 1996, 2364, 2839, 2003, 5410, 1998, 6881, 2080, 1010, 2021, 2031, 1000, 17936, 6767, 7054, 3401, 1000, 1012, 2111, 2066, 2000, 12826, 1010, 2000, 3648, 1010, 2000, 16157, 1012, 2129, 2055, 2074, 9107, 1029, 6057, 2518, 2205, 1010, 2111, 3015, 3481, 3771, 3504, 2137, 2021, 1010, 2006, 1996, 2060, 2192, 1010, 9177, 2027, 9544, 2137, 2186, 1006, 999, 999, 999, 1007, 1012, 2672, 2009, 1005, 1055, 1996, 2653, 1010, 2030, 1996, 4382, 1010, 2021, 1045, 2228, 2023, 2186, 2003, 2062, 2394, 2084, 2137, 1012, 2011, 1996, 2126, 1010, 1996, 5889, 2024, 2428,

In [13]:
text = ds_train[0]['text']

In [14]:
def pre_process(example):
    return token(example['text'],truncation=True,padding='max_length')

In [16]:
token_d = {}
for split in splits:
    # Optionally, remove the index column if it's not needed
    # Apply the pre_process function to each example in the split, batched for efficiency
    token_d[split] = ds[split].shuffle().select(range(50)).map(pre_process, batched=True)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [17]:
token_d['train'][0]['text']

'While I don\'t agree with Bob\'s and Tammy\'s decision to give up baby Jesse, and it\'s something I\'d never do, they were trying to do what was best for the baby. The way this movie is written, you see yourself becoming wrapped up in the story and asking yourself what you really believe, from all different aspects. Patty Duke? Antagonist? Almost unheard of, as far as I\'m concerned. But during the movie, she really convinces you that she\'s psychotic, or at least, that there\'s something seriously wrong with her. Her character is the meaning of "emotionally disturbed." The movie seems to end quickly, leaving things somewhat unresolved. But other than that, this movie is really great. It really makes you think. It\'s not a movie to watch when you just want to kick back and relax and watch something cute that\'ll make you laugh. But it is a good movie to see when you want to challenge your own beliefs, see things from others\' perspectives, and discover a little something about yoursel

In [18]:
from transformers import AutoModelForSequenceClassification as am

In [19]:
from transformers import DataCollatorWithPadding,Trainer,TrainingArguments

In [20]:
model = am.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    id2label={0:'NEGATIVE', 1:'POSITIVE'},
    label2id={'NEGATIVE':0,'POSITIVE':1})

for param in model.base_model.parameters():
    param.requires_grad=False


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
model.classifier

Linear(in_features=768, out_features=2, bias=True)

In [22]:
import numpy as np

In [23]:
def compute_metrics(eval):
    pred,evals=eval

In [24]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir='./data/sa',
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=1,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True),
    train_dataset=token_d['train'],
    eval_dataset=token_d['test'],
    tokenizer=token,
    data_collator=DataCollatorWithPadding(tokenizer=token))   

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [25]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msingh-gagandeep1103[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.684784


Checkpoint destination directory ./data/sa/checkpoint-13 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=13, training_loss=0.6878298245943509, metrics={'train_runtime': 65.8898, 'train_samples_per_second': 0.759, 'train_steps_per_second': 0.197, 'total_flos': 6623369932800.0, 'train_loss': 0.6878298245943509, 'epoch': 1.0})

In [26]:
import pandas as pd

In [27]:
df = pd.DataFrame(token_d['test'])

In [28]:
df = df[['text','label']]

In [29]:
predictions = trainer.predict(token_d['test'])

In [30]:
df['pred'] = np.argmax(predictions[0],axis=1)

In [31]:
dp = df[df['pred']==df['label']]

In [32]:
dp

Unnamed: 0,text,label,pred
5,This movie begins with a man who appears to be...,0,0
7,"Yes,the movie is not a piece of art but the fi...",0,0
8,Cage plays a drunk and gets high critically pr...,0,0
9,Although it's been hailed as a comedy-drama I ...,0,0
12,"Oh God, Why? I am aghast at the sheer ineptitu...",0,0
13,"Alright, how someone can actually think this m...",0,0
14,I'm not sure what the director and editor were...,0,0
15,"In his feature film debut `Yellow,' Chris Chan...",0,0
16,Thomas Edison had no other reason to make this...,0,0
20,Story of the creation of Underdog and adventur...,0,0
