In [1]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this may not be the same interpreter the kernel runs on — confirm with sys.version if it matters.
!python --version


Python 3.9.21


In [2]:
# Trivial expression; presumably a quick check that the kernel is executing cells.
2+2

4

In [9]:
import sys
# Invoke pip through the kernel's own interpreter (sys.executable) so the
# package is installed into the environment this notebook is running in.
!{sys.executable} -m pip install datasets



In [2]:
from datasets import load_dataset

# Load the MRPC configuration of the GLUE benchmark (sentence pairs with an
# equivalent / not_equivalent label) and display the available splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
# Select the training split and display its first example
# (a dict with the two sentences, the integer label and the original index).
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [4]:
# Look at element 15 of the training set and element 87 of the validation set. What are their labels?
label_feature = raw_datasets["train"].features["label"]
for split_name, position in (("train", 15), ("validation", 87)):
    label_id = raw_datasets[split_name][position]["label"]
    # Only the last expression of a cell is displayed, so each answer is printed explicitly;
    # int2str maps the integer label to its class name.
    print(f"{split_name}[{position}]: label={label_id} ({label_feature.int2str(label_id)})")

# The features describe each column; the ClassLabel entry lists the label names in id order.
raw_datasets["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [5]:
# Number of training examples. Asking the split for its length avoids reading
# the whole "sentence1" column just to count its entries.
len(raw_datasets["train"])

3668

In [7]:
# Exercise prompt kept as a bare string literal, so the cell simply echoes it as its output.
"""Take element 15 of the training set and tokenize the two sentences separately and as a pair. What’s the difference between the two results?"""

'Take element 15 of the training set and tokenize the two sentences separately and as a pair. What’s the difference between the two results?'

In [8]:
# Create a tokenizer from the 'bert-base-uncased' checkpoint.
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
# Fetch training example 15 once, show it, then tokenize its two sentences as a pair.
mrpc_example = raw_datasets['train'][15]
print(mrpc_example)
inputs = tokenizer(mrpc_example['sentence1'], mrpc_example['sentence2'])

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .', 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .', 'label': 0, 'idx': 16}


In [14]:
# Map the ids of the tokenized pair back to their token strings to see how the pair was assembled.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
print(tokens)
# token_type_ids mark which sentence each token belongs to: in the output below,
# 0 covers [CLS] through the first [SEP], and 1 covers the second sentence and its [SEP].
print(inputs['token_type_ids'])

['[CLS]', 'rudder', 'was', 'most', 'recently', 'senior', 'vice', 'president', 'for', 'the', 'developer', '&', 'platform', 'evan', '##gel', '##ism', 'business', '.', '[SEP]', 'senior', 'vice', 'president', 'eric', 'rudder', ',', 'formerly', 'head', 'of', 'the', 'developer', 'and', 'platform', 'evan', '##gel', '##ism', 'unit', ',', 'will', 'lead', 'the', 'new', 'entity', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [15]:
# Calling the tokenizer directly on entire columns of the dataset would work, but it returns a plain
# dictionary (with keys input_ids, attention_mask and token_type_ids, and values that are lists of lists),
# and it only works if there is enough RAM to hold the whole dataset during tokenization.
# Dataset.map() is used instead, so the tokenized columns are added to the dataset itself.

def tokenize_function(example):
    """Tokenize the sentence1/sentence2 pair(s) in `example`, truncating over-long inputs.

    With `batched=True` in the map call below, `example` is a batch, so each
    column holds a list of values rather than a single value.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)



Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [16]:
# Notice that new fields (input_ids, token_type_ids, attention_mask) have been added
# to every split of tokenized_datasets, alongside the original raw columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [18]:
# Dynamic padding: pad each batch only up to the longest sequence in that batch.
# Take the first 8 training examples and drop the columns that are not model inputs.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
print(list(map(len, samples["input_ids"])))  # max len is 67 for this batch


[50, 59, 47, 67, 59, 50, 62, 32]


In [21]:
from transformers import DataCollatorWithPadding

# The collator pads every sequence in the batch to the batch's longest length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
print([tensor.shape for tensor in batch.values()])

[torch.Size([8, 67]), torch.Size([8, 67]), torch.Size([8, 67]), torch.Size([8])]


In [27]:
# Exercise prompt kept as a bare string literal, so the cell simply echoes it as its output.
"Replicate the preprocessing on the GLUE SST-2 dataset. It’s a little bit different since it’s composed of single sentences instead of pairs."

'Replicate the preprocessing on the GLUE SST-2 dataset. It’s a little bit different since it’s composed of single sentences instead of pairs.'

In [23]:

from datasets import load_dataset

# Load the SST-2 configuration of GLUE (single sentences rather than pairs) and display its splits.
raw_datasets_sst = load_dataset(path="glue", name="sst2")
raw_datasets_sst

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [24]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer_sst = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_sst_function(batch):
    """Tokenize the single-sentence SST-2 examples in `batch`, truncating over-long inputs.

    The function has its own name so that it does not silently replace the
    MRPC `tokenize_function` defined earlier in the notebook.

    Args:
        batch: A batch of examples as passed by `map(..., batched=True)`;
            `batch['sentence']` holds the sentences to tokenize.

    Returns:
        The tokenizer output for the batch, which `map` adds to the dataset as new columns.
    """
    return tokenizer_sst(batch['sentence'], truncation=True)


tokenized_sst_dataset = raw_datasets_sst.map(tokenize_sst_function, batched=True)
   

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [26]:
# Show the columns before clean-up.
print(tokenized_sst_dataset)
# Drop the columns the model does not consume and rename the target column to 'labels'.
tokenized_sst_dataset = tokenized_sst_dataset.remove_columns(['sentence','idx'])
tokenized_sst_dataset = tokenized_sst_dataset.rename_column('label','labels')
# with_format() returns a new object instead of modifying the dataset in place,
# so the result has to be assigned back for the torch format to take effect.
tokenized_sst_dataset = tokenized_sst_dataset.with_format('torch')
# Show the columns after clean-up.
print(tokenized_sst_dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})


In [29]:
# Use a data collator to pad the first 15 training examples dynamically,
# i.e. only up to the longest sequence among them.
from transformers import DataCollatorWithPadding

data_collator_sst = DataCollatorWithPadding(tokenizer=tokenizer_sst)
first_sst_examples = tokenized_sst_dataset['train'][:15]
batched_padded_samples = data_collator_sst(first_sst_examples)
print([tensor.shape for tensor in batched_padded_samples.values()])


[torch.Size([15]), torch.Size([15, 29]), torch.Size([15, 29]), torch.Size([15, 29])]


In [36]:
from torch.utils.data import DataLoader

# Shuffled batches of 16; the collator pads each batch to its own longest sequence.
train_dataloader = DataLoader(
    tokenized_sst_dataset['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator_sst,
)

# Inspect only the first six batches (steps 0..5). zip() stops as soon as the
# range is exhausted, so no further batch is drawn from the loader.
for step, batch in zip(range(6), train_dataloader):
    print(f"{step},{batch['input_ids'].shape}")


    

0,torch.Size([16, 46])
1,torch.Size([16, 35])
2,torch.Size([16, 34])
3,torch.Size([16, 27])
4,torch.Size([16, 46])
5,torch.Size([16, 48])
