### Overview: https://huggingface.co/docs/datasets/tutorial

#### Quickstart: NLP

In [1]:
from datasets import load_dataset

dataset = load_dataset("glue", "mrpc", split="train")

In [202]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Load the configuration, a sequence-classification model and a tokenizer.
# `config` is only loaded for inspection; it is not passed to the model here.
config = AutoConfig.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# NOTE(review): the tokenizer uses the namespaced id "google-bert/bert-base-uncased"
# while config/model use "bert-base-uncased" — presumably the same checkpoint; confirm.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [205]:
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [206]:
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [207]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [200]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [4]:
tokenizer(dataset[0]["sentence1"],dataset[0]["sentence2"])

{'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
def encode(examples):
    """Tokenize sentence pairs with the module-level ``tokenizer``.

    Reads ``examples["sentence1"]`` and ``examples["sentence2"]`` and returns
    the tokenizer's output, with truncation enabled and padding set to
    ``"max_length"``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    tokenizer_options = {"truncation": True, "padding": "max_length"}
    return tokenizer(first_sentences, second_sentences, **tokenizer_options)

# Tokenize the whole dataset; batched=True hands `encode` batches of examples.
dataset = dataset.map(encode,batched=True)
# Show the keys of the first row together with its label.
dataset[0].keys(), dataset[0]["label"]

(dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']),
 1)

In [6]:
# label: 1 means equivalent, 0 means not-equivalent
dataset = dataset.map(lambda examples: {"labels": examples["label"]}, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [7]:
import torch

# Return torch tensors and expose only the listed columns when indexing.
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
dataset[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [8]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

#### Tutorials

In [9]:
# Inspect the dataset before loading it

from datasets import load_dataset_builder
ds_builder = load_dataset_builder("rotten_tomatoes")

# Only the last expression of a cell is displayed, so the description is
# evaluated but not shown; the features are.
ds_builder.info.description
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [10]:
# Load a dataset after inspecting it 
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes", split="train") 

In [11]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [12]:
from datasets import get_dataset_split_names

get_dataset_split_names("rotten_tomatoes")

['train', 'validation', 'test']

In [13]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [14]:
# Configurations (or subsets) are sub-datasets contained within a dataset.
# Note: certain dataset repositories contain a loading script with the Python code used to generate the dataset.
# You should set trust_remote_code=True to use a dataset with a loading script, or you will get an error.

from datasets import get_dataset_config_names

configs = get_dataset_config_names("PolyAI/minds14",trust_remote_code=True)
print(configs)

['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN', 'all']


In [15]:
from datasets import load_dataset

# mindsFR = load_dataset("PolyAI/minds14", split="train") # ERROR: "ValueError: Config name is missing."
# mindsFR = load_dataset("PolyAI/minds14", "fr-FR", split="train")

In [16]:
from datasets import get_dataset_config_names

configs = get_dataset_config_names("glue")
print(configs)

['ax', 'cola', 'mnli', 'mnli_matched', 'mnli_mismatched', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli']


In [20]:
from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset

# c4 = load_dataset("c4", "en", split="train", trust_remote_code=True)
# The config names are fetched but not displayed (not the last expression of the cell).
get_dataset_config_names("c4", trust_remote_code=True)
# List the splits of the "en" configuration.
get_dataset_split_names("c4", "en", trust_remote_code=True)

['train', 'validation']

#### Knowing your dataset: "Dataset" and "IterableDataset" 

In [22]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes", split="train")
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [25]:
type(dataset[0]), dataset[0]

(dict,
 {'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'label': 1})

In [33]:
# A negative stop index counts from the end: with 8530 rows, [:-8525] keeps the first 5 rows.
len(dataset[:-8525]["text"])

5

In [28]:
dataset[-3:]

{'text': ["hardly a nuanced portrait of a young woman's breakdown , the film nevertheless works up a few scares .",
  'interminably bleak , to say nothing of boring .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [0, 0, 0]}

In [37]:
# Indexing order matters: indexing by column name first extracts every value
# of that column, whereas indexing by row first only reads a single example.

import time

# Row first, then column. perf_counter() is used because time.time() is too
# coarse for such a short interval (it can report 0.0000 seconds).
start_time = time.perf_counter()
text = dataset[0]["text"]
elapsed = time.perf_counter() - start_time
print(dataset[0])  # printed outside the timed region so I/O is not measured
print(f"Elapsed time: {elapsed:.6f} seconds")

# Column first, then row.
start_time = time.perf_counter()
text = dataset["text"][0]
elapsed = time.perf_counter() - start_time
print(len(dataset["text"])) # extracted all the values of "text"
print(f"Elapsed time: {elapsed:.6f} seconds")

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
Elapsed time: 0.0000 seconds
8530
Elapsed time: 0.0151 seconds


In [None]:
# Iterable Dataset: for very big datasets that won't fit on disk or in memory

from datasets import load_dataset

# streaming=True yields an IterableDataset instead of a Dataset.
iterable_dataset = load_dataset("food101", split="train", streaming=True)
iterable_dataset

IterableDataset({
    features: ['image', 'label'],
    num_shards: 8
})

In [41]:
type(iterable_dataset)

datasets.iterable_dataset.IterableDataset

In [42]:
# Print only the first example, then stop iterating.
for example in iterable_dataset:
    print(example)
    break

{'image': <PIL.Image.Image image mode=RGB size=384x512 at 0x154C05FFCA0>, 'label': 6}


In [43]:
# convert dataset to iterabledataset

from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes", split="train")
iterable_dataset = dataset.to_iterable_dataset()

In [44]:
iterable_dataset

IterableDataset({
    features: ['text', 'label'],
    num_shards: 1
})

In [45]:
next(iter(iterable_dataset))

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [48]:
list(iterable_dataset.take(3)) # return subset of the dataset

[{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'label': 1},
 {'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .',
  'label': 1},
 {'text': 'effective but too-tepid biopic', 'label': 1}]

In [49]:
count = sum(1 for _ in iterable_dataset)
print(f"Number of samples in the dataset: {count}")

Number of samples in the dataset: 8530


#### Preprocess: Tokenize text

In [51]:
from transformers import AutoTokenizer
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("rotten_tomatoes", split="train")

In [52]:
tokenizer(dataset[0]["text"])

{'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [53]:
def tokenization(example):
    """Run the module-level ``tokenizer`` on the ``"text"`` field of ``example``."""
    text = example["text"]
    encoded = tokenizer(text)
    return encoded

dataset = dataset.map(tokenization)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

In [54]:
dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8530
})

In [55]:
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8530
})

In [56]:
dataset[0]

{'label': tensor(1),
 'input_ids': tensor([  101,  1996,  2600,  2003, 16036,  2000,  2022,  1996,  7398,  2301,
          1005,  1055,  2047,  1000, 16608,  1000,  1998,  2008,  2002,  1005,
          1055,  2183,  2000,  2191,  1037, 17624,  2130,  3618,  2084,  7779,
         29058,  8625, 13327,  1010,  3744,  1011, 18856, 19513,  3158,  5477,
          4168,  2030,  7112, 16562,  2140,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [57]:
dataset.format["type"]

'torch'

### Datasets: Process

In [70]:
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc", split="train")

In [71]:
len(dataset["label"])

3668

In [72]:
# This bare expression is not the last line of the cell, so it is not displayed.
dataset["label"][:10]
# Sort by the "label" column and keep the result under a new name.
sorted_dataset = dataset.sort("label")
sorted_dataset["label"][:10], sorted_dataset["label"][-10:]

([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [73]:
shuffled_dataset = sorted_dataset.shuffle(seed=42)
shuffled_dataset["label"][:10]

[1, 1, 1, 0, 1, 1, 1, 1, 1, 0]

In [74]:
shuffled_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [67]:
from datasets import Dataset

# Step 1: Create a small toy dataset. It gets its own names so that the
# `dataset` / `shuffled_dataset` variables used by the surrounding cells
# are not overwritten.
toy_data = {
    "text": ["Hello", "World", "This", "Is", "HuggingFace"],
    "label": [0, 1, 0, 1, 0],
}
toy_dataset = Dataset.from_dict(toy_data)

# Step 2: Shuffle the dataset
toy_shuffled_dataset = toy_dataset.shuffle(seed=42)

# Step 3: Check the indices mapping
print("Original dataset data:")
print(toy_dataset)

print("\nShuffled dataset data:")
print(toy_shuffled_dataset)

# Access the private _indices attribute (getattr falls back to None if it is absent)
print("\nOriginal dataset indices mapping:")
print(getattr(toy_dataset, "_indices", None))  # No mapping for the original dataset

print("\nShuffled dataset indices mapping:")
print(getattr(toy_shuffled_dataset, "_indices", None))  # Indices mapping for shuffled dataset

toy_shuffled_dataset["text"]


Original dataset data:
Dataset({
    features: ['text', 'label'],
    num_rows: 5
})

Shuffled dataset data:
Dataset({
    features: ['text', 'label'],
    num_rows: 5
})

Original dataset indices mapping:
None

Shuffled dataset indices mapping:
InMemoryTable
indices: uint64
----
indices: [[4,2,3,1,0]]


In [75]:
iterable_dataset = dataset.to_iterable_dataset(num_shards=128)
iterable_dataset

IterableDataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_shards: 128
})

In [76]:
shuffled_iterable_dataset = iterable_dataset.shuffle(seed=42, buffer_size=1000)
shuffled_iterable_dataset

IterableDataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_shards: 128
})

In [77]:
print(getattr(iterable_dataset, "_indices", None))
print(getattr(shuffled_iterable_dataset, "_indices", None))

None
None


In [78]:
small_dataset = dataset.select([0, 10, 20, 30, 40, 50])
len(small_dataset)

6

In [80]:
small_dataset['idx']

[0, 11, 22, 35, 45, 58]

In [83]:
dataset['idx'][-5:], len(dataset['idx'])

([4071, 4072, 4073, 4074, 4075], 3668)

In [84]:
# Keep only the rows whose first sentence starts with "Ar".
start_with_ar = dataset.filter(lambda example: example["sentence1"].startswith("Ar"))
# Not displayed: only the last expression of the cell is shown.
len(start_with_ar)
start_with_ar["sentence1"]

Filter:   0%|          | 0/3668 [00:00<?, ? examples/s]

['Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 'Arison said Mann may have been one of the pioneers of the world music movement and he had a deep love of Brazilian music .',
 'Arts helped coach the youth on an eighth-grade football team at Lombardi Middle School in Green Bay .',
 'Around 9 : 00 a.m. EDT ( 1300 GMT ) , the euro was at $ 1.1566 against the dollar , up 0.07 percent on the day .',
 "Arguing that the case was an isolated example , Canada has threatened a trade backlash if Tokyo 's ban is not justified on scientific grounds .",
 'Artists are worried the plan would harm those who need help most - performers who have a difficult time lining up shows .']

In [85]:
# with_indices=True passes the row index as a second argument; keep the even indices.
even_dataset = dataset.filter(lambda example, idx: idx % 2 == 0, with_indices=True)
# Not displayed: only the last expression of the cell is shown.
len(even_dataset)
len(dataset) / 2

Filter:   0%|          | 0/3668 [00:00<?, ? examples/s]

1834.0

In [88]:
# Splits are shuffled by default, so fix the seed to make the split reproducible
# (same seed as the other shuffle calls in this notebook).
splitted_dataset = dataset.train_test_split(test_size=0.1, seed=42)
0.1 * len(dataset)

366.8

In [89]:
splitted_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3301
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 367
    })
})

In [101]:
from datasets import load_dataset
dataset = load_dataset("imdb", split="train")
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [92]:
# The shard itself is discarded here; the cell only prints the expected shard size.
dataset.shard(num_shards=4, index=0)
print(25000/4)

6250.0


In [95]:
sharded_dataset = dataset.shard(num_shards=5, index=0)
sharded_dataset, len(dataset)

(Dataset({
     features: ['text', 'label'],
     num_rows: 5000
 }),
 25000)

In [97]:
print(dataset)
dataset1 = dataset.rename_column("text", "sentenceA")
dataset1

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


Dataset({
    features: ['sentenceA', 'label'],
    num_rows: 25000
})

In [102]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [103]:
# Bind the result to a new name instead of overwriting `dataset`: re-running
# the cell would otherwise try to remove a column that is already gone.
dataset_without_label = dataset.remove_columns("label")
dataset_without_label

Dataset({
    features: ['text'],
    num_rows: 25000
})

In [112]:
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc", split="train")

In [109]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [110]:
# Keep `dataset` intact: the following cells read `dataset.features` and cast
# the "label" column, which would fail if `dataset` only contained "idx".
print(dataset)
dataset_without_label = dataset.select_columns(['sentence1', 'sentence2', 'idx'])
print(dataset_without_label)
dataset_idx_only = dataset_without_label.select_columns('idx')
print(dataset_idx_only)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
Dataset({
    features: ['sentence1', 'sentence2', 'idx'],
    num_rows: 3668
})
Dataset({
    features: ['idx'],
    num_rows: 3668
})


In [113]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [114]:
dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [115]:
from datasets import ClassLabel, Value
new_features = dataset.features.copy()
new_features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [116]:
new_features["label"] = ClassLabel(names=["negative", "positive"])
new_features["idx"] = Value("int64")
dataset = dataset.cast(new_features)
dataset.features

Casting the dataset:   0%|          | 0/3668 [00:00<?, ? examples/s]

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int64', id=None)}

In [121]:
dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int64', id=None)}

In [124]:
from datasets import load_dataset
dataset = load_dataset("squad", split="train")
dataset.features

{'id': Value(dtype='string', id=None),
 'title': Value(dtype='string', id=None),
 'context': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}

In [125]:
flat_dataset = dataset.flatten()
flat_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers.text', 'answers.answer_start'],
    num_rows: 87599
})

- `map()`: its primary purpose is to speed up processing functions by applying them to every example of a dataset, individually or in batches.

In [127]:
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc", split="train")
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [128]:
def add_prefix(example):
    """Prepend ``'My sentence: '`` to ``example["sentence1"]``.

    The example is updated in place and the same object is returned.
    """
    prefixed_sentence = "My sentence: " + example["sentence1"]
    example["sentence1"] = prefixed_sentence
    return example

In [129]:
# NOTE(review): this maps over `small_dataset` (defined in an earlier cell), not the `dataset` loaded just above — confirm this is intended.
updated_dataset = small_dataset.map(add_prefix)
updated_dataset["sentence1"][:5]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

['My sentence: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'My sentence: Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .',
 'My sentence: Trading in Loral was halted yesterday ; the shares closed on Monday at $ 3.01 .',
 'My sentence: The group will be headed by State Department official John S. Wolf , who has served in Australia , Vietnam , Greece and Pakistan .']

In [131]:
updated_dataset["sentence2"][:5]

['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .',
 'The New York Stock Exchange suspended trading yesterday in Loral , which closed at $ 3.01 Friday .',
 'The group will be headed by John S. Wolf , an assistant secretary of state who has served in Australia , Vietnam , Greece and Pakistan .']

In [132]:
updated_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 6
})

In [133]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [141]:
updated_dataset = dataset.map(lambda example: {"new_sentence": example["sentence1"]}, remove_columns=["sentence1"])
updated_dataset.column_names

['sentence2', 'label', 'idx', 'new_sentence']

In [142]:
updated_dataset

Dataset({
    features: ['sentence2', 'label', 'idx', 'new_sentence'],
    num_rows: 3668
})

In [143]:
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [144]:
updated_dataset = dataset.map(lambda example, idx: {"sentence2": f"{idx}: " + example["sentence2"]}, with_indices=True)
updated_dataset["sentence2"][:5]

['0: Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 "1: Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
 "2: On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
 '3: Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .',
 '4: PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .']

In [145]:
updated_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [146]:
updated_dataset['sentence1'][:5]

['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']

In [147]:
updated_dataset = dataset.map(lambda example, idx: {"sentence1": f"{idx}: " + example["sentence1"]}, with_indices=True, num_proc=4)
updated_dataset['sentence1'][:5]

Map (num_proc=4):   0%|          | 0/3668 [00:00<?, ? examples/s]

['0: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "1: Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 '2: They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 '3: Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 '4: The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']

In [148]:
updated_dataset = dataset.map(lambda example, idx: {"sentence1": f"{idx}: " + example["sentence1"]}, with_indices=True)
updated_dataset['sentence1'][:5]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

['0: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 "1: Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 '2: They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .',
 '3: Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .',
 '4: The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .']

In [164]:
from datasets import load_dataset

dataset = load_dataset('glue', 'mrpc')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [166]:
encoded_dataset = dataset.map(lambda examples: tokenizer(examples["sentence1"]), batched=True)
# encoded_dataset["train"][0]
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [169]:
train_split = load_dataset('glue', 'mrpc', split="train")
train_split

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [170]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes", split="train")
batched_dataset = dataset.batch(batch_size=4)
batched_dataset

Batching examples:   0%|          | 0/8530 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 2133
})

In [171]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [177]:
dataset[0], len(dataset)

({'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'label': 1},
 8530)

In [178]:
batched_dataset[0], len(batched_dataset)

({'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
   'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .',
   'effective but too-tepid biopic',
   'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'],
  'label': [1, 1, 1, 1]},
 2133)

In [182]:
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc", split="train")
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [184]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def encode(examples):
    """Tokenize sentence pairs with the module-level ``tokenizer``.

    Reads ``examples["sentence1"]`` and ``examples["sentence2"]`` and returns
    the tokenizer's output, with truncation enabled and padding set to
    ``"max_length"``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    tokenizer_options = {"truncation": True, "padding": "max_length"}
    return tokenizer(first_sentences, second_sentences, **tokenizer_options)

dataset = dataset.map(encode,batched=True)
dataset[0].keys(), dataset[0]["label"]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']),
 1)

In [185]:
import torch
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [188]:
dataset_new = dataset.with_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])

In [196]:
dataset.format, dataset_new.format

({'type': 'torch',
  'format_kwargs': {},
  'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
  'output_all_columns': False},
 {'type': 'torch',
  'format_kwargs': {},
  'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
  'output_all_columns': False})

In [197]:
# dataset.map( ): Eagerly applies a transformation to every example and returns a new dataset containing the results; the dataset it is called on is left unchanged, so the result must be assigned to a variable.
# dataset.set_transform( ): Lazily applies a transformation on-the-fly whenever the dataset is accessed, without modifying the underlying data. Saves memory and computation because the transformed data is never stored.