In [1]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
raw_dataset = load_dataset("quora", trust_remote_code=True)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'is_duplicate'],
        num_rows: 404290
    })
})

In [4]:
class_labels = ClassLabel(num_classes=2, names=["Not Duplicate", "Duplicate"])

In [5]:
def value_to_classlabel(element):
    return {"is_duplicate": class_labels.str2int(element["is_duplicate"])}

In [6]:
features_copy = raw_dataset["train"].features.copy()
features_copy["is_duplicate"] = class_labels

raw_dataset["train"] = raw_dataset["train"].cast(features_copy)
raw_dataset["train"]

Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 404290
})

In [7]:
# # We are basically performing the following operation using the map function since assignment isn't supported:
# # Equivalent to the following:
# raw_dataset["train"]["is_duplicate"] = class_labels.int2str(raw_dataset["train"]["is_duplicate"])

raw_dataset["train"] = raw_dataset["train"].map(value_to_classlabel, batched=True)
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'is_duplicate'],
        num_rows: 404290
    })
})

In [8]:
raw_dataset["train"]["is_duplicate"]

[0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,


In [9]:
# Just verifying the selection
raw_dataset["train"]["questions"][0]["text"][0]

'What is the step by step guide to invest in share market in india?'

In [10]:
# Separating out the question pairs into different lists (to tokenize later)

sentences1 = []
sentences2 = []

for question_pair in raw_dataset["train"]["questions"]:
    sentences1.append(question_pair["text"][0])
    sentences2.append(question_pair["text"][1])

print(sentences1[:3])
print(sentences2[:3])

['What is the step by step guide to invest in share market in india?', 'What is the story of Kohinoor (Koh-i-Noor) Diamond?', 'How can I increase the speed of my internet connection while using a VPN?']
['What is the step by step guide to invest in share market?', 'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?', 'How can Internet speed be increased by hacking through DNS?']


In [11]:
# Without this, tokenization results in the following error:
# ArrowInvalid: Column 2 named input_ids expected length 1000 but got length 283003

raw_dataset["train"] = raw_dataset["train"].add_column("sentences1", sentences1)
raw_dataset["train"] = raw_dataset["train"].add_column("sentences2", sentences2)

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2'],
        num_rows: 404290
    })
})

In [12]:
raw_dataset = raw_dataset["train"].train_test_split(test_size=0.30, shuffle=True, stratify_by_column="is_duplicate", seed=42)

validation_test_split = raw_dataset["test"].train_test_split(test_size=0.50, shuffle=True, stratify_by_column="is_duplicate", seed=42)

raw_dataset["validation"] = validation_test_split["train"]
raw_dataset["test"] = validation_test_split["test"]

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2'],
        num_rows: 283003
    })
    test: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2'],
        num_rows: 60644
    })
    validation: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2'],
        num_rows: 60643
    })
})

In [13]:
def tokenize_function(element):
    return tokenizer(element["sentences1"], element["sentences2"], truncation=True)

In [14]:
%%time
# Adding time to just experiment with different batch sizes and `num_proc`

tokenized_dataset = raw_dataset.map(
                        tokenize_function,
                        batched=True,
                        batch_size=0,  # batch_size=0 corresponds to passing the whole dataset as a batch
                        # num_proc=4,
                    )

tokenized_dataset

CPU times: total: 15.6 ms
Wall time: 308 ms


DatasetDict({
    train: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 283003
    })
    test: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 60644
    })
    validation: Dataset({
        features: ['questions', 'is_duplicate', 'sentences1', 'sentences2', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 60643
    })
})

In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='tf')

In [16]:
tokenized_dataset["train"].features

{'questions': Sequence(feature={'id': Value(dtype='int32', id=None), 'text': Value(dtype='string', id=None)}, length=-1, id=None),
 'is_duplicate': ClassLabel(names=['Not Duplicate', 'Duplicate'], id=None),
 'sentences1': Value(dtype='string', id=None),
 'sentences2': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [17]:
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=["is_duplicate"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8
)

tf_validation_dataset = tokenized_dataset["validation"].to_tf_dataset(
    columns=["input_ids", "token_type_ids", "attention_mask"],
    label_cols=["is_duplicate"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
