# Visualize your 🤗 Hugging Face data
#### 🛠️ Installation and set-up

In [1]:
import os
import pandas as pd
import json
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer
import torch
from datasets import load_from_disk, load_metric, Dataset, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### 🛫 Data and model preparation
#### 🏷️ Loading a dataset

In [2]:
# load
datadir = '/work3/s174498/sst2_dataset/'

from_disk = True
all = False

if from_disk:
    test_dataset = load_from_disk(datadir + 'test_dataset')
    if all:
        train_dataset = load_from_disk(datadir + 'train_dataset')
        validation_dataset = load_from_disk(datadir + 'validation_dataset')
else:
    if all:
        dataset = load_dataset("sst2")
        train_dataset = dataset['train']
        validation_dataset = dataset['validation']
        test_dataset = dataset['test']

For demo sub-sample dataset 

In [3]:
#small_data_train = dataset['train'].select(range(dataset['train'].num_rows // 10))
# alternative methods
# dataset["train"].shuffle(seed=42).select([i for i in list(range(100))])
#small_data_val = dataset['validation'].select(range(dataset['validation'].num_rows // 10)) # dataset["validation"].shuffle(seed=42).select([i for i in list(range(50))])

### ⚙️ Tokenizing the dataset
In a typical NLP workflow, we must first tokenize our dataset.

Converting the stream of characters in the text into a stream of defined "tokens", which can be anything from a smaller set of characters to words from a vocabulary.

We will use a pretrained model, so we inherit its tokenization scheme.

Wanting to see all files on RoBERTa e.g. tokenization https://huggingface.co/roberta-base/tree/main 

**Merge**-file explanation https://github.com/huggingface/transformers/issues/4777 

### Files used for the RoBERTa pre-trained Tokenizer 

In [4]:
ellen_little_nb_path = '/zhome/94/5/127021/speciale/master_project/notebooks'
roberta_files_path = '/work3/s174498/roberta_files/'
print('Current wokring directory',os.getcwd())


Current wokring directory /zhome/94/5/127021/speciale/master_project/notebooks


In [5]:
# load
checkpoint = '/work3/s174498/finetuning-sentiment-model-all-samples-test6/checkpoint-1000'

# tokenizer
tokenizer_checkpoint = RobertaTokenizer.from_pretrained(checkpoint) 
tokenizer_pretrained = RobertaTokenizer.from_pretrained('roberta-base')
# model
model = RobertaForSequenceClassification.from_pretrained(checkpoint,output_hidden_states = True, output_attentions = True, return_dict = True)

In [6]:
# merges files
df_merges = pd.read_csv("/work3/s174498/roberta_files/merges.txt", sep=" ",  on_bad_lines='skip')

# dict file
df_dict = pd.read_csv("/work3/s174498/roberta_files/dict.txt", sep=" ", header = None, names = ['id_GPT_2','occurrence'])

# tokenizer file
file = open('/work3/s174498/roberta_files/tokenizer.json')
tokenizer_json = json.load(file)
file.close()

# vocab file
file = open('/work3/s174498/roberta_files/vocab.json')
vocab = json.load(file)
file.close()

In [7]:
df_dict.head()

Unnamed: 0,id_GPT_2,occurrence
0,13,850314647
1,262,800385005
2,11,800251374
3,284,432911125
4,290,394899794


**Vocab** consists of 50265 'units'

In [8]:
print('vocab length:', len(vocab.keys()))
print('the 10 first:',list(vocab.keys())[:10])

vocab length: 50265
the 10 first: ['<s>', '<pad>', '</s>', '<unk>', '.', 'Ġthe', ',', 'Ġto', 'Ġand', 'Ġof']


**Tokenizer** has a lot of information about model, and which setting are chosen and the vocab can be found here

In [9]:
tokenizer_json['version']
#tokenizer['model']
#tokenizer['model']['vocab']

'1.0'

### How these files are used  

In [10]:
text = 'A day is just better with Lavazza coffee. You agree?'
print('length of text:',len(text))
print(text)

length of text: 52
A day is just better with Lavazza coffee. You agree?


**1.** First step in the tokenizer is to tokenize according to the merges-file:

In [11]:
print('number of tokens:', len(tokenizer_pretrained.tokenize(text)))
print('tokens from pre-trained:\n',tokenizer_pretrained.tokenize(text))
print('tokens from checkpoint:\n',tokenizer_checkpoint.tokenize(text))

number of tokens: 14
tokens from pre-trained:
 ['A', 'Ġday', 'Ġis', 'Ġjust', 'Ġbetter', 'Ġwith', 'ĠLav', 'az', 'za', 'Ġcoffee', '.', 'ĠYou', 'Ġagree', '?']
tokens from checkpoint:
 ['A', 'Ġday', 'Ġis', 'Ġjust', 'Ġbetter', 'Ġwith', 'ĠLav', 'az', 'za', 'Ġcoffee', '.', 'ĠYou', 'Ġagree', '?']


*Ġ is the rep. for space*

**2.** Second step is to replace these tokens with their corresponding indices, using the vocab-file

In [12]:
print('number of idx:',len(tokenizer_pretrained.encode(text)))
print('indices from pre-trained:\n',tokenizer_pretrained.encode(text))
print('indices from checkpoint:\n',tokenizer_checkpoint.encode(text))

number of idx: 16
indices from pre-trained:
 [0, 250, 183, 16, 95, 357, 19, 18126, 1222, 2478, 3895, 4, 370, 2854, 116, 2]
indices from checkpoint:
 [0, 250, 183, 16, 95, 357, 19, 18126, 1222, 2478, 3895, 4, 370, 2854, 116, 2]


Where the following for end and start of sentence is used 
* sep : ['< /s>', 2] (last token of a sequence built with special tokens)
* cls : ['< s>', 0] (fisrt token of a sequence built with special tokens)

Having the indices we can decode back to original text:

In [13]:
print('decode from pre-trained:\n',tokenizer_pretrained.decode(tokenizer_pretrained.encode(text)))
print('decode from checkpoint:\n',tokenizer_checkpoint.decode(tokenizer_checkpoint.encode(text)))

decode from pre-trained:
 <s>A day is just better with Lavazza coffee. You agree?</s>
decode from checkpoint:
 <s>A day is just better with Lavazza coffee. You agree?</s>


The dict.txt file is the connection between GPT-2 vocab and RoBERTa vocab. 

* Where the **row-idx+4 is the index in RoBERTa** - the 4 is from the 4 special tokens (see below).
* And the column 'index' is the **index from GPT-2.**
* The column 'occurencies' gives the number of times the **index/token appears**' in the training set. 

The GPT-2 vocab is remapped with the RoBERTa vocab and the first four values are the special tokens:

In [14]:
# special tokens
{"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

{'<s>': 0, '<pad>': 1, '</s>': 2, '<unk>': 3}

#### Settings for Tokenizer
Tokenizer.json gives all settings for the Tokenizer.

As examples are that it gives which special tokens are added and their corresponding id

In [15]:
nr_add_tokens = len(tokenizer_json['added_tokens'])
for i in range(nr_add_tokens):
    print(tokenizer_json['added_tokens'][i])

{'id': 0, 'special': True, 'content': '<s>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': True}
{'id': 1, 'special': True, 'content': '<pad>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': True}
{'id': 2, 'special': True, 'content': '</s>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': True}
{'id': 3, 'special': True, 'content': '<unk>', 'single_word': False, 'lstrip': False, 'rstrip': False, 'normalized': True}
{'id': 50264, 'special': True, 'content': '<mask>', 'single_word': False, 'lstrip': True, 'rstrip': False, 'normalized': True}


Where **cls** is the classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification)

and **sep** is the separator token, which is used when building a sequence from multiple sequences.

In [16]:
tokenizer_json['post_processor']

{'type': 'RobertaProcessing',
 'sep': ['</s>', 2],
 'cls': ['<s>', 0],
 'trim_offsets': True,
 'add_prefix_space': False}

#### Some of the inputs to the Tokenizer

* **bos_token** (str, optional, defaults to "< s>") — The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

* **eos_token** (str, optional, defaults to "< /s>") — The end of sequence token.

* **sep_token** (str, optional, defaults to "< /s>") — The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for sequence classification or for a text and a question for question answering. It is also used as the last token of a sequence built with special tokens.

* **cls_token** (str, optional, defaults to "< s>") — The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens.

* **unk_token** (str, optional, defaults to "< unk>") — The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead.

* **pad_token** (str, optional, defaults to "< pad>") — The token used for padding, for example when batching sequences of different lengths.

* **mask_token** (str, optional, defaults to "< mask>") — The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict.

# Prediction 
A little bit of prediction 

In [17]:
# Prepare the text inputs for the model
def preprocess_function(examples):
    return tokenizer_checkpoint(examples["sentence"], truncation=True)

tokenized_test = test_dataset.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]


PermissionError: [Errno 13] Permission denied: '/work3/s174498/sst2_dataset/test_dataset/tmpc5u0wdx7'

In [20]:
trainer = Trainer(
    model=model,                        
    tokenizer=tokenizer_checkpoint
)

In [21]:
# Predicting with model
predictions = trainer.predict(test_dataset)
#dataset_test_pred = list(np.argmax(predictions.predictions, axis=-1))

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: idx, sentence. If idx, sentence are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']

In [None]:
predictions

In [None]:
accuracy_metric = load_metric("accuracy")
accuracy_metric.compute(predictions=dataset_test_pred, references=test_dataset['label'])

#### Some other text examples

In [21]:
test_text = ["Hello World, Hello World, and hello world differs.", "It tokenize Danish words and rare english words such as 'speciale skrivning' and Obelus, Nudiustertian, Nikehedonia and Metanoia"]
test_text2 = ['in store and dog',' in store ','in store ',' in store','in store?','in store.','in store .']

In [24]:
text = test_text
for i in range(len(text)):
    ids = tokenizer(text[i], truncation=True)['input_ids']
    
    print('The tokenizer first tokenizes according to the merges file:\n',tokenizer.tokenize(text[i]))
    print('And then(according to the values in the vocab.json)\nthese tokens are then replaced by their indices:\n',ids)

    print(tokenizer.decode(ids))
    print(tokenizer.convert_ids_to_tokens(ids))

#tokenizer(test_text2, truncation = True)


The tokenizer first tokenizes according to the merges file:
 ['Hello', 'ĠWorld', ',', 'ĠHello', 'ĠWorld', ',', 'Ġand', 'Ġhello', 'Ġworld', 'Ġdiffers', '.']
And then(according to the values in the vocab.json)
these tokens are then replaced by their indices:
 [0, 31414, 623, 6, 20920, 623, 6, 8, 20760, 232, 31381, 4, 2]
<s>Hello World, Hello World, and hello world differs.</s>
['<s>', 'Hello', 'ĠWorld', ',', 'ĠHello', 'ĠWorld', ',', 'Ġand', 'Ġhello', 'Ġworld', 'Ġdiffers', '.', '</s>']
The tokenizer first tokenizes according to the merges file:
 ['It', 'Ġtoken', 'ize', 'ĠDanish', 'Ġwords', 'Ġand', 'Ġrare', 'Ġenglish', 'Ġwords', 'Ġsuch', 'Ġas', "Ġ'", 'special', 'e', 'Ġsk', 'riv', 'ning', "'", 'Ġand', 'ĠOb', 'el', 'us', ',', 'ĠN', 'udi', 'ust', 'ert', 'ian', ',', 'ĠNike', 'hed', 'onia', 'Ġand', 'ĠMet', 'anoia']
And then(according to the values in the vocab.json)
these tokens are then replaced by their indices:
 [0, 243, 19233, 2072, 13501, 1617, 8, 3159, 47510, 1617, 215, 25, 128, 19423, 24

In [None]:
# input_ids: the token indices
# attention_mask: exactly ehat it says - a 0 or 1 array that tells the model which tokens should be attended to and which should not

# The truncation argument controls truncation. It can be a boolean or a string:
# True or 'longest_first': 
# truncate to a maximum length specified by the max_length argument or the maximum length accepted by the model if no max_length is provided (max_length=None). 
# This will truncate token by token, removing a token from the longest sequence in the pair until the proper length is reached.

In [19]:
tokenizer(small_data_train['sentence'][:5])#, truncation = True)

{'input_ids': [[0, 37265, 92, 3556, 2485, 31, 5, 20536, 2833, 1437, 2], [0, 10800, 5069, 117, 22094, 2156, 129, 6348, 3995, 821, 8299, 1437, 2], [0, 6025, 6138, 63, 3768, 8, 39906, 402, 1195, 2721, 59, 1050, 2574, 1437, 2], [0, 5593, 5069, 19223, 10028, 7, 1091, 5, 276, 1328, 1437, 2], [0, 261, 5, 2373, 13543, 12, 1116, 12, 627, 12, 1396, 11622, 43848, 5739, 5, 17504, 115, 31120, 1899, 62, 1437, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [33]:
len(tokenizer_pretrained(test_dataset['sentence'], padding=True,truncation = True)['input_ids'][0])

55

We then map the tokenizer over our dataset:

In [3]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

dataset = load_dataset("sst2")

small_train_dataset = dataset["train"].shuffle(seed=42).select([i for i in list(range(100))])
small_val_dataset = dataset["validation"].shuffle(seed=42).select([i for i in list(range(50))])

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_val = small_val_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.
Downloading and preparing dataset sst2/default (download: 7.09 MiB, generated: 4.78 MiB, post-processed: Unknown size, total: 11.88 MiB) to /zhome/94/5/127021/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5...


OSError: [Errno 28] No space left on device: '/zhome/94/5/127021/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5.incomplete'