# Testing

In [38]:
from pathlib import Path

In [39]:
paths = [str(x) for x in Path('./test_data/part_of_data_processed').glob('*.txt')]

In [40]:
paths

['test_data\\part_of_data_processed\\0_test_processed.txt',
 'test_data\\part_of_data_processed\\1_test_processed.txt']

In [41]:
from transformers import RobertaTokenizerFast
tokenizer_srberta = RobertaTokenizerFast.from_pretrained("srberta_tokenizer_new")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [42]:
import torch

def mlm(tensor):
    
    rand = torch.rand(tensor.shape) #[0,1]
    mask_arr = (rand < 0.15)* (tensor!=0)* (tensor!=1)* (tensor!=2)
    for i in range(tensor.shape[0]):
        selection = torch.flatten(mask_arr[i].nonzero()).tolist()
        tensor[i, selection] = 4
        
    return tensor

In [43]:
from tqdm.auto import tqdm
import os

input_ids = []
mask = [] # attention mask
labels = []

for path in tqdm(paths[:1]):

    with open(path, 'r', encoding='utf-8') as f: 
        lines = f.read().split('\n')
        
    sample = tokenizer_srberta(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')

    labels.append(sample.input_ids)
    mask.append(sample.attention_mask)
    input_ids.append(mlm(sample.input_ids.detach().clone()))

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.37it/s]


In [44]:
sample['input_ids'].shape

torch.Size([107, 512])

In [45]:
input_ids[0].shape

torch.Size([107, 512])

This means that no matter how big one txt sample in dataset['train'][idx] is, it will get truncated to size 512!

### Concat tensors

In [46]:
input_ids = torch.cat(input_ids)

In [47]:
mask = torch.cat(mask)

In [48]:
labels = torch.cat(labels)

## Save tensors to disk

In [49]:
import torch

In [14]:
torch.save(input_ids, './test_srberta_books/input_ids_test.pt')

In [15]:
torch.save(mask, './test_srberta_books/mask_test.pt')

In [16]:
torch.save(labels, './test_srberta_books/labels_test.pt')

In [2]:
input_ids = torch.load("input_ids.pt")
mask = torch.load("mask.pt")
labels = torch.load("labels.pt")

FileNotFoundError: [Errno 2] No such file or directory: 'input_ids.pt'

### Test masked language modeling

In [50]:
input_ids[0][:10]

tensor([    0,  9432,     4,   515,   287,  5824,  1700,   335, 18691,     4])

In [51]:
input_ids.shape

torch.Size([107, 512])

## Testing loop

In [52]:
len(mask)

107

In [53]:
import torch

encodings = {
    'input_ids': input_ids,
    'attention_mask': mask,
    'labels': labels
}

class Dataset(torch.utils.data.Dataset):
    
    def __init__(self, encodings):
        self.encodings = encodings
        
    def __len__(self):
        return self.encodings['input_ids'].shape[0]
    
    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

dataset = Dataset(encodings)

In [54]:
BATCH_SIZE = 1
DO_SHUFFLE = True
testloader = torch.utils.data.DataLoader(dataset,batch_size=1, shuffle=DO_SHUFFLE)

for i, data in enumerate(testloader, 0):
    print(i)
    print(data)
    break

0
{'input_ids': tensor([[    0, 22811,  7938,    19,  4209,  6463,   914,    18,  1850, 10056,
         21106,  7113, 29862,  9440,     4, 13234,  7930, 20692,   330,   346,
           289, 17042,   300, 15065,     4,   622,   349,  7113,  4310,     4,
          1644,  3792,  2478,  4283,  3781,   335, 17855,  3792,    18,  1850,
          4401,   427,  8976,  4371,    16,  3781,   346, 15317, 10355,  5746,
          1358,  1916,  5907,   335, 17855,  5746,    16,  1523, 17855,  1504,
          6805,    18,     4,     4,  5746, 10588,  6858,  6805,  5746,    16,
          1523, 10674,  1504,  6805, 10588,  6858,     4, 16292,   330,   346,
           289, 15065, 12702,   622,   349,  7113,  4310,    16,  2625,  1210,
          3781,  7950,  1358, 17016,  1227,     4,    18,  1850,  4401,  1730,
           851,  6215,  2004,  2415,   427,  1540,   264,     4,  2179,   384,
         16379,  3015,  1072,     4,    16, 30194,  2048,  4283,   346,   335,
         17855,     4,  6805,    18,

In [55]:
len(testloader.dataset)

107

In [59]:
from tqdm.auto import tqdm
import numpy as np
from transformers import RobertaForMaskedLM

folder = "./fine_tuned_16/"
original_srberta = "./pre_trained/srberta_model_16"
num_trained = 15

for i in range(num_trained+1):
    print(i)
    
    if i == num_trained:
         model = RobertaForMaskedLM.from_pretrained(original_srberta)
    else:
        model = RobertaForMaskedLM.from_pretrained(folder + f"srberta_model_{i}")
        print("fine tuned")
    
    #model.to(device)
    model.eval()
    
    total_acc_test = 0
    total_num_mask = 0
    list_of_accuracies = []

    with torch.no_grad():

        loop = tqdm(testloader, leave=True)
        
        step=0
        for batch in loop:

            input_ids = batch['input_ids']#.to(device)
            mask = batch['attention_mask']#.to(device)
            labels = batch['labels']#.to(device)

            outputs = model(input_ids, attention_mask=mask, labels=labels)

            mask_token_index = (input_ids == 4)[0].nonzero(as_tuple=True)[0]
            total_num_mask += len(mask_token_index)
            
            indices = torch.topk(outputs.logits[0, mask_token_index],10,-1).indices
            
            acc2 = 0
            
            for i,top_five_scores in enumerate(torch.topk(outputs.logits[0, mask_token_index],5,-1).indices,0):
                if labels[0, mask_token_index][i] in top_five_scores:
                    acc2+=1
            
            list_of_accuracies.append(acc2/len(mask_token_index))
            
            total_acc_test += acc2

    print(f'Test overall Accuracy: {round(total_acc_test / total_num_mask*100, 2)}')
    
    
    accuracy_of_percents = sum(list_of_accuracies)/len(list_of_accuracies)
    
    print(f'Test accuracy of percents: {round(accuracy_of_percents*100, 2)}')
    
    if i==num_trained:
        file_txt = f"acc_results_original_model.txt"
    else:
        file_txt = f'acc_results_model_{i}.txt'
    
    with open(folder+file_txt, 'w') as f:
        f.write(str(round(total_acc_test / total_num_mask*100, 2)))
        f.write("\n")
        f.write(str(accuracy_of_percents))
    
        

0
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:39<00:00,  2.74it/s]


Test overall Accuracy: 79.55
Test accuracy of percents: 79.65
1
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:39<00:00,  2.69it/s]


Test overall Accuracy: 79.85
Test accuracy of percents: 79.93
2
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:39<00:00,  2.68it/s]


Test overall Accuracy: 79.67
Test accuracy of percents: 79.75
3
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.67it/s]


Test overall Accuracy: 79.24
Test accuracy of percents: 79.31
4
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.67it/s]


Test overall Accuracy: 78.03
Test accuracy of percents: 78.12
5
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.66it/s]


Test overall Accuracy: 78.87
Test accuracy of percents: 78.98
6
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:39<00:00,  2.68it/s]


Test overall Accuracy: 78.77
Test accuracy of percents: 78.86
7
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.67it/s]


Test overall Accuracy: 78.33
Test accuracy of percents: 78.43
8
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.66it/s]


Test overall Accuracy: 77.72
Test accuracy of percents: 77.81
9
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.67it/s]


Test overall Accuracy: 77.5
Test accuracy of percents: 77.59
10
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.67it/s]


Test overall Accuracy: 76.65
Test accuracy of percents: 76.72
11
fine tuned


100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:40<00:00,  2.67it/s]


Test overall Accuracy: 76.94
Test accuracy of percents: 77.05
12
fine tuned


  5%|███▊                                                                              | 5/107 [00:01<00:39,  2.57it/s]


KeyboardInterrupt: 

In [34]:
i


70

In [31]:
round(total_acc_test / total_num_mask*100, 2)

78.59