In [1]:
%%capture
# Install the third-party packages for this notebook into the kernel's
# environment; the %%capture cell magic above hides pip's output.
%pip install boto3 sentence_transformers datasets

In [2]:
from transformers import AutoTokenizer, AutoModel
from time import time
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from tqdm import tqdm

In [12]:
from datasets import load_dataset

# Benchmark corpus: the first `num_sent` entries of the 'Plot' column
# from the train split of the movie-plot dataset.
num_sent = 1000
dataset = load_dataset("vishnupriyavr/wiki-movie-plots-with-summaries")
sentences = dataset['train']['Plot'][:num_sent]

# BERT Timing

In [13]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [14]:
# Load the pretrained checkpoint named by `model_name`: the encoder is placed
# on `device` straight away, and the matching tokenizer is loaded alongside it.
model_name = 'bert-base-uncased'
model = AutoModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
# Time how long it takes to tokenize every sentence, one tokenizer call each.
tok_time_start = time()

tokens = {'input_ids': [], 'attention_mask': []}
for sentence in sentences:
    # Calling the tokenizer directly replaces the deprecated `encode_plus`;
    # for a single string both return tensors with a leading batch dim of 1.
    new_tokens = tokenizer(sentence, return_tensors='pt',
                           padding='max_length', truncation=True)
    # Drop the batch dimension so the per-sentence rows can be stacked below.
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# Collapse the per-sentence rows into one tensor per key (one row per sentence).
tokens['input_ids'] = torch.stack(tokens['input_ids'])
tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

tok_time_end = time()
tok_time = tok_time_end - tok_time_start

# `.2f` prints two decimals; the previous `2f` spec only set a minimum field
# width of 2 and therefore printed the default six decimals.
print(f"Bert tokenization time: {tok_time:.2f} sec")

# Sum of the attention mask over all sentences, computed outside the timed
# region so it does not affect `tok_time`.
total_tokens = tokens['attention_mask'].sum()

Bert tokenization time: 0.050523 sec


In [16]:
# Display the summed attention mask computed in the tokenization cell.
total_tokens

tensor(1923)

In [None]:
# Gather the tokenizer tensors positionally (input_ids first, attention_mask
# second), which is the order the evaluation loop indexes its batches in.
# The tensors are deliberately not moved to `device` here: the evaluation loop
# already transfers every batch, so moving the whole set up front was redundant.
list_output = []
for k, v in tokens.items():
    print(k)
    list_output.append(v)

input_ids
attention_mask


In [None]:
# Time a full forward pass of the model over all tokenized sentences.
eval_batch_size = 4
enc_time_start = time()
eval_dataset = TensorDataset(*list_output)

# Sequential sampling feeds the sentences in their original order.
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

for batch in tqdm(eval_dataloader, desc="Running..."):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1]}
    # The forward pass itself must run under no_grad; previously only the
    # dict construction was inside the context, so autograd state was still
    # being recorded for every batch during the benchmark.
    with torch.no_grad():
        outputs = model(**inputs)

# CUDA kernels are launched asynchronously: wait for all queued work to
# finish before stopping the clock, otherwise the GPU time is under-reported.
if device.type == 'cuda':
    torch.cuda.synchronize()

enc_time_end = time()
enc_time = enc_time_end - enc_time_start
# Label fixed from "Charbert": the model timed here is `bert-base-uncased`,
# matching the "Bert tokenization time" message of the tokenization cell.
print(f"Bert encoding time: {enc_time:.2f} sec")

Running...: 100%|██████████| 250/250 [00:36<00:00,  6.80it/s]

Charbert encoding time: 36.75 sec





In [None]:
# Raw wall-clock durations, in seconds, of the tokenization and encoding stages.
for elapsed_seconds in (tok_time, enc_time):
    print(elapsed_seconds)

1.6059489250183105
36.75372338294983


In [None]:
# Show the summed attention mask as a plain Python number instead of a tensor.
total_tokens.item()

301430

In [None]:
# Throughput based on `total_tokens` (the summed attention mask) for each
# stage and for the two stages combined.
stage_durations = (
    ('Tokenization', tok_time),
    ('Encoding', enc_time),
    ('Total', tok_time + enc_time),
)
for stage_label, stage_seconds in stage_durations:
    print(f'{stage_label}: {(total_tokens/stage_seconds):.2f} token per sec')

Tokenization: 61833.32 token per sec
Encoding: 7435.01 token per sec
Total: 6636.96 token per sec


In [None]:
tot_sent = len(sentences)

# Throughput counting every padded position, not only the attended tokens.
# The padded sequence length is read from the tokenized tensor instead of
# being hard-coded as 512, so the figures stay correct if the model or the
# maximum length changes.
padded_len = tokens['input_ids'].shape[1]
padded_tokens = tot_sent * padded_len

print(f'Tokenization: {(padded_tokens/tok_time):.2f} token per sec')
print(f'Encoding: {(padded_tokens/enc_time):.2f} token per sec')
print(f'Total: {(padded_tokens/(tok_time + enc_time)):.2f} token per sec')

Tokenization: 127975.32 token per sec
Encoding: 15388.10 token per sec
Total: 13736.40 token per sec


**100 sentences:**
* Tokenization: 244791.36412807446 token per sec
* Encoding: 21818.590556069936 token per sec
* Total: 20033.019966939435 token per sec

_________
* Tokenization: 164165.77 token per sec
* Encoding: 11792.14 token per sec
* Total: 11001.87 token per sec

