<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Tradu%C3%A7ao_dbedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install ftfy
!pip install transformers
!pip install sentencepiece
!pip install magic_timer

In [29]:
import os
import glob
import torch
import random
import pandas as pd

from transformers import MarianMTModel, MarianTokenizer
from transformers import logging
logging.set_verbosity_error()

from ftfy import fix_encoding
from magic_timer import MagicTimer

from spacy.lang.en import English


def pickle_file(path, data=None):
    """Read or write a pickle file, depending on whether ``data`` is given.

    Args:
        path: Location of the pickle file to read from or write to.
        data: Object to serialize. When left as ``None`` nothing is written;
            the file at ``path`` is unpickled and returned instead.

    Returns:
        The unpickled object when ``data`` is ``None``; otherwise ``None``
        (the object is written with ``pickle.HIGHEST_PROTOCOL``).
    """
    import pickle

    loading = data is None
    with open(path, 'rb' if loading else 'wb') as stream:
        if loading:
            # NOTE: unpickling can run arbitrary code; only load trusted files.
            return pickle.load(stream)
        pickle.dump(data, stream, protocol=pickle.HIGHEST_PROTOCOL)

# Single seed shared by every random number generator used in the notebook.
MANUAL_SEED = 341
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def deterministic(rep=True):
    """Seed the random number generators and set the cuDNN flags.

    Args:
        rep: When True (default), seed Python's ``random`` module and PyTorch
            (CPU and, if present, every CUDA device) with ``MANUAL_SEED``,
            set the cuDNN flags below and print a summary line. When False,
            nothing is seeded and only a notice is printed.

    Returns:
        None. The function works purely through global state and stdout.
    """
    if rep:
        random.seed(MANUAL_SEED)
        torch.manual_seed(MANUAL_SEED)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(MANUAL_SEED)
            torch.cuda.manual_seed_all(MANUAL_SEED)
        # cuDNN is switched off entirely; the two flags after it are kept so
        # the setup stays deterministic if cuDNN is re-enabled later.
        torch.backends.cudnn.enabled = False
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Experimento deterministico, seed: {MANUAL_SEED} -- ', end = '')
        if torch.cuda.is_available():
            print(f'Existe {torch.cuda.device_count()} GPU {torch.cuda.get_device_name(0)} disponível.')
        else:
            # get_device_name(0) raises without a CUDA device, so report the
            # CPU fallback instead of querying the GPU.
            print('Nenhuma GPU disponível, usando CPU.')
    else:
        print('Experimento randomico')


deterministic()

Experimento deterministico, seed: 341 -- Existe 1 GPU Tesla P100-PCIE-16GB disponível.


In [8]:
# Text file holding the source corpus; each line is treated as one document.
path_data = '/content/drive/MyDrive/Dirty-Talks/Topic Modeling/dbpedia_sample_abstract_20k_unprep.txt'
# The context manager guarantees the file handle is closed once the lines
# have been read; iterating the handle yields the same lines as readlines().
with open(path_data, encoding="utf-8") as corpus_file:
    corpus = [line.strip() for line in corpus_file]
corpus[0]

'The Mid-Peninsula Highway is a proposed freeway across the Niagara Peninsula in the Canadian province of Ontario. Although plans for a highway connecting Hamilton to Fort Erie south of the Niagara Escarpment have surfaced for decades,it was not until The Niagara Frontier International Gateway Study was published by the Ministry'

In [9]:
# Checkpoint identifier passed to from_pretrained(); judging by its name it is
# an English -> Romance-languages MarianMT model (target language is selected
# per sentence with a '>>lang<<' prefix token elsewhere in the notebook).
model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
# Tokenizer and model are loaded from the same checkpoint so their
# vocabularies match. Both calls download the files on first use.
marian_tokenizer = MarianTokenizer.from_pretrained(model_name)
marian_model = MarianMTModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/761k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

In [10]:
# Blank English pipeline whose only component is the sentence splitter.
nlp = English()
try:
    # spaCy v3+: components are added by their registered string name.
    nlp.add_pipe('sentencizer')
except ValueError:
    # spaCy v2: add_pipe() only accepts a component object, so build it first.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
def chunkstring_spacy(text):
    """Split ``text`` into sentences and prefix each with the target-language tag.

    Args:
        text: Document to split; it is coerced with ``str()`` before parsing.

    Returns:
        A list with one string per sentence yielded by ``nlp``, each starting
        with ``'>>pt_br<< '``. The list is empty when no sentence is found.
    """
    target_prefix = '>>pt_br<<' + ' '
    return [target_prefix + sentence.text for sentence in nlp(str(text)).sents]
    
def translate(aux_sent, max_length=512, num_beams=1):
    """Translate one document sentence by sentence with the Marian model.

    The document is split by ``chunkstring_spacy`` (which also adds the
    target-language prefix), the sentences are tokenized as one padded batch,
    decoded with ``marian_model.generate`` and joined back with single spaces.

    Args:
        aux_sent: Source document (anything ``chunkstring_spacy`` accepts).
        max_length: Maximum number of tokens kept per input sentence (longer
            ones are truncated) and maximum length of each generated sequence.
        num_beams: Beam width used for generation; 1 means greedy decoding.

    Returns:
        The translated document as a single string, or ``''`` when the
        document contains no sentences.
    """
    sentences = chunkstring_spacy(aux_sent)
    if not sentences:
        # Nothing to tokenize or generate (e.g. an empty document).
        return ''

    # Move the model to the selected device and switch to inference mode.
    # Both calls are no-ops after the first invocation.
    marian_model.to(device)
    marian_model.eval()

    # Calling the tokenizer directly replaces the deprecated
    # prepare_seq2seq_batch(); padding/truncation mirror its defaults.
    batch = marian_tokenizer(sentences,
                             max_length=max_length,
                             padding=True,
                             truncation=True,
                             return_tensors='pt')

    # The attention mask is passed explicitly so padding positions of the
    # shorter sentences in the batch are ignored by the encoder.
    translated = marian_model.generate(input_ids=batch['input_ids'].to(device),
                                       attention_mask=batch['attention_mask'].to(device),
                                       max_length=max_length,
                                       num_beams=num_beams,
                                       early_stopping=True,
                                       do_sample=False)

    tgt_text = [fix_encoding(marian_tokenizer.decode(t, skip_special_tokens=True)) for t in translated]
    return ' '.join(tgt_text)

In [27]:
deterministic()

path_base = '/content/drive/MyDrive/Dirty-Talks/Topic Modeling/data/'

# Absolute position in `corpus` to start from. A checkpoint named
# translated_dbpedia_N holds documents up to and including index N, so set
# this to N + 1 to resume after it (0 translates everything).
CONTINUE_FROM = 0
# A checkpoint is written whenever the absolute index is a multiple of this.
SAVE_EVERY = 2000

timer = MagicTimer()
translated_corpus = []
last_idx = len(corpus) - 1
# start=CONTINUE_FROM keeps idx an absolute corpus index, so resumed runs
# neither reuse earlier checkpoint names nor miss the final-chunk save.
for idx, doc in enumerate(corpus[CONTINUE_FROM:], start=CONTINUE_FROM):
    translated_corpus.append(translate(doc))

    if (idx > CONTINUE_FROM and idx % SAVE_EVERY == 0) or idx == last_idx:
        print(f'\tprocessed {idx}/{len(corpus)} samples. Time elapsed: {timer}')
        pickle_file(path_base+'translated_dbpedia_'+str(idx), translated_corpus)
        # Start a fresh chunk so each checkpoint holds only its own documents.
        translated_corpus = []

	processed 2000/20000 samples. Time elapsed: 9.2 minutes
	processed 4000/20000 samples. Time elapsed: 19 minutes
	processed 6000/20000 samples. Time elapsed: 27 minutes
	processed 8000/20000 samples. Time elapsed: 36 minutes
	processed 10000/20000 samples. Time elapsed: 45 minutes
	processed 12000/20000 samples. Time elapsed: 54 minutes
	processed 14000/20000 samples. Time elapsed: 1.1 hours
	processed 16000/20000 samples. Time elapsed: 1.3 hours
	processed 18000/20000 samples. Time elapsed: 1.4 hours
	processed 19999/20000 samples. Time elapsed: 1.6 hours


In [41]:
# The working directory is switched so the relative glob below finds the
# checkpoints; it stays changed for any later cell.
os.chdir(path_base)

# Order checkpoints by the corpus index embedded in the file name rather than
# by modification time, which need not follow corpus order after a resumed
# run or a file copy/sync. The order matters: it feeds the seeded shuffle and
# decides which duplicate is kept. Names without a numeric suffix are skipped.
chunk_files = [name for name in glob.glob("translated_dbpedia_*")
               if name.rsplit('_', 1)[-1].isdecimal()]
chunk_files.sort(key=lambda name: int(name.rsplit('_', 1)[-1]))

translated_concat = []
for chunk_file in chunk_files:
    # pickle_file() unpickles the chunk; only load checkpoints you produced.
    translated_concat += pickle_file(chunk_file)

#---------------------------------------------
df = pd.DataFrame({'doc':translated_concat})
# Drop exact duplicate documents (first occurrence kept), then shuffle
# reproducibly with the notebook seed.
df = df.drop_duplicates().sample(frac=1, random_state=MANUAL_SEED).reset_index(drop=True)
print(df.shape)
df.to_parquet(path_base+'dbpedia_translated_pandas.parquet', compression='gzip')
# Read the file back and compare to confirm the export round-trips intact.
df1 = pd.read_parquet(path_base+'dbpedia_translated_pandas.parquet')
assert df.equals(df1)

df.head(20)

(19991, 1)


Unnamed: 0,doc
0,"Mārti Š Krūmi Š (Martiš, 2 de março de 1900 - ..."
1,AniZona foi uma convenção anual de anime basea...
2,"Railroad Wash é um rio efêmero afluente ou ""Wa..."
3,Ler (nome e datas desconhecidas) foi um críque...
4,"Thunder Bay Island Light, localizado na ponta ..."
5,Winship Point (62°15′S 58°44′O / 62.250°S 58.7...
6,A Universidade de Ciências de Hanoi é membro d...
7,Teddy Bear Sing Along é uma mini-série britâni...
8,O Grupo de Dados de Partículas (ou PDG) é uma ...
9,"A discografia de My Bloody Valentine, uma band..."


In [53]:
# Show the translated document stored at position 16000.
df['doc'].iloc[16000]

'Na matemática, o sistema de números reais afinalmente estendido é obtido do sistema de números reais R adicionando dois elementos: +. e –. (lido como infinito positivo e infinito negativo, respectivamente). Esses novos elementos não são números reais. É útil para descrever vários comportamentos limitantes no cálculo.'