In [2]:
from transformers import BertModel, BertForMaskedLM, BertTokenizer
from utils import read_movid_file, load_we, generate_emb_dict, save_emb_and_meta, generate_bert_emb_dict

# Lee datos

In [3]:
# Input CSV; the loading cell passes this path to read_movid_file.
infile_name = "../data/20200629.csv"

In [8]:
# Name of the field whose texts are loaded from the input CSV.
text_field = 's3_cons_otra_TEXT'

# Load the texts for that field. `after=20200101` looks like a YYYYMMDD
# cutoff -- TODO confirm its exact meaning against read_movid_file in utils.
text_dict = read_movid_file(infile_name, text_field, after=20200101)
print(f'{len(text_dict)} textos cargados del campo {text_field}')

# Fail fast: the cells that follow compute embeddings over text_dict and take
# minutes to run, so stop here if nothing was loaded (wrong path, wrong field
# name, or a cutoff that filters everything out).
assert len(text_dict) > 0, (
    f'no se cargó ningún texto del campo {text_field} desde {infile_name}'
)

15047 textos cargados del campo s3_cons_otra_TEXT


# Carga el modelo de embeddings (fastText), calcula y guarda los embeddings de los textos

In [5]:
# Load the pretrained word vectors that the next cell passes to
# generate_emb_dict. The second argument ('bin') is presumably the file format
# and `limit=40000` presumably caps how many vectors are read -- TODO confirm
# both against load_we in utils.
# The recorded run of this cell took ~50 min of wall time, so avoid re-running
# it unless the vectors are actually needed again.
%time wordvectors = load_we('../we/fasttext-suc', 'bin', limit=40000)

CPU times: user 7min 11s, sys: 13min 22s, total: 20min 33s
Wall time: 49min 37s


In [9]:
# Compute embeddings for the loaded texts from the word vectors; the result is
# written to disk by the save_emb_and_meta cell that follows.
# Presumably one embedding per entry of text_dict -- TODO confirm against
# generate_emb_dict in utils.
%time emb_dict = generate_emb_dict(text_dict, wordvectors, verbose=False)

CPU times: user 2.99 s, sys: 2.71 s, total: 5.69 s
Wall time: 11 s


In [10]:
# Save the word-vector embeddings via save_emb_and_meta. The two paths are
# presumably the embeddings file and its metadata file, in that order --
# TODO confirm against the helper in utils.
save_emb_and_meta(
    text_dict,
    emb_dict,
    '../out/20200702_ftsuc.s3_cons.tsv',
    '../out/20200702_ftsuc.s3_cons.meta.tsv',
)

# Carga el modelo BERT en español, calcula y guarda los embeddings

In [11]:
# Checkpoint id, named once so the tokenizer and the model weights are always
# loaded from the same checkpoint and cannot drift apart when it is changed.
BERT_MODEL_NAME = "dccuchile/bert-base-spanish-wwm-uncased"

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
model = BertModel.from_pretrained(BERT_MODEL_NAME)

# Make inference mode explicit. Per the Transformers docs, from_pretrained
# already returns the model in evaluation mode, so this is a safeguard rather
# than a requirement. eval() returns the module itself; assigning the result
# keeps the notebook from echoing the full model repr as the cell output.
model = model.eval()

In [12]:
# Compute BERT-based embeddings for the same texts with the tokenizer and model
# loaded in the previous cell, processing them in batches of 40.
# The recorded run ended at `batch:376, examples:15047/15047` and took ~27 min
# of wall time, so avoid re-running this cell unnecessarily.
%time bert_emb_dict = generate_bert_emb_dict(text_dict, tokenizer, model, batch_size=40, verbose=True)

batch:376, examples:15047/15047
done
CPU times: user 39min 57s, sys: 4min 22s, total: 44min 19s
Wall time: 27min 11s


In [13]:
# Save the BERT embeddings with the same helper used for the word-vector
# embeddings, writing to a separate pair of TSV paths (presumably embeddings
# file first, metadata file second -- TODO confirm against utils).
%time save_emb_and_meta(text_dict, bert_emb_dict, '../out/20200702_bert.s3_cons.tsv', '../out/20200702_bert.s3_cons.meta.tsv')

CPU times: user 54.4 s, sys: 705 ms, total: 55.1 s
Wall time: 56.4 s
