In [1]:
import csv
import re
import numpy as np
import torch
from transformers import BertModel, BertForMaskedLM, BertTokenizer
from torch.nn.utils.rnn import pad_sequence

In [2]:
infile_name = '../data/textos_20200518.csv'
text_dict = {}

id_field = 'id_movid'
date_field = 'fecha'
text_field = 's3_consulta_pqno_8_TEXT'
should_ignore = []

with open(infile_name) as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        k = int(row[id_field])
        d = row[date_field]
        text = row[text_field]
        if text not in should_ignore:
            text_dict[(k,d)] = text

In [3]:
output = '../out/beto'
today = '20200518'

In [4]:
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31002, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [13]:
letters = set('aáeéoóíúiuüàèìòùbcdfghjklmnñopqrstvwxyz')
numbers = set('1234567890')
ignore_tokens = ['soy']

def clean_text(text):
    char_tokens = []
    text = text.lower().strip()
    for char in text:
        if char in (letters | numbers):
            to_append = char
        else:
            to_append = ' '
        char_tokens.append(to_append)
    text = re.sub(' +',' ',''.join(char_tokens)).strip()
    return text

def delete_ignored_tokens(tokens):
    new_tokens = [token for token in tokens if token not in ignore_tokens]
    return new_tokens

def to_vector(text, tokenizer, model, verbose=True):
    text = clean_text(text)
    text = '[CLS] ' + text + ' [SEP]'
    tokens = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([indexed_tokens]) # Batch size 1
    outputs = model(tokens_tensor)
    hidden_size = outputs[0].size()[-1]
    vectors = outputs[0].view(-1,768)
    vec = torch.mean(vectors, dim=0)
    return vec / torch.norm(vec)

def out_emb_line(emb):
    line = [str(n) for n in emb]
    line = '\t'.join(line) + '\n'
    return line

def out_emb_line_torch(emb):
    line = [str(n.item()) for n in emb]
    line = '\t'.join(line) + '\n'
    return line

def prepare_text_for_bert(text):
    text = clean_text(text)
    text = '[CLS] ' + text + ' [SEP]'
    return text

def to_vector_batch(list_of_texts, tokenizer, model, batch_size=None, device='cpu', verbose=True):
    if device:
      model = model.to(device)
    N = len(list_of_texts)
    if not batch_size:
        batch_size = N
    b, i = 0, 0
    embs_list = []
    while i < N:
        j = i + batch_size
        if j > N:
            j = N
        if verbose:
            info = f'batch:{b}, examples:{j}/{N}'
            print(info)
            b += 1
        current_list = list_of_texts[i:j]
        token_ids = tokenizer.batch_encode_plus(current_list, pad_to_max_length=True)['input_ids']
        tensor_token_ids = [torch.tensor(x) for x in token_ids]
        model_input = pad_sequence(tensor_token_ids, batch_first=True, padding_value=1)
        if device:
            model_input = model_input.to(device)
        output_embs = model(model_input)[0].data
        embeddings = output_embs.mean(dim=1)
        embeddings = embeddings.to('cpu')
        embs_list.append(embeddings)
        del output_embs
        del model_input
        i = j # next batch
    embs = torch.cat(embs_list)
    if verbose:
        print('done')
    return embs

In [6]:
l_keys_all = list(text_dict.keys())
l_text_all = [prepare_text_for_bert(text_dict[k]) for k in l_keys_all]

In [15]:
l_text = l_text_all
l_keys = l_keys_all

In [16]:
%time embs = to_vector_batch(l_text, tokenizer, model, batch_size=50)

batch:0, examples:50/10157
batch:1, examples:100/10157
batch:2, examples:150/10157
batch:3, examples:200/10157
batch:4, examples:250/10157
batch:5, examples:300/10157
batch:6, examples:350/10157
batch:7, examples:400/10157
batch:8, examples:450/10157
batch:9, examples:500/10157
batch:10, examples:550/10157
batch:11, examples:600/10157
batch:12, examples:650/10157
batch:13, examples:700/10157
batch:14, examples:750/10157
batch:15, examples:800/10157
batch:16, examples:850/10157
batch:17, examples:900/10157
batch:18, examples:950/10157
batch:19, examples:1000/10157
batch:20, examples:1050/10157
batch:21, examples:1100/10157
batch:22, examples:1150/10157
batch:23, examples:1200/10157
batch:24, examples:1250/10157
batch:25, examples:1300/10157
batch:26, examples:1350/10157
batch:27, examples:1400/10157
batch:28, examples:1450/10157
batch:29, examples:1500/10157
batch:30, examples:1550/10157
batch:31, examples:1600/10157
batch:32, examples:1650/10157
batch:33, examples:1700/10157
batch:34, 

In [17]:
emb_dict = {}

for emb, id_t in zip(embs,l_keys):
    emb_dict[id_t] = emb

In [18]:
outfilename_we = output + '_' + today + '_emb.tsv'
outfilename_metadata = output + '_' + today + '_metadata.tsv'

with open(outfilename_we,'w') as outfile_we, open(outfilename_metadata,'w') as outfile_metadata:
    header_line = 'movid_id\ttexto\n'
    outfile_metadata.write(header_line)
    for id_t in emb_dict:
        k, date = id_t
        text = text_dict[id_t]
        text_line = f'{k}\t{text}\n'
        outfile_metadata.write(text_line)
        
        emb = emb_dict[id_t]
        emb_line = out_emb_line_torch(emb)
        outfile_we.write(emb_line)

# Ahora clasifica

## Clusters observados

In [19]:
# datos base (entrenamiento)
from classes_and_labels import train_data, labels, labels_to_classes, classes

## Computa embeddings para los textos observados

In [24]:
emb_train_data = []
for cluster in train_data:
    embs = []
    for text in cluster:
        emb = to_vector(text,  tokenizer, model)
        embs.append(emb)
    emb_train_data.append(embs)

In [25]:
def similarity_text(text_1, text_2, wordvectors):
    vec_1 = to_vector(text_1, wordvectors)
    vec_2 = to_vector(text_2, wordvectors)
    return similarity(vec_1, vec_2)


def similarity(vec_1, vec_2):
    sim = vec_1 @ vec_2
    return sim

def similarity_to_cluster(vec_1, cluster, dist=0):
    sims = []
    for vec in cluster:
        sim = similarity(vec_1, vec)
        sims.append(sim)
    if dist == 0:
        return np.max(sims)
    elif dist == 1:
        return np.average(sims)
    else:
        return 0

def most_similar(vec_1, vec_set):
    best_sim = 0
    out = -1
    for i,vec in enumerate(vec_set):
        sim = similarity(vec_1, vec)
        if sim > best_sim:
            out = i
            best_sim = sim
    return out, best_sim
    
def emb_classify(vec1, emb_train_data, dist=0):
    best_sim = 0
    predicted_class = -1
    for c, cluster in enumerate(emb_train_data):
        sim = similarity_to_cluster(vec1, cluster, dist)
        if sim > best_sim:
            predicted_class = c  
            best_sim = sim
    return predicted_class, best_sim

def save_out_classes(text_cls, text_sims, outfilename):
    no_class = ['NA','']
    
    with open(outfilename, 'w') as outfile:
        fieldnames = ['id_movid', 'fecha']
        fieldnames += ['con_'+str(i) for i in range(1, len(classes))]
        fieldnames += ['class_id', 'text','class', 'sim']
        
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for k in text_cls:
            row = {}
            i, date = k
            row['id_movid'] = i
            row['fecha'] = date
            row['text'] = text_dict[k]
            
            for j in range(1, len(classes)):
                row['con_'+str(j)] = 0
            
            if row['text'] in no_class:
                class_id = 0
                row['sim'] = 0
            else:
                class_id = labels_to_classes[text_cls[k]]
                row['sim'] = text_sims[k]
                row['con_'+str(class_id)] = 1
                
            row['class_id'] = class_id
            row['class'] = classes[class_id]
            writer.writerow(row)

In [26]:
text_cls = {}
text_sims = {}
for k in emb_dict:
    c, sim = emb_classify(emb_dict[k], emb_train_data, dist=0)
    text_cls[k] = c
    text_sims[k] = sim
    if np.random.rand() > 0.995:
        print(text_dict[k], '/', classes[labels_to_classes[c]])

un dia de cefalea que cedió ante analgesia / Sintomas leves o transitorios
Dolor de garganta autolimitado por dos dias / Sintomas leves o transitorios
Jaquecosa / Sintomas leves o transitorios
Me he sentido bien / Considera que no tiene riesgo
Porque los síntomas no cumplen con los criterios para la acudir al sistema de salud / Sintomas habituales o atribuye a otra causa
Por ser Dolor tipo migraña que tengo habitualmente / Sintomas leves o transitorios
Lo asimilo ai trabajo / Considera que no tiene riesgo
porque padezco de migrañas y estoy con tratamiento / Considera que no tiene riesgo
Es un sintoma habitual cada año en esta precisa trempoarada / Sintomas habituales o atribuye a otra causa
Considero la prioridad de otras personas en el sistema de salud / Considera que no tiene riesgo
Soy médico / Sintomas leves o transitorios
Reconozco dolor de cabeza como algo tensional / Sintomas leves o transitorios
Asma descompensado por mala adherencia / Considera que no tiene riesgo
SUFRO DE ASM

In [27]:
outfilename = f'{output}_{today}_classes.csv'
save_out_classes(text_cls, text_sims, outfilename)