In [None]:
# Necesita embedding vectors en la carpeta ../we/

In [1]:
import csv
import re
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from numpy.linalg import norm
from gensim.models.wrappers import FastText

In [2]:
# Input CSV; one text is kept per (id, date) pair found in it.
infile_name = '../data/textos_20200518.csv'
# Maps (id, date string) -> raw text.
text_dict = {}

# Column names read from each CSV row.
id_field = 'id_movid'
date_field = 'fecha'
text_field = 's3_consulta_pqno_8_TEXT'
# Exact text values to skip while loading (currently none).
should_ignore = []

# newline='' hands line-ending handling to the csv module, so quoted fields
# that contain embedded line breaks are read back correctly.
with open(infile_name, newline='') as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        k = int(row[id_field])
        d = row[date_field]
        text = row[text_field]
        if text not in should_ignore:
            # A repeated (id, date) pair overwrites the earlier text.
            text_dict[(k, d)] = text

In [3]:
# Embedding format: 'bin' is loaded with FastText.load_fasttext_format,
# 'vec' is loaded from wordvectors_file + '.vec' with
# KeyedVectors.load_word2vec_format.
mode = 'bin' # or 'vec'
# Path of the embedding model, without the '.vec' suffix added in 'vec' mode.
wordvectors_file = '../we/fasttext-suc'
# Prefix shared by every output file written by this notebook.
output = '../out/suc'
# Date tag inserted into the output file names.
today = '20200518'

In [4]:
# Load the embeddings with the loader that matches `mode`.
if mode == 'bin':
    wordvectors = FastText.load_fasttext_format(wordvectors_file)
elif mode == 'vec':
    wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file + '.vec')
else:
    # Fail here rather than with a NameError on `wordvectors` in a later cell.
    raise ValueError(f"unknown mode {mode!r}; expected 'bin' or 'vec'")

In [5]:
# Characters kept by clean_text; every other character becomes a space.
letters = set('aáeéoóíúiuüàèìòùbcdfghjklmnñopqrstvwxyz')
numbers = set('1234567890')
# Tokens removed by delete_ignored_tokens.
ignore_tokens = ['soy']

def clean_text(text):
    """Lowercase `text` and keep only the characters in `letters` and `numbers`.

    Every other character is replaced by a space, runs of spaces are
    collapsed to one, and leading/trailing spaces are removed.

    The text is first normalised to NFC so that an accented letter typed as
    base letter + combining mark (e.g. 'i' + U+0301) is recognised as the
    single accented letter instead of being split into a letter and a space.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; '' when nothing survives.
    """
    import unicodedata

    # Built once per call instead of once per character.
    allowed = letters | numbers
    text = unicodedata.normalize('NFC', text.lower()).strip()
    cleaned = ''.join(char if char in allowed else ' ' for char in text)
    return re.sub(' +', ' ', cleaned).strip()

def tokenize(text):
    """Return the whitespace-separated tokens of `text` after clean_text."""
    return clean_text(text).split()

def delete_ignored_tokens(tokens):
    """Return a new list with the tokens not listed in ignore_tokens, order kept."""
    kept = []
    for tok in tokens:
        if tok in ignore_tokens:
            continue
        kept.append(tok)
    return kept

def to_vector(text, we, verbose=True):
    """Embed `text` as the unit-length mean of its in-vocabulary word vectors.

    Parameters
    ----------
    text : str
        Raw text; it is tokenized with tokenize() and then filtered with
        delete_ignored_tokens().
    we : object
        Word embeddings supporting `word in we` and `we[word]`. When it has
        a `vector_size` attribute that value is used as the vector
        dimension; otherwise 300 is assumed.
    verbose : bool, optional
        When true, print the tokens of any text that cannot be embedded.

    Returns
    -------
    numpy.ndarray
        The averaged vector scaled to norm 1, or an all-zero vector when no
        token is in the vocabulary (or the accumulated sum is zero).
    """
    tokens = delete_ignored_tokens(tokenize(text))
    # Follow the embedding's own dimension instead of hard-coding 300.
    dim = getattr(we, 'vector_size', 300)
    vec = np.zeros(dim)
    n = 0
    for word in tokens:
        # accumulate the word only if it is in the vocabulary
        if word in we:
            vec += we[word]
            n += 1
    if n == 0 or norm(vec) == 0:
        if verbose:
            print('not possible to create vector for:', tokens)
        return vec
    vec = vec / n
    return vec / norm(vec)

def out_emb_line(emb):
    """Serialize the values of `emb` as one tab-separated line ending in a newline."""
    return '\t'.join(map(str, emb)) + '\n'

In [6]:
# One embedding per (id, date) key, computed in text_dict order.
emb_dict = {
    id_t: to_vector(text_dict[id_t], wordvectors)
    for id_t in text_dict
}

not possible to create vector for: []
not possible to create vector for: []
not possible to create vector for: []
not possible to create vector for: []
not possible to create vector for: []
not possible to create vector for: []
not possible to create vector for: []


In [7]:
# Two parallel files: one embedding per line, and one metadata line
# (after a header) for the same items in the same order.
outfilename_we = output + '_' + today + '_emb.tsv'
outfilename_metadata = output + '_' + today + '_metadata.tsv'

with open(outfilename_we, 'w') as outfile_we, open(outfilename_metadata, 'w') as outfile_metadata:
    outfile_metadata.write('movid_id\ttexto\n')
    for id_t, emb in emb_dict.items():
        k, _date = id_t
        # A raw tab or line break inside the text would add columns/rows to
        # the metadata file and break its line-by-line correspondence with
        # the embedding file, so they are replaced by a single space.
        text = re.sub(r'[\t\r\n]+', ' ', text_dict[id_t])
        outfile_metadata.write(f'{k}\t{text}\n')
        outfile_we.write(out_emb_line(emb))

# Ahora clasifica

## Clusters observados

In [9]:
# datos base (entrenamiento)
from classes_and_labels import train_data, labels, labels_to_classes, classes

## Computa embeddings para los textos observados

In [10]:
# emb_train_data[c] holds the embeddings of the texts in train_data[c].
emb_train_data = [
    [to_vector(text, wordvectors) for text in cluster]
    for cluster in train_data
]

In [12]:
def similarity_text(text_1, text_2, wordvectors):
    """Return similarity() between the to_vector embeddings of two texts."""
    first = to_vector(text_1, wordvectors)
    second = to_vector(text_2, wordvectors)
    return similarity(first, second)


def similarity(vec_1, vec_2):
    """Return vec_1 @ vec_2, i.e. the dot product for two 1-D vectors."""
    return vec_1 @ vec_2

def similarity_to_cluster(vec_1, cluster, dist=0):
    """Aggregate the similarity between `vec_1` and every vector in `cluster`.

    Parameters
    ----------
    vec_1 : array-like
        Query vector.
    cluster : iterable
        Vectors to compare against with similarity().
    dist : int, optional
        0 returns the maximum similarity, 1 returns the average; any other
        value returns 0.

    Returns
    -------
    number
        The aggregated similarity, or 0 for an empty cluster.
    """
    sims = [similarity(vec_1, vec) for vec in cluster]
    if not sims:
        # np.max raises on an empty sequence and np.average yields nan;
        # an empty cluster simply has no similarity to offer.
        return 0
    if dist == 0:
        return np.max(sims)
    if dist == 1:
        return np.average(sims)
    return 0

def most_similar(vec_1, vec_set):
    """Return (index, similarity) of the vector in `vec_set` closest to `vec_1`.

    Only similarities strictly greater than 0 can win; when there is none
    the result is (-1, 0). On ties the earliest vector is kept.
    """
    winner, top = -1, 0
    for idx, candidate in enumerate(vec_set):
        score = similarity(vec_1, candidate)
        if score > top:
            winner, top = idx, score
    return winner, top
    
def emb_classify(vec1, emb_train_data, dist=0):
    """Return (cluster index, similarity) of the cluster most similar to `vec1`.

    Each cluster is scored with similarity_to_cluster(vec1, cluster, dist).
    Only scores strictly greater than 0 can win; when there is none the
    result is (-1, 0). On ties the earliest cluster is kept.
    """
    winner, top = -1, 0
    for idx, members in enumerate(emb_train_data):
        score = similarity_to_cluster(vec1, members, dist)
        if score > top:
            winner, top = idx, score
    return winner, top

def save_out_classes(text_cls, text_sims, outfilename):
    """Write one CSV row per classified text to `outfilename`.

    Columns: id_movid, fecha, one 0/1 indicator con_<j> for every j in
    1..len(classes)-1, class_id, text, class, sim.

    Parameters
    ----------
    text_cls : dict
        Maps each (id, date) key to the predicted label index; a negative
        index (emb_classify returns -1 when nothing matched) means no label.
    text_sims : dict
        Maps the same keys to the similarity of the prediction.
    outfilename : str
        Path of the CSV file to create.

    Notes
    -----
    Reads the module-level `text_dict`, `classes` and `labels_to_classes`.
    Texts equal to 'NA' or '' and texts without a label are written with
    class_id 0, sim 0 and every indicator at 0.
    """
    no_class = ['NA', '']
    class_ids = range(1, len(classes))

    # newline='' is what the csv module expects for files it writes to;
    # without it some platforms emit an extra blank line after each row.
    with open(outfilename, 'w', newline='') as outfile:
        fieldnames = ['id_movid', 'fecha']
        fieldnames += ['con_' + str(j) for j in class_ids]
        fieldnames += ['class_id', 'text', 'class', 'sim']

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for k in text_cls:
            i, date = k
            row = {'id_movid': i, 'fecha': date, 'text': text_dict[k]}
            for j in class_ids:
                row['con_' + str(j)] = 0

            label = text_cls[k]
            # A negative label must not be used to index labels_to_classes:
            # on a list it would silently pick the last entry, on a dict it
            # would raise KeyError.
            if row['text'] in no_class or label < 0:
                class_id = 0
                row['sim'] = 0
            else:
                class_id = labels_to_classes[label]
                row['sim'] = text_sims[k]
                row['con_' + str(class_id)] = 1

            row['class_id'] = class_id
            row['class'] = classes[class_id]
            writer.writerow(row)

In [13]:
# Predicted label index and similarity for every embedded text.
text_cls = {}
text_sims = {}
for k in emb_dict:
    c, sim = emb_classify(emb_dict[k], emb_train_data, dist=0)
    text_cls[k] = c
    text_sims[k] = sim
    # Print a small random sample (about 0.2% of the texts) as a spot check.
    if np.random.rand() > 0.998:
        # c == -1 means no cluster matched; indexing labels_to_classes with
        # it would show a wrong class or raise, so show class 0 instead, as
        # save_out_classes does for texts without a class.
        class_id = labels_to_classes[c] if c >= 0 else 0
        print(text_dict[k], '/', classes[class_id])

Por la fiebre sí, pero estoy con pielonefritis aguda / Sintomas habituales o atribuye a otra causa
Por que siempre tengo tos soy asmática / Sintomas habituales o atribuye a otra causa
Temor a las personas imprudente, sin ninguncuidado / Considera que no tiene riesgo
Tengo colon irritable / Sintomas habituales o atribuye a otra causa
No quiero asistir al consultorio por algo sin importancia / Miedo a contagiarse
Por que asocio el dolor de cabeza a mi perido premenstrual y la diarrea por algo muy pesado que comi / Sintomas habituales o atribuye a otra causa
porque soy jaquecosa, es normal para mí tener dolores de cabeza / Sintomas habituales o atribuye a otra causa
tomo medicamento para ello / Sintomas leves o transitorios
Porque estoy diagnosticada con fibromialgia y con tratamiento de hace años / Sintomas leves o transitorios
no tengo previsión y no puedo salir a un consultorio / Miedo a contagiarse
Son síntomas asociados a mi patología miastenia Gravis / Sintomas habituales o atribuye

In [14]:
# Destination of the per-text classification CSV.
outfilename = f'{output}_{today}_classes.csv'
save_out_classes(
    text_cls=text_cls,
    text_sims=text_sims,
    outfilename=outfilename,
)

In [30]:
        # Scratch copy of the header construction inside save_out_classes,
        # still carrying the indentation it had there.
        fieldnames = ['id_movid', 'fecha']
        fieldnames += ['con_'+str(i) for i in range(1, len(classes))]
        fieldnames += ['class_id', 'text','class', 'sim']

In [31]:
# Display the column list built in the previous cell.
fieldnames

['id_movid',
 'fecha',
 'con_1',
 'con_2',
 'con_3',
 'con_4',
 'con_5',
 'con_6',
 'class_id',
 'text',
 'class',
 'sim']