In [414]:
import os, glob, json, spacy, re
import pandas as pd
import numpy as np

from functools import reduce

from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from gensim.models import KeyedVectors

Carregamento dos modelos de embedding

In [185]:
# Word vectors read from a word2vec-format text file.
# The path is assembled with os.path.join so it also resolves on systems
# whose path separator is not a backslash.
genmodel = KeyedVectors.load_word2vec_format(os.path.join('data', 'wordemb', 'fasttext50.txt'))

In [187]:
# Second set of word vectors, read from a word2vec-format text file.
# The path is assembled with os.path.join so it also resolves on systems
# whose path separator is not a backslash.
spemodel = KeyedVectors.load_word2vec_format(os.path.join('data', 'wordemb', 'specific.txt'))

In [186]:
# spaCy pipeline "pt_core_news_lg"; later cells call `nlp(...)` to split text into tokens.
nlp = spacy.load("pt_core_news_lg")

Arquivos de texto para treinamento

In [None]:
# Paths of the annotated JSON files.
# Sorted because glob returns matches in arbitrary order, and every list derived
# from these files inherits that order; os.path.join keeps the pattern portable.
annotated = sorted(glob.glob(os.path.join('data', 'annotated', '*.json')))

In [None]:
# Collect the 'content' field of every record found in every annotated file.
all_content = []
for json_path in annotated:
    # `with` closes each file as soon as it has been parsed; the bare open()
    # used before left one handle open per file.
    with open(json_path, 'r', encoding='utf-8') as fp:
        records = json.load(fp)
    all_content.extend(record['content'] for record in records)

In [201]:
from functools import reduce
# Flat list with the text of every token of every document, in document order.
# A single comprehension avoids the repeated list concatenation of reduce()
# (quadratic in the number of tokens) and yields [] instead of raising when
# all_content is empty.
all_words = [t.text for c in all_content for t in nlp(c)]

In [None]:
# Parsed contents of the utterances JSON file.
# os.path.join keeps the path valid on systems whose separator is not a backslash.
with open(os.path.join('data', 'trainset', 'utterances.json'), 'r', encoding='utf-8') as fp:
    jtrainset = json.load(fp)

In [None]:
# Empty frame with the columns 'tipo' and 'frase'.
# NOTE(review): no visible cell copies `jtrainset` into this frame, yet later cells
# read rows from it — the cell that populated it appears to be missing; confirm
# before relying on Restart & Run All.
trainset = pd.DataFrame(columns=['tipo','frase'])

In [None]:
# Number of spaCy tokens in each sentence, stored next to the sentence itself.
trainset['frase_len'] = trainset['frase'].apply(lambda sentence: len(nlp(sentence)))

In [None]:
# Largest token count found in the 'frase_len' column.
MAX_LEN = trainset['frase_len'].max()

Character Embedding

In [170]:
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input
from keras.models import Model

Using TensorFlow backend.


In [216]:
# Characters known to the character-level encoder.
# The literal is raw, so the backslash and the line break that end its first
# line are part of the vocabulary rather than a line continuation.
CHAR_VOCAB = r"abcdefghijklmnopqrstuvwxyzçàáéíóúãõâêîôûABCDEFGHIJKLMNOPQRSTUVWXYZÇÀÁÉÍÓÚÃÕÂÊÎÔÛ\
,\"`'“”:;?. !()[]{}<>+–-*/=<>.#$%&@/\^_|~%§°ºª1234567890•"
# Character -> 1-based position in CHAR_VOCAB. A character that occurs more than
# once (for example '.', '/', '<', '>', '%' and the backslash) keeps the position
# of its last occurrence, so a few indices are never assigned.
CHAR_DICT = dict(zip(CHAR_VOCAB, range(1, len(CHAR_VOCAB) + 1)))
# Index 0 stands for any character outside the vocabulary.
CHAR_DICT.update(unk=0)
# Number of character positions kept per word.
MAX_CHAR_LEN = 50

In [221]:
def word2number(c):
    """Return the integer index of character ``c`` in ``CHAR_DICT``.

    Args:
        c: a single character.

    Returns:
        The value stored for ``c`` in ``CHAR_DICT``, or ``CHAR_DICT['unk']``
        when ``c`` is not one of its keys.
    """
    # dict.get expresses the "unknown character" fallback directly; the bare
    # `except:` it replaces also silenced unrelated errors.
    return CHAR_DICT.get(c, CHAR_DICT['unk'])

In [244]:
# One row of character indices per word, padded with zeros at the end.
# The width comes from MAX_CHAR_LEN instead of a repeated literal 50.
words_dataset = pad_sequences(
    [[word2number(ch) for ch in word] for word in all_words],
    maxlen=MAX_CHAR_LEN,
    padding='post',
)

In [292]:
# All character indices currently stored in CHAR_DICT.
l = list(CHAR_DICT.values())

In [362]:
# Size of the embedding table: largest character index plus one (index 0 included).
# Computed straight from CHAR_DICT so the cell does not depend on a scratch
# variable defined in another cell.
VOCAB_SIZE = max(CHAR_DICT.values()) + 1

In [172]:
def custom_tokenizer(nlp):
    """Build a spaCy tokenizer that uses a custom infix pattern.

    Prefix and suffix rules are compiled from the defaults of ``nlp``; the
    infix rule is a fixed character class of punctuation and quote characters.

    Args:
        nlp: a loaded spaCy pipeline; its ``vocab`` and ``Defaults`` are read.

    Returns:
        A ``spacy.tokenizer.Tokenizer`` bound to ``nlp.vocab``.
    """
    # Imported here under an alias: the bare name `Tokenizer` in this notebook is
    # bound by `from keras.preprocessing.text import one_hot,Tokenizer`, which is
    # not a spaCy tokenizer and does not accept these arguments.
    from spacy.tokenizer import Tokenizer as SpacyTokenizer

    infix_re = re.compile(r'''[\,\?\:\;\‘\’\`\“\”\"~]''')
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    return SpacyTokenizer(nlp.vocab, prefix_search=prefix_re.search,
                                     suffix_search=suffix_re.search,
                                     infix_finditer=infix_re.finditer,
                                     token_match=None)

In [178]:
def sentence_to_tokens(s):
    """Split ``s`` into token strings using the custom tokenizer.

    Side effect: the tokenizer of the module-level ``nlp`` pipeline is replaced
    on every call, so later ``nlp(...)`` calls use the custom tokenizer too.

    Args:
        s: the sentence to tokenize.

    Returns:
        A list with one string per token; a token whose text is exactly a
        newline or a tab is returned as an empty string.
    """
    nlp.tokenizer = custom_tokenizer(nlp)
    blanked = ['\n', '\t']
    tokens = []
    for token in nlp(s):
        tokens.append('' if token.text in blanked else token.text)
    return tokens

In [456]:
class CharCNN():
    """Character-level embedding model built on a Keras ``Embedding`` layer.

    Every character index is mapped to a single learned value. Words are
    encoded with ``word2number`` and padded to ``max_char_len`` positions.
    """

    def __init__(self, vocab, dataset, max_char_len=None):
        """Store the vocabulary and dataset; no Keras model is built yet.

        Args:
            vocab: sequence of known characters.
            dataset: padded character-index sequences (only stored).
            max_char_len: positions kept per word; when omitted, the
                notebook-level ``MAX_CHAR_LEN`` is used.
        """
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.ds = dataset
        self.max_char_len = MAX_CHAR_LEN if max_char_len is None else max_char_len
        # Defined up front so get_embedding can test it before build_model has
        # run instead of failing with AttributeError.
        self.model = None

    def build_model(self):
        """Create and compile the embedding model and store it in ``self.model``."""
        word_input = Input(shape=(1,), dtype='float64')
        # The table size is the notebook-level VOCAB_SIZE, not self.vocab_size.
        word_embedding = Embedding(input_dim=VOCAB_SIZE,
                                   output_dim=1,
                                   input_length=1)(word_input)
        word_vec = Flatten()(word_embedding)
        self.model = Model([word_input], word_vec)
        self.model.compile(optimizer=keras.optimizers.Adam(lr=1e-3),
                           loss='binary_crossentropy',
                           metrics=['acc'])

    def get_embedding(self, sent):
        """Encode every token of ``sent`` at character level.

        Args:
            sent: text to tokenize with the module-level ``nlp`` pipeline.

        Returns:
            When a model has been built: a list with one flattened prediction
            array per token, each padded sequence being passed to ``predict``
            on its own. Otherwise: the padded index sequences themselves.
        """
        words = [t.text for t in nlp(sent)]
        seqs = pad_sequences([[word2number(c) for c in w] for w in words],
                             maxlen=self.max_char_len, padding='post')
        if self.model is None:
            return seqs
        return [self.model.predict(s).flatten() for s in seqs]

    def load_model(self):
        """Placeholder: loading a saved model is not implemented."""
        pass

    def save_model(self):
        """Placeholder: saving the model is not implemented."""
        pass

In [396]:
# Character model created from the character vocabulary and the padded word dataset.
cnn = CharCNN(CHAR_VOCAB, words_dataset)

In [397]:
# Build and compile the Keras model held in cnn.model.
cnn.build_model()

Model: "model_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        (None, 1)                 0         
_________________________________________________________________
embedding_22 (Embedding)     (None, 1, 1)              140       
_________________________________________________________________
flatten_19 (Flatten)         (None, 1)                 0         
Total params: 140
Trainable params: 140
Non-trainable params: 0
_________________________________________________________________
None


In [400]:
# Smoke check: one array per token of the sentence (see CharCNN.get_embedding).
cnn.get_embedding("oi tudo bem")

[array([-0.01473595,  0.01067107, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411],
       dtype=float32),
 array([-0.01153847,  0.02133409,  0.03690178, -0.01473595, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.04404411, -0.04404411, -0.04404411, -0.04404411,
        -0.04404411, -0.044

In [370]:
# Shape of the flattened prediction for the first padded word; the recorded output is (50,).
cnn.model.predict(words_dataset[0]).flatten().shape

(50,)

In [454]:
# Scratch arithmetic (recorded result: 2580).
# NOTE(review): no later cell uses this value — consider removing the cell.
(50*50 + 50 + 30)

2580

Transformação em vetores

In [484]:
class VectorTransformer():
    """Turn labelled sentences into fixed-length sequences of token vectors.

    Each token vector is the concatenation of three parts, in this order:
    the general word embedding, the specific word embedding, and the
    character-level embedding of the token.
    """

    # Token appended to sentences shorter than ``max_sent_len``.
    PAD_TOKEN = '<pad>'

    def __init__(self, trainset, max_sent_len,
                 nlp_model=None, genemb=None, speemb=None, charemb=None):
        """Store the dataset and the models used to vectorize it.

        Args:
            trainset: DataFrame with the columns 'tipo' and 'frase'.
            max_sent_len: number of tokens every sentence is cut or padded to.
            nlp_model: callable that splits a sentence into tokens; when
                omitted, the spaCy pipeline "pt_core_news_lg" is loaded.
            genemb: general word vectors (supports ``emb[word]`` and
                ``emb.vector_size``); loaded from disk when omitted.
            speemb: specific word vectors, same contract; loaded from disk
                when omitted.
            charemb: object exposing ``get_embedding(word)``; when omitted, a
                ``CharCNN`` is created and built.

        Passing already-loaded models avoids reading them a second time.
        """
        self.ds = trainset
        self.max_sent_len = max_sent_len
        self.vecs = []

        if nlp_model is None:
            nlp_model = spacy.load("pt_core_news_lg")
        self.nlp = nlp_model

        if genemb is None:
            genemb = KeyedVectors.load_word2vec_format(
                os.path.join('data', 'wordemb', 'fasttext50.txt'))
        self.genemb = genemb

        if speemb is None:
            speemb = KeyedVectors.load_word2vec_format(
                os.path.join('data', 'wordemb', 'specific.txt'))
        self.speemb = speemb

        # No '<pad>' entry is written into the embedding vocabularies: storing a
        # raw array there does not create a usable entry, and the padding token
        # already resolves to a zero vector through the unknown-word fallback.

        if charemb is None:
            charemb = CharCNN(CHAR_VOCAB, words_dataset)
            charemb.build_model()
        self.charemb = charemb

    @staticmethod
    def _lookup(emb, word):
        """Return the vector of ``word`` in ``emb``, or zeros when it is unknown."""
        try:
            return emb[word]
        except KeyError:
            return np.zeros((emb.vector_size,))

    def get_vec(self, sent):
        """Vectorize one sentence.

        Args:
            sent: the sentence text.

        Returns:
            A list of exactly ``max_sent_len`` 1-D arrays, one per token.
            Longer sentences are truncated; shorter ones are padded with
            ``PAD_TOKEN``.
        """
        # Uses the models held by this instance rather than notebook globals.
        words = [str(tok) for tok in self.nlp(sent)]
        # Truncate to the maximum length (the previous slice kept every token).
        words = words[:self.max_sent_len]
        words += [self.PAD_TOKEN] * (self.max_sent_len - len(words))

        vecs = []
        for w in words:
            parts = [
                self._lookup(self.genemb, w),
                self._lookup(self.speemb, w),
                # Only the first token produced for `w` is used.
                self.charemb.get_embedding(w)[0],
            ]
            vecs.append(np.concatenate([np.ravel(p) for p in parts]))

        return vecs

    def set2vec(self):
        """Vectorize every row of the dataset into ``self.vecs``.

        ``self.vecs`` is rebuilt on each call, so running this twice does not
        duplicate rows. Each entry is a ``(tipo, array)`` pair.
        """
        self.vecs = [
            (tipo, np.array(self.get_vec(frase)))
            for tipo, frase in zip(self.ds['tipo'], self.ds['frase'])
        ]
            
    

In [485]:
# NOTE(review): the sentence length 23 is hard-coded; presumably it should equal
# MAX_LEN computed earlier — confirm.
v = VectorTransformer(trainset, 23)

In [486]:
# Fill v.vecs with one (tipo, feature array) pair per row of trainset.
v.set2vec()

In [488]:
# Shape of the first feature array; the recorded output is (23, 130).
v.vecs[0][1].shape

(23, 130)

In [452]:
# Smoke check: the per-token vectors produced for a short sentence.
v.get_vec("oi tudo bem")

[array([-0.165811  , -0.092822  , -0.13019   ,  0.54585803, -0.228778  ,
         0.41475901, -0.043485  , -0.28435999,  0.215877  , -0.053517  ,
        -0.056134  , -0.082983  ,  0.101563  , -0.47971299, -0.29444   ,
         0.121065  ,  0.25478199, -0.298601  , -0.67904598,  0.076564  ,
        -0.118904  ,  0.011322  , -0.52526802,  0.258212  , -0.033776  ,
        -0.41392499,  0.024514  ,  0.408362  , -0.076277  ,  0.15044101,
        -0.099392  , -0.213889  , -0.10139   , -0.25314   ,  0.100294  ,
        -0.05039   , -0.30548999,  0.146725  ,  0.092074  ,  0.233336  ,
         0.36060601, -0.133261  , -0.053818  ,  0.15826   , -0.227054  ,
         0.030388  ,  0.001501  , -0.375168  , -0.39585301,  0.07992   ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

Salvando para rede neural

In [489]:
# Integer id of each utterance class, assigned in the listed order.
CLASS_DICT = {
    name: idx
    for idx, name in enumerate(('GENERICA', 'DESPESAPESSOAL', 'DESPESAPUBLICA'))
}

In [490]:
# Swap each class name for its integer id, keeping the feature array unchanged.
vectors = [(CLASS_DICT[label], features) for label, features in v.vecs]

In [491]:
# Two-column frame: integer class id and the matching feature array.
df_dataset = pd.DataFrame(vectors, columns=['label', 'features'])

In [492]:
# Persist the dataset for the neural-network stage.
# os.path.join keeps the path valid on systems whose separator is not a backslash.
df_dataset.to_pickle(os.path.join('data', 'trainset', 'dataset.pkl'))