In [33]:
import os, glob, json, spacy, pickle
import random

In [64]:
class TemplateGenerator():
    """Build synthetic utterances and NER training examples from templates.

    Every template is a sentence containing the ``{ENTIDADE}`` placeholder,
    which is substituted with terms read from the term lists. Results are
    kept in ``self.utterances`` (plain sentences per category) and
    ``self.train_data`` (``(text, {'entities': [(start, end, label)]})``
    tuples).
    """

    PLACEHOLDER = '{ENTIDADE}'
    LABEL = "CONTROLEEXTERNO"

    def __init__(self, data_dir=r'../data/interim'):
        """Load the templates and the term lists.

        Args:
            data_dir: Directory containing ``templates.json``,
                ``despesapessoal.txt``, ``despesapublica.txt`` and
                ``substantivos.txt``. Defaults to the location that used to
                be hard-coded.
        """
        # Context managers close the handles; a bare open() left them dangling.
        with open(os.path.join(data_dir, 'templates.json'), 'r', encoding='utf-8') as f:
            self.templates = json.load(f)

        self.despesa_pessoal = self._read_terms(
            os.path.join(data_dir, 'despesapessoal.txt'), 'utf-8')
        self.despesa_publica = self._read_terms(
            os.path.join(data_dir, 'despesapublica.txt'), 'utf-8')

        # Generic nouns must not overlap with either expense list. Sorting
        # gives a stable order so a seeded generate() is reproducible.
        nouns = self._read_terms(os.path.join(data_dir, 'substantivos.txt'), 'latin9')
        self.substantivos = sorted(nouns - self.despesa_pessoal - self.despesa_publica)

        self.utterances = {}
        self.utterances['DESPESAPESSOAL'] = []
        self.utterances['DESPESAPUBLICA'] = []
        self.utterances['GENERICA'] = []

    @staticmethod
    def _read_terms(path, encoding):
        """Return the set of non-blank lines of ``path`` without their newline."""
        with open(path, 'r', encoding=encoding) as f:
            # Lines still carry their '\n' here, so a plain truthiness test
            # would keep blank lines and yield an empty-string term.
            return {line.rstrip('\n') for line in f if line.strip()}

    @classmethod
    def _render(cls, template, entity):
        """Substitute ``entity`` into ``template`` and locate it.

        Returns:
            ``(text, spans)`` where ``text`` equals
            ``template.replace(PLACEHOLDER, entity)`` and ``spans`` holds one
            ``(start, end)`` character offset pair per placeholder occurrence.
            ``spans`` is empty when the template has no placeholder.
        """
        head, *tails = template.split(cls.PLACEHOLDER)
        pieces = [head]
        spans = []
        cursor = len(head)
        for tail in tails:
            spans.append((cursor, cursor + len(entity)))
            pieces.append(entity)
            pieces.append(tail)
            cursor += len(entity) + len(tail)
        return ''.join(pieces), spans

    def generate(self, seed=None):
        """(Re)build ``self.utterances`` for the three categories.

        The lists are rebuilt from scratch, so calling this more than once
        (e.g. re-running a notebook cell) does not duplicate utterances.

        Args:
            seed: Optional seed for the random choice of GENERICA sentences.
                ``None`` (default) uses the global ``random`` state.
        """
        self.tpessoal = self.templates['DESPESAPESSOAL']
        self.tpublica = self.templates['DESPESAPUBLICA']

        pessoal = [t.replace(self.PLACEHOLDER, d)
                   for t in self.tpessoal
                   for d in sorted(self.despesa_pessoal)]
        publica = [t.replace(self.PLACEHOLDER, d)
                   for t in self.tpublica
                   for d in sorted(self.despesa_publica)]

        # As many generic sentences as the larger of the two specific classes.
        generica = []
        tgen = list(self.tpessoal) + list(self.tpublica)
        if tgen and self.substantivos:  # random.choice raises on empty input
            rng = random if seed is None else random.Random(seed)
            for _ in range(max(len(publica), len(pessoal))):
                t = rng.choice(tgen)
                s = rng.choice(self.substantivos)
                generica.append(t.replace(self.PLACEHOLDER, s))

        self.utterances['DESPESAPESSOAL'] = pessoal
        self.utterances['DESPESAPUBLICA'] = publica
        self.utterances['GENERICA'] = generica

    def save(self, path=r'../data/interim/utterances.json'):
        """Write ``self.utterances`` to ``path`` as indented UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as fp:
            fp.write(json.dumps(self.utterances, indent=2, ensure_ascii=False))

    def save_train_data(self, path=r'../data/interim/synth_train_data.pkl'):
        """Build ``self.train_data`` and pickle it to ``path``.

        Each example is ``(text, {'entities': [(start, end, LABEL), ...]})``.
        Offsets come from the placeholder position in the template, not from
        searching the rendered text, so they stay correct when the template
        wording happens to contain the entity string. A template with several
        placeholders yields one span per occurrence.
        """
        self.tpessoal = self.templates['DESPESAPESSOAL']
        self.tpublica = self.templates['DESPESAPUBLICA']

        self.train_data = []
        sources = (
            (self.tpessoal, self.despesa_pessoal),
            (self.tpublica, self.despesa_publica),
        )
        for templates, terms in sources:
            for t in templates:
                for d in sorted(terms):
                    cnt, spans = self._render(t, d)
                    edict = {'entities': [(start, end, self.LABEL) for start, end in spans]}
                    self.train_data.append((cnt, edict))

        with open(path, 'wb') as fp:
            pickle.dump(self.train_data, fp)
       
        

In [65]:
# Instantiate the generator; the constructor reads the templates and the
# term lists from ../data/interim.
t = TemplateGenerator()

In [66]:
# Fill t.utterances with the DESPESAPESSOAL, DESPESAPUBLICA and GENERICA sentences.
t.generate()

In [67]:
# Build the annotated (text, entities) examples and pickle them to
# ../data/interim/synth_train_data.pkl.
t.save_train_data()

In [195]:
# Write t.utterances as JSON to ../data/interim/utterances.json.
t.save()

Geração do embedding específico

In [196]:
# Load both gazetteers as sets of terms, one term per line.
# Forward slashes keep the relative paths valid on every OS (backslash
# separators only resolve on Windows), and blank lines are skipped so the
# empty string never becomes a gazetteer entry.
with open('data/gazetteers/despesapessoal.txt', 'r', encoding='utf-8') as f:
    despesa_pessoal = {line.rstrip('\n') for line in f if line.strip()}
with open('data/gazetteers/despesapublica.txt', 'r', encoding='utf-8') as f:
    despesa_publica = {line.rstrip('\n') for line in f if line.strip()}

In [200]:
# Single list with every gazetteer term: personal-expense terms first,
# then public-expense terms.
gazetteers = [*despesa_pessoal, *despesa_publica]

In [202]:
# Display the pipeline object (the output below shows a spaCy Portuguese instance).
# NOTE(review): `nlp` is not created in any cell shown here — confirm it is
# defined before this point when the notebook is run on a fresh kernel.
nlp

<spacy.lang.pt.Portuguese at 0x1eb68a2cb38>

In [204]:
# One "sentence" per gazetteer term: the string form of each token that
# nlp() yields for that term.
corpus = [list(map(str, nlp(term))) for term in gazetteers]

In [211]:
# Fit a Word2Vec model on the tokenised gazetteer terms.
# NOTE(review): `Word2Vec` is not imported in the cells shown here — presumably
# gensim's. Under that assumption sg=1 selects skip-gram, size=30 is the vector
# length (matches the 30-value output further down) and min_count=1 keeps every
# token; gensim >= 4.0 renamed `size` to `vector_size` — confirm the installed
# version before re-running.
model = Word2Vec(corpus, min_count=1, size=30, workers=3, window=2, sg=1)

In [213]:
# Sanity check: look up the learned vector for the token 'siai-dp'
# (the output below is a 30-element float32 array).
model.wv['siai-dp']

array([-8.6278040e-03, -5.4667559e-03, -2.3543884e-03,  1.3274187e-02,
        1.2416037e-03,  5.1690312e-03,  1.6006805e-02, -1.2449811e-02,
        2.3562612e-03,  1.6510904e-02,  7.8101526e-03, -1.3202942e-02,
        1.6252829e-02, -9.7082760e-03,  8.5476117e-05, -7.3867482e-03,
       -8.7993629e-03,  8.7538036e-03, -1.6177714e-02, -1.1851009e-02,
        1.5232809e-02, -1.0236542e-02,  1.2112031e-02, -7.2901375e-03,
       -4.4440348e-03,  7.0926524e-04,  1.6374322e-02,  8.4237223e-03,
        1.4730961e-02, -1.0998782e-02], dtype=float32)

In [227]:
# Export the trained vectors in word2vec format. Forward slashes keep the
# relative path valid on every OS; the backslash form only resolves on Windows.
model.wv.save_word2vec_format('data/wordemb/specific.txt')