In [3]:
import numpy as np
import pandas as pd
import io
import re
from collections import Counter
import gc

import pickle
import nltk
import math
from sklearn.model_selection import train_test_split

In [5]:
with open('preprocessed_CETEN_v2.pkl', 'rb') as input:
    phrases = pickle.load(input)

#### Divide a base em treino e teste

In [6]:
phrases_train , phrases_test = train_test_split(phrases,test_size=0.2, random_state=42)

#### Converte qualquer palavra com freq < 5 para __RARE__

In [7]:
min_threshold = 5

word_counts = Counter()
for phrase in phrases_train:
    for word in phrase:
        word_counts[word[0]]+=1
        
word_counts = {word: count for word, count in word_counts.items() if word_counts[word] >= min_threshold}

In [8]:
phrases_train_rare = []
RARE_WORD = '__RARE__'
for phrase in phrases_train:
    phrases_train_rare.append([(w[0] if word_counts.get(w[0]) else RARE_WORD,w[1]) for w in phrase])

#### Contagem de palavras na base de treino

In [10]:
tags_counter = Counter()
for s in phrases_train_rare:
    for tk in s:
        tag = tk[1]
        if tags_counter[tag]:
            tags_counter[tag]=tags_counter[tag]+1
        else:
            tags_counter[tag] = 1

In [11]:
allowed_tags = tags_counter.keys()

#### Contagem de etiquetas na base de treino

In [13]:
word_freq = {}
for s in phrases_train_rare:
    for tk in s:
        if word_freq.get(tk[0]) == None:
            word_freq[tk[0]] = Counter()
        word_freq[tk[0]][tk[1]] = word_freq[tk[0]][tk[1]] + 1

#### Definição da matriz de emissão (python dict)

In [15]:
def get_emission_probs(word_freq, tags_counter):
    smooth_emission_not_seen = 0
    e = {}
    for word in word_freq.keys():
        for tag in tags_counter.keys():
            if tags_counter[tag] == 0:
                print(tag)
            e[(word,tag)] = float((word_freq[word].get(tag,smooth_emission_not_seen)+1)/float(tags_counter[tag]))
    return e
e_prob = get_emission_probs(word_freq, tags_counter)    

#### Definição da matriz de transição (python dict)

In [22]:
smooth_transition_not_seen = 0

def prep_sentence(phrase):
    return ['*'] + ['*'] + [word[1] for word in phrase] + ['STOP']
    
def get_transition_probs(phrases,word_feq,tags_counter):

    bigrams_c = Counter()
    for phrase in phrases:
        bigrams = nltk.bigrams(prep_sentence(phrase))
        for bigram in bigrams:
            bigrams_c[bigram] +=1

    trigrams_c = Counter()
    for phrase in phrases:
        trigrams = nltk.trigrams(prep_sentence(phrase))
        for trigram in trigrams:
            trigrams_c[trigram] +=1

    trigrams_p = {}
    for trigram, trigram_count in trigrams_c.items():
        bigram = (trigram[0],trigram[1])
        bigram_count = bigrams_c.get(bigram)
        if bigram_count:
            trigrams_p[trigram] = float(trigram_count)/float(bigram_count)
        else:
            trigrams_p[trigram] = 0
        
    return trigrams_p, bigrams_c
    
    
q_prob, bigrams_c = get_transition_probs(phrases,word_freq,tags_counter)

In [29]:
import timeit

In [49]:
start_time = timeit.default_timer()
c = 0
for w in word_freq.keys():
    for k in allowed_tags:
        #e_prob[(w,k)]
        c=c+1
print(c)
elapsed = timeit.default_timer() - start_time

elapsed

1408768


0.1935931120449652

In [64]:
def viterbi(sents, e_prob,q_prob,allowed_tags, tdef, tloop1, tloop2, ttag):
    
    start_time = timeit.default_timer()

    
    def S(k):
        if k == -1 or k == 0:
            return ['*']
        else:
            return allowed_tags
    pi = {}
    pi[(0, '*','*')] = 1

    bp = {}
    
    elapsed = timeit.default_timer() - start_time
    tdef=tdef+elapsed
    #print("Time definition: ", elapsed)

    tagged = []
    for sent in sents:
        c=0
        start_time = timeit.default_timer()
        for k in range(1, len(sent)+1):
            for u in S(k-1):
                for v in S(k):
                    max_p = -1
                    max_tag = None
                    for w in S(k-2):
                        c=c+1
                        #if bigrams_c[(w,u)] > 0: ### Se o bigram não existir na base de treino, não faz sentido testá-lo aqui 
                        prob = pi[(k-1,w,u)] * q_prob.get((w,u,v),0) * e_prob[(sent[k-1],v)]
                        #prob = 0
                        if (prob > max_p):
                            max_p = prob
                            max_tag = w
                    pi[(k,u,v)] = max_p
                    bp[(k,u,v)] = max_tag

        elapsed = timeit.default_timer() - start_time
        tloop1+=elapsed
        #print("First loop: ", elapsed)
        #print(c)

        start_time = timeit.default_timer()
        max_p = -1
        max_u_tag = None
        max_v_tag = None
        n = len(sent)
        for u in S(n-1):
            for v in S(n):
                #if bigrams_c[(u,v)] > 0:
                prob = pi[(n,u,v)] * q_prob.get((u,v,'STOP'),0)
                if (prob > max_p):
                    max_p = prob
                    max_u_tag = u
                    max_v_tag = v

        elapsed = timeit.default_timer() - start_time
        tloop2+=elapsed
        #print("Second loop: ", elapsed)

        start_time = timeit.default_timer()
        tags = []
        tags.append(max_v_tag)
        tags.append(max_u_tag)
        for i,k in enumerate(range(n-2,0, -1)):
            tags.append(bp[(k+2, tags[i+1], tags[i])])

        tags = list(reversed(tags))

        tagged_sentence = []
        for j in range(0, n):
            tagged_sentence.append((sent[j], tags[j]))

        elapsed = timeit.default_timer() - start_time
        ttag+=elapsed
        #print("Tags: ", elapsed)
        
        tagged.append(tagged_sentence)
    
    return tagged, tdef, tloop1, tloop2, ttag

In [None]:
tdef=0
tloop1=0
tloop2=0
ttag=0
corretas_baseline = 0
corretas_viterbi = 0
totais = 0

testwith = 10000
for s in phrases_test[:testwith]:
    sents_with_rare = []
    sents_with_rare.append([tk[0] if word_counts.get(tk[0]) != None else RARE_WORD for tk in s])
    preds,tdef, tloop1, tloop2, ttag = viterbi(sents_with_rare, e_prob, q_prob, allowed_tags, tdef, tloop1, tloop2, ttag)
    for tk_golden, tk_pred in zip(s,preds[0]):
        totais+=1
        if tk_golden[1] == word_freq[tk_pred[0]].most_common(1)[0][0]:
            corretas_baseline+=1
        if tk_golden[1] == tk_pred[1]:
            corretas_viterbi+=1

In [81]:
53/1000

0.053

In [85]:
len(phrases_test)*0.053/60/60

4.985253611111111

In [78]:
tdef, tloop1, tloop2, ttag

(0.0008938368179087774,
 53.887110182648826,
 0.128611268689383,
 0.017564173378048054)

In [86]:
corretas_viterbi, corretas_baseline, totais, corretas_viterbi/totais*100, corretas_baseline/totais*100

(15405, 15173, 15989, 96.3474888986178, 94.89649133779473)

In [74]:
tdef=0
tloop1=0
tloop2=0
ttag=0
corretas_baseline = 0
corretas_viterbi = 0
totais = 0
sents_with_rare = []

testwith = 1000
for s in phrases_test[:testwith]:
    sents_with_rare.append([tk[0] if word_counts.get(tk[0]) != None else RARE_WORD for tk in s])

preds,tdef, tloop1, tloop2, ttag = viterbi(sents_with_rare, e_prob, q_prob, allowed_tags, tdef, tloop1, tloop2, ttag)
for s_golden, s_pred in zip(phrases_test[:testwith],preds):
    for tk_golden, tk_pred in zip(s_golden,s_pred):
        totais+=1
        if tk_golden[1] == word_freq[tk_pred[0]].most_common(1)[0][0]:
            corretas_baseline+=1
        if tk_golden[1] == tk_pred[1]:
            corretas_viterbi+=1

In [75]:
corretas_viterbi, corretas_baseline, totais, corretas_viterbi/totais*100, corretas_baseline/totais*100

(15405, 15173, 15989, 96.3474888986178, 94.89649133779473)

In [76]:
tdef, tloop1, tloop2, ttag

(2.332559517981281e-06,
 52.36434300194827,
 0.12211555686576503,
 0.017630418069131792)

In [336]:
corretas_viterbi, corretas_baseline, totais, corretas_viterbi/totais*100, corretas_baseline/totais*100

(501, 504, 530, 94.52830188679245, 95.09433962264151)

In [320]:
tdef, tloop1, tloop2, ttag

(4.245258369905969e-05,
 25.797425600680697,
 0.02671713703517753,
 0.0009236935800345236)

In [303]:
corretas_viterbi, corretas_baseline, totais, corretas_viterbi/totais*100, corretas_baseline/totais*100

(501, 504, 530, 94.52830188679245, 95.09433962264151)

In [292]:
#phrase = 'ele caiu de a escada e morreu em a hora'.split(' ')
phrase = [tk[0] if word_counts.get(tk[0]) != None else RARE_WORD for tk in phrases_test[0]]
viterbi(phrase, e_prob, q_prob, allowed_tags)

TypeError: viterbi() missing 1 required positional argument: 'e_values'

In [290]:
from collections import defaultdict, deque

START_SYMBOL = '*'
STOP_SYMBOL = 'STOP'
RARE_SYMBOL = '__RARE__'

def viterbi(brown_dev_words, taglist, known_words, q_values, e_values):
    tagged = []

    # pi[(k, u, v)]: max probability of a tag sequence ending in tags u, v at position k
    # bp[(k, u, v)]: backpointers to recover the argmax of pi[(k, u, v)]
    pi = defaultdict(float)
    bp = {}

    # Initialization
    pi[(0, START_SYMBOL, START_SYMBOL)] = 1

    # Define tagsets S(k)
    def S(k):
        if k in (-1, 0):
            return {START_SYMBOL}
        else:
            return taglist

    # The Viterbi algorithm
    for sent_words_actual in brown_dev_words:
        sent_words = [word if word in known_words else RARE_SYMBOL for word in sent_words_actual]
        n = len(sent_words)
        for k in range(1, n+1):
            for u in S(k-1):
                for v in S(k):
                    max_score = -1
                    max_tag = None
                    for w in S(k - 2):
                        #if e_values.get((sent_words[k-1], v), 0) != 0:
                        score = pi[(k-1, w, u)] * \
                                q_values.get((w, u, v),0) * \
                                e_values[(sent_words[k-1], v)]
                        if score > max_score:
                            max_score = score
                            max_tag = w
                    pi[(k, u, v)] = max_score
                    bp[(k, u, v)] = max_tag

        max_score = -1
        u_max, v_max = None, None
        for u in S(n-1):
            for v in S(n):
                score = pi[(n, u, v)] * \
                        q_values.get((u, v, STOP_SYMBOL), 0)
                if score > max_score:
                    max_score = score
                    u_max = u
                    v_max = v

        tags = []
        tags.append(v_max)
        tags.append(u_max)

        for i, k in enumerate(range(n-2, 0, -1)):
            tags.append(bp[(k+2, tags[i+1], tags[i])])
        tags = list(reversed(tags))

        tagged_sentence = []
        for j in range(0, n):
            tagged_sentence.append((sent_words_actual[j] , tags[j]))
        tagged.append(tagged_sentence)
        

    return tagged

In [299]:
phr = [list(map(lambda x: x[0], ph)) for ph in phrases_test[:30]]
phrases_viterbi = viterbi(phr, allowed_tags, set(word_counts.keys()), q_prob, e_prob)

corretas = 0
totais = 0
for s_viterbi, s_gold in zip(phrases_viterbi, phrases_test[:30]):
    for tk_viterbi, tk_gold in zip(s_viterbi, s_gold):
        totais+=1
        if tk_gold[1] == tk_viterbi[1]:
            corretas+=1
corretas,totais

(501, 530)