### Imports

In [1]:
import numpy as np
import pandas as pd
import io
import re
from collections import Counter
import gc

### Carregar base original

In [2]:
# Open the CETENFolha corpus file in text mode, decoding it as UTF-8.
# The handle is kept in `file` so that a later cell can read from it.
file = io.open(
    "CETENFolha-1.0_jan2014.cg",
    mode="r",
    encoding="utf-8",
)

In [3]:
# Load the whole corpus into memory as a single string. The handle held in
# `file` is closed as soon as the read finishes (or fails), so the descriptor
# is not kept open for the rest of the session. To re-run this cell, re-run
# the cell that opens `file` first.
try:
    file.seek(0)  # rewind so the read always starts at the beginning of the file
    corpus = file.read()
finally:
    file.close()

### Preprocessamento do corpus

In [4]:
def clean_and_split(corpus):
    aux = re.sub('<.*?>', '', corpus)
    aux = re.sub('<.*?>', '', aux)
    aux = re.sub('\[.*?\]', '', aux)
    aux = re.sub('  +', ' ', aux)
    aux = re.sub('\n\n+','\n\n', aux)
    return aux.split("\n\n")
    
# Clean the raw corpus text and keep the resulting list of blank-line-separated blocks.
sentences = clean_and_split(corpus)

In [5]:
# Release the raw corpus string; the cells below work from `sentences` instead.
del corpus

### Seleção das tuplas (palavra, etiqueta)

In [10]:
# Placeholder tag given to tokens whose line has no tab-separated analysis
# field (presumably punctuation lines -- confirm against the corpus format).
PUNCT_TAG = "__PUNCT__"

In [13]:
def _parse_token(line):
    """Turn one corpus line into a ``(word, tag)`` tuple.

    A line containing a tab is read as ``word<TAB>analysis``: the word is
    lower-cased and stripped, and the tag is the space-separated field at
    index 1 of the analysis (this assumes the cleaned analysis starts with a
    space, so index 0 is empty -- TODO confirm against the corpus format).

    A line without a tab is kept verbatim up to its first space and tagged
    with ``PUNCT_TAG``.
    """
    fields = line.split("\t")  # split once per line instead of three times
    if len(fields) > 1:
        return fields[0].lower().strip(), fields[1].split(" ")[1]
    return line.split(" ")[0], PUNCT_TAG


# One list of (word, tag) tuples per non-empty sentence block.
phrases = [
    [_parse_token(line) for line in sentence.split("\n")]
    for sentence in sentences
    if sentence  # skip empty blocks left over from the blank-line split
]

#### Remove sentenças que possuem etiquetas fora do padrão

In [15]:
# Sentence count before the tag-based filtering applied further down.
len(phrases)

1694570

In [11]:
### Remove qualquer sentença que possua uma etiqueta que não se encaixe nessa expressão regular
#tag_pattern = re.compile("^[A-Z]+$|^\$.*$")
#phrases = (list(filter(lambda s: all(tag_pattern.match(tk[1]) != None for tk in s), phrases)))

In [18]:
# Drop every sentence that contains a tag outside the allowed list.
# Earlier variant that listed each punctuation tag individually (kept for reference):
#allowed_tags = ['N','DET','PRP','V','PROP','$,','ADJ','$.','ADV','NUM','KC','$"','SPEC','PERS','KS','$)','$(','$:','$--','$?','$;',"$'",'$!','IN','$...','EC','$pause','$`','$+','$=','PRON','$$','$]','$[','PP','$±','$~','GER','PU','$|','M','$_']
allowed_tags = ['N','DET','PRP','V','PROP','ADJ','ADV','NUM','KC','SPEC','PERS','KS','IN','EC','PRON',PUNCT_TAG]
# Membership is checked once per token, so use a frozenset for the lookup
# instead of scanning the list; `allowed_tags` itself stays a list.
allowed_tag_set = frozenset(allowed_tags)
phrases = [
    sentence
    for sentence in phrases
    if all(tk[1] in allowed_tag_set for tk in sentence)
]

In [19]:
# Sentence count after dropping the sentences that contain disallowed tags.
len(phrases)

1693101

In [20]:
# Sentences lost to the tag filter. The two literals are the sentence counts
# displayed before and after filtering; they are hard-coded, so update them
# if those counts change.
1694570 - 1693101

1469

#### =====> Perda de 1469 sentenças

### Computar conjunto de etiquetas

In [21]:
# Count how often each tag occurs over all tokens of all sentences.
# Counter already treats missing keys as 0, so no explicit "first time seen"
# branch is needed; feeding it the tag stream gives the same counts.
tags_counter = Counter(tk[1] for s in phrases for tk in s)

In [22]:
# Show all tags with their counts, most frequent first.
tags_counter.most_common()

[('N', 5325018),
 ('DET', 4203531),
 ('PRP', 4095498),
 ('__PUNCT__', 3936438),
 ('V', 3361824),
 ('PROP', 1835949),
 ('ADJ', 1360182),
 ('ADV', 1053826),
 ('NUM', 764485),
 ('KC', 636563),
 ('SPEC', 334770),
 ('PERS', 326286),
 ('KS', 295857),
 ('IN', 8289),
 ('EC', 1695),
 ('PRON', 83)]

### Computar frequência de etiquetas para cada palavra

In [23]:
# Map each word to a Counter of the tags it was seen with.
# A plain dict is kept (rather than a defaultdict) so that later lookups of
# unknown words do not silently create entries.
word_freq = {}
for sentence in phrases:
    for word, tag in sentence:
        tag_counts = word_freq.get(word)
        if tag_counts is None:
            # First occurrence of this word: create its counter once and reuse it.
            tag_counts = word_freq[word] = Counter()
        tag_counts[tag] += 1

In [30]:
# Number of distinct words that have a tag-frequency entry.
len(word_freq)

450013

### Baseline: Etiqueta por Tag mais frequente

In [31]:
# Baseline tagger: predict, for every token, the tag most often seen with its word.
# NOTE: the counts in `word_freq` come from the same `phrases` being scored
# here, so this is accuracy on the data the counts were built from, not on
# held-out data.

# Resolve the most frequent tag once per word instead of once per token.
most_frequent_tag = {
    word: tag_counts.most_common(1)[0][0]
    for word, tag_counts in word_freq.items()
    if tag_counts
}

corretas = 0  # tokens whose tag matches the baseline prediction
totais = 0    # all tokens
for sentence in phrases:
    totais += len(sentence)
    corretas += sum(1 for tk in sentence if tk[1] == most_frequent_tag[tk[0]])

In [32]:
# Report the baseline result as "correct/total: percentage%".
print("Acurácia: ", corretas, "/", totais, ": ", corretas/totais*100.0, "%", sep="")

Acurácia: 26311801/27540294: 95.53928872364253%


### Salva as sentenças processadas e descarta as contagens


In [33]:
import pickle

In [35]:
# Serialize `phrases` to disk using the highest pickle protocol available.
with open('preprocessed_CETEN_v2.pkl', 'wb') as pkl_file:
    pickle.dump(phrases, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

In [36]:
# Free the tag counts; no later cell shown here uses them.
del tags_counter

In [37]:
# Free the per-word tag counters; no later cell shown here uses them.
del word_freq