In [1]:
import nltk
import os

In [2]:
from nltk.corpus import machado, mac_morpho
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
import string
from collections import defaultdict
from nltk.stem.snowball import PortugueseStemmer
import enchant

In [3]:
nltk.download('stopwords')
nltk.download('machado')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/guilherme/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package machado to
[nltk_data]     /home/guilherme/nltk_data...
[nltk_data]   Package machado is already up-to-date!


True

In [4]:
textos = []
for p, d, f in os.walk(r'machado/machado'):
    #print( p,d,f)
    if f:
        for fileid  in f:
            if not fileid.endswith('.txt'):
                continue
            with open(os.path.join(p,fileid), encoding='iso-8859-1') as g:
                textos.append(g.read())

In [5]:
swu = stopwords.words('portuguese') + list (string.punctuation)
stemmer = PortugueseStemmer()

In [6]:
#textos = textos[:4]

## Questão 4

In [7]:
def clean_sentence(texto : str):
    return [stemmer.stem(token.lower()) for token in WordPunctTokenizer().tokenize(texto) if token not in swu]


def get_textos_limpos():
    textos_limpos = []
    for texto in textos:
        tlimpo = clean_sentence(texto)
        textos_limpos.append(tlimpo)
    
    indice = defaultdict(lambda:set([]))
    for tid,t in enumerate(textos_limpos):
        for term in t:
            indice[term].add(tid)
    return textos_limpos

def get_indice_invertido():
    textos_limpos = get_textos_limpos()
    words_positions = {}
    for i, sentence in enumerate(textos_limpos):
        for j, word in enumerate(sentence):
            if not word in words_positions:
                words_positions[word] = {}
            word_d = words_positions[word]
            if not i in word_d:
                word_d[i] = set()
            word_d[i].add(j)
    
    return words_positions

indice_invertido = get_indice_invertido()

def get_documents(sentence):
    sentence = clean_sentence(sentence)
    if not sentence:
        return {}
    def in_documento(doc_p):
        def in_start(start):
            for i, word in enumerate(sentence,start):
                if i not in indice_invertido[word][doc_p]:
                    return False
            return True
        v_pos = set()
        for start in indice_invertido.get(sentence[0],{}).get(doc_p,[]):
            if in_start(start):
                v_pos.add(start)
        return v_pos
    docs = {}
    for doc in indice_invertido.get(sentence[0], []):
        docs[doc] = in_documento(doc)
    
    return docs

get_documents('crítric raimund')

{0: {0}}

## Questão 1

In [8]:
def get_frequency(textos, word):
    return sum(word in texto for texto in textos)

get_frequency(get_textos_limpos(), 'critic')

21

In [9]:
def get_good_frequency(word):
    return get_frequency(get_textos_limpos(), clean_sentence(word)[0])

get_good_frequency('publicado')

244

In [10]:
def get_bad_frequency(word : str) -> int:
    bad_text = [[token.lower() for token in WordPunctTokenizer().tokenize(texto)] for texto in textos]
    return get_frequency(bad_text, word)

get_bad_frequency('publicado')

233

## Questão 5

In [11]:
def get_documents_loose(sentence):
    sentence = clean_sentence(sentence)
    if not sentence:
        return {}
    def in_documento(doc_p):
        def in_start(start):
            last_position = [start]
            for i, word in enumerate(sentence, last_position[0]):
                def is_word():
                    for position in range(i,i+4):
                        if position in indice_invertido[word][doc_p]:
                            last_position[0] = position
                            return True
                    return False
                if not is_word():
                    return False
            return True
                
        v_pos = set()
        for start in indice_invertido.get(sentence[0],{}).get(doc_p,[]):
            if in_start(start):
                v_pos.add(start)
        return v_pos
    docs = {}
    for doc in indice_invertido[sentence[0]]:
        docs[doc] = in_documento(doc)
    
    return docs

get_documents_loose('crítric sinfon')

{0: {0}}

## Questão 3

In [12]:
def get_document_from_wrong(word):
    d = enchant.Dict("pt_BRL")
    vword = d.suggest(word) or [word]
    word = vword[0]
    return get_documents(word)

get_document_from_wrong('criticarr')

{3: {325, 395},
 10: {1775, 6235},
 26: {8320},
 31: {13490},
 43: {1897},
 84: {1134},
 116: {23608},
 149: {492},
 181: {5977},
 192: {123},
 199: {9289},
 201: {5985},
 204: {11514, 39734, 39739},
 205: {4232},
 209: {3482},
 225: {1878},
 228: {1754},
 230: {23360, 32551, 36511, 39741, 39972, 46954, 47405, 47648},
 232: {16106, 90084, 94966, 111437, 142999, 160411},
 235: {19231},
 239: {3201}}

## Questão 2

In [13]:
def get_documents_union(words):
    docs = set()
    for word in words.split(' '):
        bad_text = [[token.lower() for token in WordPunctTokenizer().tokenize(texto)] for texto in textos]
        for i, text in enumerate(bad_text):
            if word in text:
                docs.add(i)
    return docs

get_documents_union('crítrica criticar')

{0, 3, 26, 31, 43, 116, 199, 205, 228, 232}

In [14]:
def get_documents_from_clean(word):
    return set(get_indice_invertido().get(word, []))

get_documents_from_clean('crític')

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 52,
 62,
 66,
 105,
 110,
 176,
 183,
 186,
 187,
 190,
 196,
 199,
 200,
 201,
 202,
 206,
 207,
 209,
 210,
 213,
 216,
 225,
 226,
 227,
 228,
 230,
 232,
 235,
 236,
 239,
 241}

In [15]:
def get_dif_clean_from_class(word, class_words):
    return len(get_documents_from_clean(word))-len(get_documents_union(class_words))

get_dif_clean_from_class('critic', 'crítrica criticar')

11