In [1]:
from cherche import data, retrieve
from elasticsearch import Elasticsearch
from transformers import pipeline
import pandas as pd

from nltk.tokenize import word_tokenize
from string import punctuation
import nltk
from unicodedata import normalize
from nltk.stem import RSLPStemmer
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer


In [2]:
caminho_query = "dados-conle-anonimizado-assunto-notnull (1).csv"

# Base de dados câmara dos deputados

In [3]:
df_assunto= pd.read_csv(caminho_query, encoding='utf-8', delimiter=";")

In [4]:
arr_assunto = df_assunto.to_numpy()
y,X = arr_assunto[:,0],arr_assunto[:,1]
y = [i.strip() for i in y]

In [5]:
def verificar(y,top_n):
    for d in top_n:
        if y == d:
            return 1
    return 0

In [6]:
def avaliacaoRecall(isPreprocess):

    quant_encontrado=0
    quant_relevante =0
    for l,x in zip(y,X):

        tokenized_query3 = x                   
        if isPreprocess:
            tokenized_query3 = preprocess(x)
    
        scores = retriever(tokenized_query3)
        top_n = [d["name"] for d in scores] 

        
        quant_relevante+=1
        quant_encontrado+=verificar(l,top_n)
    
    recall = quant_encontrado / quant_relevante
    print("R@20: "+str(recall))


# Utilizando elasticsearch

In [7]:
es = Elasticsearch(hosts="localhost:9200", http_auth=('elastic', '_3egIk1UEsLOV4266NWo'))

In [8]:
k_teste =20
index_es = "experimento_word_n_gram"

# Pré processamento

## 1- Sem pré processamento

In [19]:
retriever = retrieve.Elastic(es=es,on=["content"],index=index_es,key="name",k=k_teste)

Recall

In [20]:
avaliacaoRecall(False)

R@20: 0.5423728813559322


## 2- Letra mínuscula

In [21]:
def preprocess(txt):
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = " ".join(terms)
    return terms

In [22]:
retriever = retrieve.Elastic(es=es,on=["pre_text_lowercase"],index=index_es,key="name",k=k_teste)

Recall

In [23]:
avaliacaoRecall(True)

R@20: 0.5288135593220339


## 3- Letra mínuscula + remoção de pontuação

In [24]:
def preprocess(txt):
    stopwords = list(punctuation)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    terms = " ".join(terms)
    return terms

In [25]:
retriever = retrieve.Elastic(es=es,on=["pre_text_lowercase_pontuacao"],index=index_es,key="name",k=k_teste)

Recall

In [26]:
avaliacaoRecall(True)

R@20: 0.5288135593220339


## 4- Letra mínuscula + remoção de pontuação e remoção de acentuação

In [30]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    terms = " ".join(terms)
    return terms

In [31]:
retriever = retrieve.Elastic(es=es,on=["pre_text_lowercase_pontuacao_acentuacao"],index=index_es,key="name",k=k_teste)

Recall

In [32]:
avaliacaoRecall(True)

R@20: 0.5932203389830508


## 5- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword


In [36]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    terms = " ".join(terms)
    return terms

In [37]:
retriever = retrieve.Elastic(es=es,on=["pre_text_lowercase_pontuacao_acentuacao_stopword"],index=index_es,key="name",k=k_teste)

Recall

In [38]:
avaliacaoRecall(True)

R@20: 0.5898305084745763


# Stemming

## 6- Stemming (RSLP)

In [23]:
def preprocess(txt):

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    terms = [stemmer.stem(word) for word in terms]
    terms = " ".join(terms)
    return terms

In [24]:
retriever = retrieve.Elastic(es=es,on=["text_rslp"],index=index_es,key="name",k=k_teste)

Recall

In [25]:
avaliacaoRecall(True)

R@20: 0.5627118644067797


## 7- Stemming (RSLP-S)

In [9]:
class RSLP_S:
    def __plural_reduction(self, word):
        excep = ["lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis" ]
        excep_s = ["aliás","pires","lápis","cais","mais","mas","menos", "férias","fezes","pêsames","crúcis","gás", "atrás","moisés","através","convés","ês", "país","após","ambas","ambos","messias"]

        len_word = len(word)
        new_word = list(word)

        if len_word >= 3:
            if new_word[-1] == 's' and new_word[-2] == 'n':
                new_word[-2] = 'm'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'õ':
                new_word[-3] = 'ã'
                new_word[-2] = 'o'
                sing = "".join(new_word)
                sing = sing[:-1]
                return  sing

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'ã':
                if word == 'mães':
                    word = word[:-1]
                    return word
                else:
                    new_word[-2] = 'o'
                    sing = "".join(new_word)
                    sing = sing[:-1]
                    return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'a':
                if word != 'cais' and word != 'mais':
                    new_word[-2] = 'l'
                    sing = "".join(new_word)
                    sing = sing[:-1]
                    return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'é':
                new_word[-3] = 'e'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'e':
                new_word[-3] = 'e'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'ó':
                new_word[-3] = 'o'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'i':
                if word not in excep:
                    new_word[-1] = 'l'
                    sing = "".join(new_word)
                    return sing

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'l':
                word = word[:-2]
                return word

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'r':
                word = word[:-2]
                return word

            if new_word[-1] == 's':
                if word not in excep_s:
                    word = word[:-1]

        return word

    def stem(self, word):
        word = self.__plural_reduction(word)

        return word

In [27]:
def preprocess(txt):

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    terms = [stemmer.stem(word) for word in terms]
    terms = " ".join(terms)
    return terms

In [28]:
retriever = retrieve.Elastic(es=es,on=["text_rslps"],index=index_es,key="name",k=k_teste)

Recall

In [29]:
avaliacaoRecall(True)

R@20: 0.5322033898305085


## 8- Stemming (Savoy)

In [12]:
class Savoy:

    def __removeAllPTAccent(self, old_word):
        word = list(old_word)
        len_word = len(word)-1
        for i in range(len_word, -1, -1):
            if word[i] == 'ä':
                word[i] = 'a'
            if word[i] == 'â':
                word[i] = 'a'
            if word[i] == 'à':
                word[i] = 'a'
            if word[i] == 'á':
                word[i] = 'a'
            if word[i] == 'ã':
                word[i] = 'a'
            if word[i] == 'ê':
                word[i] = 'e'
            if word[i] == 'é':
                word[i] = 'e'
            if word[i] == 'è':
                word[i] = 'e'
            if word[i] == 'ë':
                word[i] = 'e'
            if word[i] == 'ï':
                word[i] = 'i'
            if word[i] == 'î':
                word[i] = 'i'
            if word[i] == 'ì':
                word[i] = 'i'
            if word[i] == 'í':
                word[i] = 'i'
            if word[i] == 'ü':
                word[i] = 'u'
            if word[i] == 'ú':
                word[i] = 'u'
            if word[i] == 'ù':
                word[i] = 'u'
            if word[i] == 'û':
                word[i] = 'u'
            if word[i] == 'ô':
                word[i] = 'o'
            if word[i] == 'ö':
                word[i] = 'o'
            if word[i] == 'ó':
                word[i] = 'o'
            if word[i] == 'ò':
                word[i] = 'o'
            if word[i] == 'õ':
                word[i] = 'o'
            if word[i] == 'ç':
                word[i] = 'c'

        new_word = "".join(word)
        return new_word

    def __finalVowelPortuguese(self, word):
        len_word = len(word)
        if len_word > 3:
            if word[-1] == 'e' or word[-1] == 'a' or word[-1] == 'o':
                word = word[:-1]

        return word

    def __remove_PTsuffix(self, word):
        len_word = len(word)

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'e' and (word[-3] == 'r' or word[-3] == 's' or word[-3] == 'z' or word[-3] == 'l'):
                word = word[:-2]
                return word
        if len_word > 2:
            if word[-1] == 's' and word[-2] == 'n':
                new_word = list(word)
                new_word[-2] = 'm'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if (word[-1] == 's' and word[-2] == 'i') and (word[-3] == 'e' or word[-3] == 'é'):
                new_word = list(word)
                new_word[-3] = 'e'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'i' and word[-3] == 'a':
                new_word = list(word)
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'i' and word[-3] == 'ó':
                new_word = list(word)
                new_word[-3] = 'o'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'i':
                new_word = list(word)
                new_word[-1] = 'l'
                sing = "".join(new_word)
                return sing

        if len_word > 2:
            if word[-1] == 's' and word[-2] == 'e' and word[-3] == 'õ':
                new_word = list(word)
                new_word[-3] = 'ã'
                new_word[-2] = 'o'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing
            if word[-1] == 's' and word[-2] == 'e' and word[-3] == 'ã':
                new_word = list(word)
                new_word[-2] = 'o'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 5:
            if word[-1] == 'e' and word[-2] == 't' and word[-3] == 'n' and word[-4] == 'e' and word[-5] == 'm':
                word = word[:-5]
                return word

        if len_word > 2:
            if word[-1] == 's':
                word = word[:-1]

        return word

    def __normFemininPortuguese(self, word):

        len_word = len(word)

        if len_word < 3 or word[-1] != 'a':
            return word

        if len_word > 6:

            if word[-2] == 'h' and word[-3] == 'n' and word[-4] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'c' and word[-3] == 'a' and word[-4] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'r' and word[-3] == 'i' and word[-4] == 'e':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

        if len_word > 5:
            if word[-2] == 'n' and word[-3] == 'o':
                new_word = list(word)
                new_word[-3] = 'ã'
                new_word[-2] = 'o'
                masc = "".join(new_word)
                masc = masc[:-1]
                return masc

            if word[-2] == 'r' and word[-3] == 'o':
                word = word[:-1]
                return word

            if word[-2] == 's' and word[-3] == 'o':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 's' and word[-3] == 'e':
                new_word = list(word)
                new_word[-3] = 'ê'
                masc = "".join(new_word)
                masc = masc[:-1]
                return masc

            if word[-2] == 'c' and word[-3] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'd' and word[-3] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'd' and word[-3] == 'a':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'v' and word[-3] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'm' and word[-3] == 'a':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'n':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

        return word

    def stem(self, word):
        len_word = len(word)
        if len_word > 2:
            word = self.__remove_PTsuffix(word)
            word = self.__normFemininPortuguese(word)
            word = self.__finalVowelPortuguese(word)
            word = self.__removeAllPTAccent(word)

        return word


In [31]:
def preprocess(txt):

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    terms = [stemmer.stem(word) for word in terms]
    terms = " ".join(terms)
    return terms

In [32]:
retriever = retrieve.Elastic(es=es,on=["text_savoy"],index=index_es,key="name",k=k_teste)

Recall

In [33]:
avaliacaoRecall(True)

R@20: 0.5254237288135594


## 9- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword + stemming (RSLP)

In [34]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
    return " ".join(terms)

In [35]:
retriever = retrieve.Elastic(es=es,on=["pre_text_rslp"],index=index_es,key="name",k=k_teste)

Recall

In [36]:
avaliacaoRecall(True)

R@20: 0.6033898305084746


## 10- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword + stemming (RSLP-S)

In [37]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
    return " ".join(terms)

In [38]:
retriever = retrieve.Elastic(es=es,on=["pre_text_rslps"],index=index_es,key="name",k=k_teste)

Recall

In [39]:
avaliacaoRecall(True)

R@20: 0.5898305084745763


## 11- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword + stemming (Savoy)

In [40]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
    return " ".join(terms)

In [41]:
retriever = retrieve.Elastic(es=es,on=["pre_text_savoy"],index=index_es,key="name",k=k_teste)

Recall

In [42]:
avaliacaoRecall(True)

R@20: 0.6169491525423729


# Word n-gram

## 12- Bigram

In [15]:
def preprocess(txt):

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    ngram = []
    
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [16]:
retriever = retrieve.Elastic(es=es,on=["text_bigram"],index=index_es,key="name",k=k_teste)

Recall

In [17]:
avaliacaoRecall(True)

R@20: 0.5016949152542373


## 13- Trigram

In [18]:
def preprocess(txt):

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    ngram = []
    
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [19]:
retriever = retrieve.Elastic(es=es,on=["text_trigram"],index=index_es,key="name",k=k_teste)

Recall

In [20]:
avaliacaoRecall(True)

R@20: 0.48135593220338985


## 14- Unigram + Bigram

In [21]:
def preprocess(txt):
    
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [22]:
retriever = retrieve.Elastic(es=es,on=["text_uni_bi"],index=index_es,key="name",k=k_teste)

Recall

In [23]:
avaliacaoRecall(True)

R@20: 0.48135593220338985


# Word n-gram + pré processamento básico

## 15- Letra mínuscula + remoção de pontuação, acentuação e stopword + bigram

In [9]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [10]:
retriever = retrieve.Elastic(es=es,on=["pre_text_bigram"],index=index_es,key="name",k=k_teste)

Recall

In [11]:
avaliacaoRecall(True)

R@20: 0.5627118644067797


## 16- Letra mínuscula + remoção de pontuação, acentuação e stopword + trigram

In [12]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [13]:
retriever = retrieve.Elastic(es=es,on=["pre_text_trigram"],index=index_es,key="name",k=k_teste)

Recall

In [14]:
avaliacaoRecall(True)

R@20: 0.5389830508474577


## 17- Letra mínuscula + remoção de pontuação, acentuação e stopword + unigram + bigram

In [15]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [16]:
retriever = retrieve.Elastic(es=es,on=["pre_text_uni_bi"],index=index_es,key="name",k=k_teste)

Recall

In [17]:
avaliacaoRecall(True)

R@20: 0.5389830508474577


# Word n-gram + pré processamento básico + RSLP

## 18- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP) + bigram

In [9]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [10]:
retriever = retrieve.Elastic(es=es,on=["pre_text_bigram_rslp"],index=index_es,key="name",k=k_teste)

Recall

In [11]:
avaliacaoRecall(True)

R@20: 0.5491525423728814


## 19- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP) + trigram

In [12]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [13]:
retriever = retrieve.Elastic(es=es,on=["pre_text_trigram_rslp"],index=index_es,key="name",k=k_teste)

Recall

In [14]:
avaliacaoRecall(True)

R@20: 0.5186440677966102


## 20- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP) + unigram + bigram

In [15]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [16]:
retriever = retrieve.Elastic(es=es,on=["pre_text_uni_bi_rslp"],index=index_es,key="name",k=k_teste)

Recall

In [17]:
avaliacaoRecall(True)

R@20: 0.535593220338983


# Word n-gram + pré processamento básico + RSLP-S

## 21- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP-S) + bigram

In [10]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [11]:
retriever = retrieve.Elastic(es=es,on=["pre_text_bigram_rslps"],index=index_es,key="name",k=k_teste)

Recall

In [13]:
avaliacaoRecall(True)

R@20: 0.5559322033898305


## 22- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP-S) + trigram

In [14]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [15]:
retriever = retrieve.Elastic(es=es,on=["pre_text_trigram_rslps"],index=index_es,key="name",k=k_teste)

Recall

In [16]:
avaliacaoRecall(True)

R@20: 0.5423728813559322


## 23- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP-S) + unigram + bigram

In [17]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [18]:
retriever = retrieve.Elastic(es=es,on=["pre_text_uni_bi_rslps"],index=index_es,key="name",k=k_teste)

Recall

In [19]:
avaliacaoRecall(True)

R@20: 0.5423728813559322


# Word n-gram + pré processamento básico + Savoy

## 24- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (Savoy) + bigram

In [9]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [10]:
retriever = retrieve.Elastic(es=es,on=["pre_text_bigram_savoy"],index=index_es,key="name",k=k_teste)

Recall

In [13]:
avaliacaoRecall(True)

R@20: 0.5796610169491525


## 25- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (Savoy) + trigram

In [14]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [15]:
retriever = retrieve.Elastic(es=es,on=["pre_text_trigram_savoy"],index=index_es,key="name",k=k_teste)

Recall

In [16]:
avaliacaoRecall(True)

R@20: 0.5694915254237288


## 26- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (Savoy) + unigram + bigram

In [17]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [18]:
retriever = retrieve.Elastic(es=es,on=["pre_text_uni_bi_savoy"],index=index_es,key="name",k=k_teste)

Recall

In [19]:
avaliacaoRecall(True)

R@20: 0.5864406779661017
