In [1]:
from pprint import pprint
from typing import Dict, Union, Any, List, Optional, Tuple

from haystack import MultiLabel, Document
from haystack.pipelines import Pipeline
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import BM25Retriever
from haystack.nodes.base import BaseComponent
import pandas as pd

from nltk.tokenize import word_tokenize
from string import punctuation
import nltk
from unicodedata import normalize
from nltk.stem import RSLPStemmer
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer


In [2]:
caminho_query = "dados-conle-anonimizado-assunto-notnull (1).csv"

In [3]:
index_nome = "experimento_word_n_gram"

# Base de dados câmara dos deputados

In [4]:
df_assunto= pd.read_csv(caminho_query, encoding='utf-8', delimiter=";")

In [5]:
arr_assunto = df_assunto.to_numpy()
y,X = arr_assunto[:,0],arr_assunto[:,1]
y = [i.strip() for i in y]

In [6]:
def verificar(y,top_n):
    if y in top_n:
        return 1
    return 0

In [7]:
def avaliacaoRecall(isPreprocess):
    
    
    quant_encontrado=0
    quant_relevante =0
    for l,x in zip(y,X):
        
        tokenized_query3 = x                   
        if isPreprocess:
            tokenized_query3 = preprocess(x)                   
    
    
        top_n_stem_l = pipeline.run(query=tokenized_query3,params={"Retriever": {"top_k": 20}})
    

        top_n = [top_n_stem_l['documents'][d].meta['name'].strip() for d in range(len(top_n_stem_l['documents']))]              #L
    
        quant_relevante+=1
        quant_encontrado+=verificar(l,top_n)
    
    recall = quant_encontrado / quant_relevante
    print("R@20: "+str(recall))
    


# Pré processamento

## 1- Sem pré processamento

In [8]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["content"])

In [9]:
retriever = BM25Retriever(document_store=document_store)

In [10]:
pipeline = Pipeline()

In [11]:
pipeline.add_node(component=retriever, name="Retriever",inputs=['Query'])

Recall

In [12]:
avaliacaoRecall(False)

R@20: 0.5423728813559322


## 2- Letra mínuscula

In [13]:
def preprocess(txt):
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = " ".join(terms)
    return terms

In [14]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def preprocess(self,txt):
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [15]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_lowercase"])

In [16]:
retriever = BM25Retriever(document_store=document_store)

In [17]:
pre = PreProcessamento()

In [18]:
pipeline = Pipeline()

In [19]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [20]:
avaliacaoRecall(True)

R@20: 0.5288135593220339


## 3- Letra mínuscula + remoção de pontuação

In [21]:
def preprocess(txt):
    stopwords = list(punctuation)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    terms = " ".join(terms)
    return terms

In [22]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def preprocess(self,txt):
        stopwords = list(punctuation)
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [word for word in terms if word not in stopwords]
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [23]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_lowercase_pontuacao"])

In [24]:
retriever = BM25Retriever(document_store=document_store)

In [25]:
pre = PreProcessamento()

In [26]:
pipeline = Pipeline()

In [27]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [28]:
avaliacaoRecall(True)

R@20: 0.5288135593220339


## 4- Letra mínuscula + remoção de pontuação e remoção de acentuação

In [37]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    terms = " ".join(terms)
    return terms

In [38]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [word for word in terms if word not in stopwords]
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [39]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_lowercase_pontuacao_acentuacao"])

In [40]:
retriever = BM25Retriever(document_store=document_store)

In [41]:
pre = PreProcessamento()

In [42]:
pipeline = Pipeline()

In [43]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [44]:
avaliacaoRecall(True)

R@20: 0.5932203389830508


## 5- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword


In [53]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    terms = " ".join(terms)
    return terms

In [54]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = nltk.corpus.stopwords.words("portuguese")
        stopwords.extend(list(punctuation))
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [word for word in terms if word not in stopwords]
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [55]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_lowercase_pontuacao_acentuacao_stopword"])

In [56]:
retriever = BM25Retriever(document_store=document_store)

In [57]:
pre = PreProcessamento()

In [58]:
pipeline = Pipeline()

In [59]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [60]:
avaliacaoRecall(True)

R@20: 0.5898305084745763


# Stemming

## 6- Stemming (RSLP)

In [8]:
def preprocess(txt):

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    terms = [stemmer.stem(word) for word in terms]
    terms = " ".join(terms)
    return terms

In [9]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def preprocess(self,txt):

        stemmer = RSLPStemmer()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt)
        terms = [stemmer.stem(word) for word in terms]
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [10]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["text_rslp"])

In [11]:
retriever = BM25Retriever(document_store=document_store)

In [12]:
pre = PreProcessamento()

In [13]:
pipeline = Pipeline()

In [14]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [15]:
avaliacaoRecall(True)

R@20: 0.5084745762711864


## 7- Stemming (RSLP-S)

In [8]:
class RSLP_S:
    def __plural_reduction(self, word):
        excep = ["lápis","cais","mais","crúcis","biquínis","pois","depois","dois","leis" ]
        excep_s = ["aliás","pires","lápis","cais","mais","mas","menos", "férias","fezes","pêsames","crúcis","gás", "atrás","moisés","através","convés","ês", "país","após","ambas","ambos","messias"]

        len_word = len(word)
        new_word = list(word)

        if len_word >= 3:
            if new_word[-1] == 's' and new_word[-2] == 'n':
                new_word[-2] = 'm'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'õ':
                new_word[-3] = 'ã'
                new_word[-2] = 'o'
                sing = "".join(new_word)
                sing = sing[:-1]
                return  sing

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'ã':
                if word == 'mães':
                    word = word[:-1]
                    return word
                else:
                    new_word[-2] = 'o'
                    sing = "".join(new_word)
                    sing = sing[:-1]
                    return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'a':
                if word != 'cais' and word != 'mais':
                    new_word[-2] = 'l'
                    sing = "".join(new_word)
                    sing = sing[:-1]
                    return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'é':
                new_word[-3] = 'e'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'e':
                new_word[-3] = 'e'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'i' and new_word[-3] == 'ó':
                new_word[-3] = 'o'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

            if new_word[-1] == 's' and new_word[-2] == 'i':
                if word not in excep:
                    new_word[-1] = 'l'
                    sing = "".join(new_word)
                    return sing

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'l':
                word = word[:-2]
                return word

            if new_word[-1] == 's' and new_word[-2] == 'e' and new_word[-3] == 'r':
                word = word[:-2]
                return word

            if new_word[-1] == 's':
                if word not in excep_s:
                    word = word[:-1]

        return word

    def stem(self, word):
        word = self.__plural_reduction(word)

        return word

In [17]:
def preprocess(txt):

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    terms = [stemmer.stem(word) for word in terms]
    terms = " ".join(terms)
    return terms

In [18]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def preprocess(self,txt):

        stemmer = RSLP_S()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt)
        terms = [stemmer.stem(word) for word in terms]
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [19]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["text_rslps"])

In [20]:
retriever = BM25Retriever(document_store=document_store)

In [21]:
pre = PreProcessamento()

In [22]:
pipeline = Pipeline()

In [23]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [24]:
avaliacaoRecall(True)

R@20: 0.5288135593220339


## 8- Stemming (Savoy)

In [8]:
class Savoy:

    def __removeAllPTAccent(self, old_word):
        word = list(old_word)
        len_word = len(word)-1
        for i in range(len_word, -1, -1):
            if word[i] == 'ä':
                word[i] = 'a'
            if word[i] == 'â':
                word[i] = 'a'
            if word[i] == 'à':
                word[i] = 'a'
            if word[i] == 'á':
                word[i] = 'a'
            if word[i] == 'ã':
                word[i] = 'a'
            if word[i] == 'ê':
                word[i] = 'e'
            if word[i] == 'é':
                word[i] = 'e'
            if word[i] == 'è':
                word[i] = 'e'
            if word[i] == 'ë':
                word[i] = 'e'
            if word[i] == 'ï':
                word[i] = 'i'
            if word[i] == 'î':
                word[i] = 'i'
            if word[i] == 'ì':
                word[i] = 'i'
            if word[i] == 'í':
                word[i] = 'i'
            if word[i] == 'ü':
                word[i] = 'u'
            if word[i] == 'ú':
                word[i] = 'u'
            if word[i] == 'ù':
                word[i] = 'u'
            if word[i] == 'û':
                word[i] = 'u'
            if word[i] == 'ô':
                word[i] = 'o'
            if word[i] == 'ö':
                word[i] = 'o'
            if word[i] == 'ó':
                word[i] = 'o'
            if word[i] == 'ò':
                word[i] = 'o'
            if word[i] == 'õ':
                word[i] = 'o'
            if word[i] == 'ç':
                word[i] = 'c'

        new_word = "".join(word)
        return new_word

    def __finalVowelPortuguese(self, word):
        len_word = len(word)
        if len_word > 3:
            if word[-1] == 'e' or word[-1] == 'a' or word[-1] == 'o':
                word = word[:-1]

        return word

    def __remove_PTsuffix(self, word):
        len_word = len(word)

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'e' and (word[-3] == 'r' or word[-3] == 's' or word[-3] == 'z' or word[-3] == 'l'):
                word = word[:-2]
                return word
        if len_word > 2:
            if word[-1] == 's' and word[-2] == 'n':
                new_word = list(word)
                new_word[-2] = 'm'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if (word[-1] == 's' and word[-2] == 'i') and (word[-3] == 'e' or word[-3] == 'é'):
                new_word = list(word)
                new_word[-3] = 'e'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'i' and word[-3] == 'a':
                new_word = list(word)
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'i' and word[-3] == 'ó':
                new_word = list(word)
                new_word[-3] = 'o'
                new_word[-2] = 'l'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 3:
            if word[-1] == 's' and word[-2] == 'i':
                new_word = list(word)
                new_word[-1] = 'l'
                sing = "".join(new_word)
                return sing

        if len_word > 2:
            if word[-1] == 's' and word[-2] == 'e' and word[-3] == 'õ':
                new_word = list(word)
                new_word[-3] = 'ã'
                new_word[-2] = 'o'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing
            if word[-1] == 's' and word[-2] == 'e' and word[-3] == 'ã':
                new_word = list(word)
                new_word[-2] = 'o'
                sing = "".join(new_word)
                sing = sing[:-1]
                return sing

        if len_word > 5:
            if word[-1] == 'e' and word[-2] == 't' and word[-3] == 'n' and word[-4] == 'e' and word[-5] == 'm':
                word = word[:-5]
                return word

        if len_word > 2:
            if word[-1] == 's':
                word = word[:-1]

        return word

    def __normFemininPortuguese(self, word):

        len_word = len(word)

        if len_word < 3 or word[-1] != 'a':
            return word

        if len_word > 6:

            if word[-2] == 'h' and word[-3] == 'n' and word[-4] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'c' and word[-3] == 'a' and word[-4] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'r' and word[-3] == 'i' and word[-4] == 'e':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

        if len_word > 5:
            if word[-2] == 'n' and word[-3] == 'o':
                new_word = list(word)
                new_word[-3] = 'ã'
                new_word[-2] = 'o'
                masc = "".join(new_word)
                masc = masc[:-1]
                return masc

            if word[-2] == 'r' and word[-3] == 'o':
                word = word[:-1]
                return word

            if word[-2] == 's' and word[-3] == 'o':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 's' and word[-3] == 'e':
                new_word = list(word)
                new_word[-3] = 'ê'
                masc = "".join(new_word)
                masc = masc[:-1]
                return masc

            if word[-2] == 'c' and word[-3] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'd' and word[-3] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'd' and word[-3] == 'a':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'v' and word[-3] == 'i':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'm' and word[-3] == 'a':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

            if word[-2] == 'n':
                new_word = list(word)
                new_word[-1] = 'o'
                masc = "".join(new_word)
                return masc

        return word

    def stem(self, word):
        len_word = len(word)
        if len_word > 2:
            word = self.__remove_PTsuffix(word)
            word = self.__normFemininPortuguese(word)
            word = self.__finalVowelPortuguese(word)
            word = self.__removeAllPTAccent(word)

        return word


In [26]:
def preprocess(txt):

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    terms = [stemmer.stem(word) for word in terms]
    terms = " ".join(terms)
    return terms

In [27]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def preprocess(self,txt):

        stemmer = Savoy()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt)
        terms = [stemmer.stem(word) for word in terms]
        terms = " ".join(terms)
        return terms
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [28]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["text_savoy"])

In [29]:
retriever = BM25Retriever(document_store=document_store)

In [30]:
pre = PreProcessamento()

In [31]:
pipeline = Pipeline()

In [32]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [33]:
avaliacaoRecall(True)

R@20: 0.5084745762711864


## 9- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword + stemming (RSLP)

In [34]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
    return " ".join(terms)

In [35]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = nltk.corpus.stopwords.words("portuguese")
        stopwords.extend(list(punctuation))

        stemmer = RSLPStemmer()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
        return " ".join(terms)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [36]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_rslp"])

In [37]:
retriever = BM25Retriever(document_store=document_store)

In [38]:
pre = PreProcessamento()

In [39]:
pipeline = Pipeline()

In [40]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [41]:
avaliacaoRecall(True)

R@20: 0.5457627118644067


## 10- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword + stemming (RSLP-S)

In [42]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
    return " ".join(terms)

In [43]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = nltk.corpus.stopwords.words("portuguese")
        stopwords.extend(list(punctuation))

        stemmer = RSLP_S()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
        return " ".join(terms)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [44]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_rslps"])

In [45]:
retriever = BM25Retriever(document_store=document_store)

In [46]:
pre = PreProcessamento()

In [47]:
pipeline = Pipeline()

In [48]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [49]:
avaliacaoRecall(True)

R@20: 0.5898305084745763


## 11- Letra mínuscula + remoção de pontuação + remoção de acentuação e remoção de stopword + stemming (Savoy)

In [50]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = nltk.corpus.stopwords.words("portuguese")
    stopwords.extend(list(punctuation))

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
    return " ".join(terms)

In [51]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1

    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = nltk.corpus.stopwords.words("portuguese")
        stopwords.extend(list(punctuation))

        stemmer = Savoy()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
        
        return " ".join(terms)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [52]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_savoy"])

In [53]:
retriever = BM25Retriever(document_store=document_store)

In [54]:
pre = PreProcessamento()

In [55]:
pipeline = Pipeline()

In [56]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [57]:
avaliacaoRecall(True)

R@20: 0.5830508474576271


# Word n-gram

## 12- Bigram

In [8]:
def preprocess(txt):
    txt=str(txt)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    ngram = []
    
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [9]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def preprocess(self,txt):
        txt=str(txt)
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt)
        ngram = []
    
        ngram_2 = list(ngrams(terms, 2))
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [10]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["text_bigram"])

In [11]:
retriever = BM25Retriever(document_store=document_store)

In [12]:
pre = PreProcessamento()

In [13]:
pipeline = Pipeline()

In [14]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [15]:
avaliacaoRecall(True)

R@20: 0.5084745762711864


## 13- Trigram

In [16]:
def preprocess(txt):
    txt=str(txt)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    ngram = []
    
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [17]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def preprocess(self,txt):
        txt=str(txt)
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt)
        ngram = []
    
        ngram_3 = list(ngrams(terms, 3))
        
        for w in ngram_3:
            string = w[0] + " " + w[1] + " " + w[2]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [18]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["text_trigram"])

In [19]:
retriever = BM25Retriever(document_store=document_store)

In [20]:
pre = PreProcessamento()

In [21]:
pipeline = Pipeline()

In [22]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [23]:
avaliacaoRecall(True)

R@20: 0.4711864406779661


## 14- Unigram + Bigram

In [24]:
def preprocess(txt):
    txt=str(txt)
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt)
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [25]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def preprocess(self,txt):
        txt=str(txt)
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt)
    
        ngram = []
        ngram_1 = list(ngrams(terms, 1))
        ngram_2 = list(ngrams(terms, 2))
        for w in ngram_1:
            ngram.append(w[0])
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [26]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["text_uni_bi"])

In [27]:
retriever = BM25Retriever(document_store=document_store)

In [28]:
pre = PreProcessamento()

In [29]:
pipeline = Pipeline()

In [30]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [31]:
avaliacaoRecall(True)

R@20: 0.4847457627118644


# Word n-gram + pré processamento básico

## 15- Letra mínuscula + remoção de pontuação, acentuação e stopword + bigram

In [8]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [9]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [word for word in terms if word not in stopwords]
    
        ngram = []
        ngram_2 = list(ngrams(terms, 2))
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [10]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_bigram"])

In [11]:
retriever = BM25Retriever(document_store=document_store)

In [12]:
pre = PreProcessamento()

In [13]:
pipeline = Pipeline()

In [14]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [15]:
avaliacaoRecall(True)

R@20: 0.5627118644067797


## 16- Letra mínuscula + remoção de pontuação, acentuação e stopword + trigram

In [16]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [17]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [word for word in terms if word not in stopwords]
    
        ngram = []
        ngram_3 = list(ngrams(terms, 3))
        
        for w in ngram_3:
            string = w[0] + " " + w[1] + " " + w[2]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [18]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_trigram"])

In [19]:
retriever = BM25Retriever(document_store=document_store)

In [20]:
pre = PreProcessamento()

In [21]:
pipeline = Pipeline()

In [22]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [26]:
avaliacaoRecall(True)

R@20: 0.5322033898305085


## 17- Letra mínuscula + remoção de pontuação, acentuação e stopword + unigram + bigram

In [27]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [word for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [28]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [word for word in terms if word not in stopwords]
    
        ngram = []
        ngram_1 = list(ngrams(terms, 1))
        ngram_2 = list(ngrams(terms, 2))
        for w in ngram_1:
            ngram.append(w[0])
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [29]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_uni_bi"])

In [30]:
retriever = BM25Retriever(document_store=document_store)

In [31]:
pre = PreProcessamento()

In [32]:
pipeline = Pipeline()

In [33]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [34]:
avaliacaoRecall(True)

R@20: 0.5389830508474577


# Word n-gram + pré processamento básico + RSLP

## 18- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP) + bigram

In [8]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [9]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = RSLPStemmer()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_2 = list(ngrams(terms, 2))
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [10]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_bigram_rslp"])

In [11]:
retriever = BM25Retriever(document_store=document_store)

In [12]:
pre = PreProcessamento()

In [13]:
pipeline = Pipeline()

In [14]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [15]:
avaliacaoRecall(True)

R@20: 0.4745762711864407


## 19- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP) + trigram

In [16]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [17]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = RSLPStemmer()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_3 = list(ngrams(terms, 3))
        
        for w in ngram_3:
            string = w[0] + " " + w[1] + " " + w[2]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [18]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_trigram_rslp"])

In [19]:
retriever = BM25Retriever(document_store=document_store)

In [20]:
pre = PreProcessamento()

In [21]:
pipeline = Pipeline()

In [22]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [23]:
avaliacaoRecall(True)

R@20: 0.4542372881355932


## 20- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP) + unigram + bigram

In [24]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLPStemmer()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [25]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = RSLPStemmer()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_1 = list(ngrams(terms, 1))
        ngram_2 = list(ngrams(terms, 2))
        for w in ngram_1:
            ngram.append(w[0])
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [26]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_uni_bi_rslp"])

In [27]:
retriever = BM25Retriever(document_store=document_store)

In [28]:
pre = PreProcessamento()

In [29]:
pipeline = Pipeline()

In [30]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [31]:
avaliacaoRecall(True)

R@20: 0.46440677966101696


# Word n-gram + pré processamento básico + RSLP-S

## 21- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP-S) + bigram

In [9]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [10]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = RSLP_S()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_2 = list(ngrams(terms, 2))
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
    
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [11]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_bigram_rslps"])

In [12]:
retriever = BM25Retriever(document_store=document_store)

In [13]:
pre = PreProcessamento()

In [14]:
pipeline = Pipeline()

In [15]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [16]:
avaliacaoRecall(True)

R@20: 0.5559322033898305


## 22- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP-S) + trigram

In [17]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [18]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = RSLP_S()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_3 = list(ngrams(terms, 3))
        
        for w in ngram_3:
            string = w[0] + " " + w[1] + " " + w[2]
            ngram.append(string)
    
        return " ".join(ngram)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [19]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_trigram_rslps"])

In [20]:
retriever = BM25Retriever(document_store=document_store)

In [21]:
pre = PreProcessamento()

In [22]:
pipeline = Pipeline()

In [23]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [24]:
avaliacaoRecall(True)

R@20: 0.5322033898305085


## 23- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (RSLP-S) + unigram + bigram

In [25]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = RSLP_S()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [26]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = RSLP_S()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_1 = list(ngrams(terms, 1))
        ngram_2 = list(ngrams(terms, 2))
        for w in ngram_1:
            ngram.append(w[0])
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [27]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_uni_bi_rslps"])

In [28]:
retriever = BM25Retriever(document_store=document_store)

In [29]:
pre = PreProcessamento()

In [30]:
pipeline = Pipeline()

In [31]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [32]:
avaliacaoRecall(True)

R@20: 0.5423728813559322


# Word n-gram + pré processamento básico + Savoy

## 24- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (Savoy) + bigram

In [9]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_2 = list(ngrams(terms, 2))
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)

In [10]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = Savoy()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_2 = list(ngrams(terms, 2))
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [11]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_bigram_savoy"])

In [12]:
retriever = BM25Retriever(document_store=document_store)

In [13]:
pre = PreProcessamento()

In [14]:
pipeline = Pipeline()

In [15]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [16]:
avaliacaoRecall(True)

R@20: 0.5322033898305085


## 25- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (Savoy) + trigram

In [17]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_3 = list(ngrams(terms, 3))
        
    for w in ngram_3:
        string = w[0] + " " + w[1] + " " + w[2]
        ngram.append(string)
    
    return " ".join(ngram)

In [18]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = Savoy()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_3 = list(ngrams(terms, 3))
        
        for w in ngram_3:
            string = w[0] + " " + w[1] + " " + w[2]
            ngram.append(string)
    
        return " ".join(ngram)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [19]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_trigram_savoy"])

In [20]:
retriever = BM25Retriever(document_store=document_store)

In [21]:
pre = PreProcessamento()

In [22]:
pipeline = Pipeline()

In [23]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [24]:
avaliacaoRecall(True)

R@20: 0.5152542372881356


## 26- Letra mínuscula + remoção de pontuação, acentuação e stopword + stemming (Savoy) + unigram + bigram

In [25]:
def _remove_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

def preprocess(txt):
    txt = _remove_acentos(txt)
    stopwords = list(punctuation)

    stemmer = Savoy()
    tokenizer = RegexpTokenizer('\w+')
    terms = tokenizer.tokenize(txt.lower())
    terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
    ngram = []
    ngram_1 = list(ngrams(terms, 1))
    ngram_2 = list(ngrams(terms, 2))
    for w in ngram_1:
        ngram.append(w[0])
        
    for w in ngram_2:
        string = w[0] + " " + w[1]
        ngram.append(string)
    
    return " ".join(ngram)


In [26]:
class PreProcessamento(BaseComponent):
    outgoing_edges = 1
    
    def _remove_acentos(self,txt):
        return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

    def preprocess(self,txt):
        txt = self._remove_acentos(txt)
        stopwords = list(punctuation)

        stemmer = Savoy()
        tokenizer = RegexpTokenizer('\w+')
        terms = tokenizer.tokenize(txt.lower())
        terms = [stemmer.stem(word) for word in terms if word not in stopwords]
    
        ngram = []
        ngram_1 = list(ngrams(terms, 1))
        ngram_2 = list(ngrams(terms, 2))
        for w in ngram_1:
            ngram.append(w[0])
        
        for w in ngram_2:
            string = w[0] + " " + w[1]
            ngram.append(string)
    
        return " ".join(ngram)
      
    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        query = self.preprocess(query)
        output = {"query": query}
        return output, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        pass


In [27]:
 document_store = ElasticsearchDocumentStore(host='localhost', port=9200, username='elastic',
                                                password='_3egIk1UEsLOV4266NWo', index=index_nome,
                                                search_fields=["pre_text_uni_bi_savoy"])

In [28]:
retriever = BM25Retriever(document_store=document_store)

In [29]:
pre = PreProcessamento()

In [30]:
pipeline = Pipeline()

In [31]:
pipeline.add_node(component=pre, name="Pre",inputs=['Query'])
pipeline.add_node(component=retriever, name="Retriever",inputs=['Pre'])

Recall

In [32]:
avaliacaoRecall(True)

R@20: 0.5423728813559322
