# PREPROCESSING

- This function replaces accented letters with their unaccented equivalents and removes a fixed set of punctuation characters (`; , / \ { } ( ) - _`). It is the first step in normalizing the text.

In [None]:
def remove_accents(text):
    """Strip diacritics and a fixed set of punctuation characters from ``text``.

    Accented letters are reduced to their base letter (``"ç"`` -> ``"c"``,
    ``"â"`` -> ``"a"``), whether they arrive precomposed or as a base letter
    followed by a combining mark. The characters ``; , / \\ { } ( ) - _`` are
    deleted. Case is preserved.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        str: The cleaned text.
    """
    # Imported locally so this cell works on a fresh kernel even though the
    # notebook's import cell comes later.
    import unicodedata

    # NFD splits every accented letter into base letter + combining mark(s).
    decomposed = unicodedata.normalize("NFD", str(text))
    # Drop only the Combining Diacritical Marks block (U+0300-U+036F), which
    # holds the acute, grave, tilde, circumflex, diaeresis and cedilla marks.
    without_marks = "".join(
        ch for ch in decomposed if not ("\u0300" <= ch <= "\u036f")
    )
    # Recompose so that characters untouched by the stripping keep their
    # composed form.
    recomposed = unicodedata.normalize("NFC", without_marks)
    # Delete the same punctuation characters the original mapping removed.
    return recomposed.translate(str.maketrans("", "", ";,/\\{}()-_"))

In [None]:
# Sample Portuguese sentence used to sanity-check remove_accents; a later cell
# reuses it. Expected output: Acai e uma delicia
test_str = "Açaí é uma delícia"
print(remove_accents(test_str))

### STEPS IN THE TRANSFORMATION OF A TEXT

1. Transform text to lowercase
2. Remove all accents and other special characters
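3. Tokenize the text, transforming it into a list of tokens
4. Remove all tokens that are not of interest (non-alphabetic tokens, Portuguese stopwords, and tokens of three characters or fewer)
5. If `to_stem=True`, stem the remaining tokens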
6. Return a list of tokens

In [3]:
from nltk.tokenize import RegexpTokenizer
import nltk

# Shared NLTK RSLP stemmer instance, created once at module level;
# normalize_texts uses it when called with to_stem=True.
stemmer = nltk.stem.RSLPStemmer()

def normalize_texts(texts, to_stem=False, min_len=4):
    """Turn each text into a list of normalized tokens.

    For every text: lowercase it, strip accents and punctuation with
    ``remove_accents``, tokenize on ``\\w+``, keep only alphabetic tokens that
    are not Portuguese stopwords and have at least ``min_len`` characters,
    and optionally stem what is left.

    Args:
        texts: Iterable of strings (each item must support ``.lower()``).
        to_stem: When True, each kept token is passed through the
            module-level ``stemmer``.
        min_len: Minimum token length to keep. The default of 4 matches the
            previous hard-coded ``len(tkn) > 3`` filter.

    Returns:
        list[list[str]]: One token list per input text, in input order.
    """
    tk = RegexpTokenizer(r"\w+")
    # Tokens are compared after accent stripping, so the stopword list must be
    # normalized the same way. Otherwise accented stopwords such as "também"
    # or "você" never match their stripped forms and slip through the filter.
    stopwords = {
        remove_accents(word)
        for word in nltk.corpus.stopwords.words("portuguese")
    }

    normal_texts = []
    for t in texts:
        raw_text = remove_accents(t.lower())  # steps 1 and 2
        tokens = tk.tokenize(raw_text)  # step 3
        kept = [
            tkn
            for tkn in tokens
            if tkn.isalpha() and len(tkn) >= min_len and tkn not in stopwords
        ]  # step 4
        if to_stem:
            # NOTE(review): stemming runs on accent-stripped tokens; confirm
            # the stemmer's suffix rules behave acceptably without diacritics.
            kept = [stemmer.stem(tkn) for tkn in kept]  # step 5
        normal_texts.append(kept)
    return normal_texts

In [4]:
# Smoke test on the sample sentence with the default to_stem=False (tokens are not stemmed).
print(normalize_texts([test_str]))

[['acai', 'delicia']]
