## Packages

In [2]:
import pandas as pd
import re
import string
import nltk

## Functions

### Main funcs

In [29]:
def normalizar_texto_coluna_df(df: pd.DataFrame, column_name: str, language: str):
    """Run the full text-normalisation pipeline over one column of a DataFrame.

    The stages are applied in this order: text transformation, tokenisation,
    stop-word removal, stemming, and re-joining of the tokens into a single
    space-separated string.

    :param df: input frame; it is copied first, so the caller's object is not modified
    :param column_name: name of the text column to normalise
    :param language: language code forwarded to the transformation and
        stop-word stages (the code in this file recognises 'pt' and 'en')
    :return: a new DataFrame whose ``column_name`` holds the normalised text

    NOTE(review): ``language`` is not forwarded to the stemming stage, so the
    stemmer's language is decided by ``stemizar_tokens_coluna_df`` alone.
    """
    return (
        df.copy()
        .pipe(transformar_texto_coluna_df, column_name, language)
        .pipe(tokenizar_texto_coluna_df, column_name)
        .pipe(remover_stopwords_coluna_df, column_name, language)
        .pipe(stemizar_tokens_coluna_df, column_name)
        .pipe(get_resultado_final, column_name)
        .copy()
    )


def transformar_texto_coluna_df(df: pd.DataFrame, column_name: str, language: str):
    """Clean the raw text of one column: accents, case, symbols, punctuation, numbers.

    Each string in ``column_name`` goes through these steps, in order:

    1. accents / non-ASCII characters are stripped (``remove_acentos``) and the
       text is lower-cased;
    2. language-specific symbols are spelled out ('+', '-', '%'; for 'pt' the
       '-feira' suffix is dropped first so weekday names lose their hyphen);
    3. currency markers ('r$', 'u$', 'us$') are removed and 's&p 500' becomes 'spx';
    4. ASCII punctuation is deleted, emojis are removed, line breaks become
       spaces and double quotes (straight and curly) are removed;
    5. whitespace-separated tokens made only of digits become '<NUM>'
       (``remove_numeros``) and surrounding whitespace is trimmed.

    :param df: input frame (not modified; a copy is returned)
    :param column_name: name of the column holding the strings to clean
    :param language: 'pt' or 'en' selects the symbol replacements of step 2;
        any other value skips that step
    :return: a copy of ``df`` with ``column_name`` cleaned
    """
    # Spelled-out replacements per language, applied in the listed order.
    # Every replacement word carries a trailing space so the following token
    # stays separate: "-5" becomes "less 5" (not "less5"), which lets the
    # number step recognise the "5".
    substituicoes_por_idioma = {
        'pt': (('-feira', ''), ('+', 'mais '), ('-', 'menos '), ('%', ' por cento')),
        'en': (('-', 'less '), ('+', 'plus '), ('%', ' percent')),
    }
    # Language-independent replacements run after the language-specific ones.
    substituicoes = substituicoes_por_idioma.get(language, ()) + (
        ('r$', ''),
        ('u$', ''),
        ('us$', ''),
        ('s&p 500', 'spx'),
    )
    # Built once instead of once per row.
    tabela_pontuacao = str.maketrans('', '', string.punctuation)

    def _transformar(texto):
        # Strip accents first so the lower-cased text is plain ASCII-friendly.
        texto = remove_acentos(texto).lower()
        for antigo, novo in substituicoes:
            texto = texto.replace(antigo, novo)
        # Remove punctuation.
        texto = texto.translate(tabela_pontuacao)
        # Remove emojis.
        texto = remove_emojis(texto)
        # Unnecessary line breaks.
        texto = texto.replace('\n', ' ')
        # Remove double quotes (straight and curly).
        for aspas in ('\"', '“', '”'):
            texto = texto.replace(aspas, '')
        # Replace numeric tokens, then trim unnecessary spaces.
        return remove_numeros(texto).strip()

    df_transformado = df.copy()
    # Single pass over the column instead of one pass per cleaning step.
    df_transformado[column_name] = df_transformado[column_name].map(_transformar)
    return df_transformado

def tokenizar_texto_coluna_df(df: pd.DataFrame, column_name: str):
    """Split every string of ``column_name`` into a list of word tokens.

    Tokens are the matches of the regular expression ``\\w+``.

    :param df: input frame (not modified; a copy is returned)
    :param column_name: name of the column holding the strings to tokenise
    :return: a copy of ``df`` whose ``column_name`` holds lists of tokens
    """
    from nltk.tokenize import RegexpTokenizer

    separador_palavras = RegexpTokenizer(r'\w+')
    resultado = df.copy()
    resultado[column_name] = resultado[column_name].map(separador_palavras.tokenize)
    return resultado

def remover_stopwords_coluna_df(df: pd.DataFrame, column_name: str, language: str):
    """Drop stop words from the token lists stored in ``column_name``.

    :param df: input frame (not modified; a copy is returned); each value of
        ``column_name`` is expected to be an iterable of tokens
    :param column_name: name of the column holding the token lists
    :param language: language code passed to ``get_stopwords``
    :return: a copy of ``df`` whose token lists no longer contain stop words
    """
    # A set gives constant-time membership checks for every token.
    stop_words = set(get_stopwords(language))
    # The normalisation pipeline strips accents before this stage, so accented
    # stop words (e.g. "não") would never match their accent-free tokens
    # ("nao"). Add the accent-stripped variant of every stop word; the
    # original forms are kept so accented input is still filtered.
    stop_words |= {remove_acentos(palavra) for palavra in stop_words}

    df_without_stopwords = df.copy()
    df_without_stopwords[column_name] = df_without_stopwords[column_name].map(
        lambda tokens: remove_stopwords(tokens, stop_words))
    return df_without_stopwords

def stemizar_tokens_coluna_df(df: pd.DataFrame, column_name: str, language: str = 'pt'):
    """Stem every token of the token lists stored in ``column_name``.

    :param df: input frame (not modified; a copy is returned); each value of
        ``column_name`` is expected to be an iterable of string tokens
    :param column_name: name of the column holding the token lists
    :param language: 'pt' (default, the previous hard-coded behaviour) or 'en';
        any other value is handed to ``SnowballStemmer`` unchanged, so a full
        Snowball language name also works
    :return: a copy of ``df`` whose token lists contain the stemmed tokens
    :raises ValueError: raised by ``SnowballStemmer`` when the language is not supported
    """
    from nltk.stem import SnowballStemmer

    # Translate the short codes used elsewhere in this file to Snowball names.
    nomes_snowball = {'pt': 'portuguese', 'en': 'english'}
    stemmer = SnowballStemmer(nomes_snowball.get(language, language))

    df_stemized = df.copy()
    df_stemized[column_name] = df_stemized[column_name].map(
        lambda tokens: [stemmer.stem(token) for token in tokens])
    return df_stemized

def get_resultado_final(df: pd.DataFrame, column_name: str):
    """Join the token lists of ``column_name`` back into space-separated strings.

    :param df: input frame (not modified; a copy is returned)
    :param column_name: name of the column holding iterables of string tokens
    :return: a copy of ``df`` whose ``column_name`` holds one string per row
    """
    resultado = df.copy()
    resultado[column_name] = resultado[column_name].map(' '.join)
    return resultado
    

### Aux funcs

In [22]:
def remove_emojis(sentence):
    """Return ``sentence`` with every character in the emoji ranges below removed.

    The character class lists emoji and pictograph blocks; note that it also
    contains the broad range ``\\U00010000-\\U0010ffff``, i.e. every code point
    above the Basic Multilingual Plane.
    """
    # Emoji patterns
    padrao_emojis = ("["
                     u"\U0001F600-\U0001F64F"  # emoticons
                     u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                     u"\U0001F680-\U0001F6FF"  # transport & map symbols
                     u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                     u"\U00002702-\U000027B0"
                     u"\U000024C2-\U0001F251"
                     u"\U0001f926-\U0001f937"
                     u'\U00010000-\U0010ffff'
                     u"\u200d"
                     u"\u2640-\u2642"
                     u"\u2600-\u2B55"
                     u"\u23cf"
                     u"\u23e9"
                     u"\u231a"
                     u"\u3030"
                     u"\ufe0f"
                     "]+")

    return re.sub(padrao_emojis, r'', sentence, flags=re.UNICODE)


def remove_numeros(sentence):
    """Replace every all-digit token of ``sentence`` with the marker '<NUM>'.

    The sentence is split on whitespace and re-assembled with a single space
    in front of every token, so a non-empty result starts with a space and
    runs of whitespace collapse to one space.

    :param sentence: text to process
    :return: the re-assembled text ('' when ``sentence`` has no tokens)
    """
    return ''.join(
        ' {}'.format('<NUM>' if pedaco.isdigit() else pedaco)
        for pedaco in sentence.split()
    )


def get_stopwords(language):
    """Return the NLTK stop-word list for ``language``.

    The 'stopwords' corpus is downloaded only when it cannot be found locally,
    so repeated calls do not hit the network or print download logs.

    :param language: 'pt' for Portuguese or 'en' for English
    :return: list of stop words; an empty list for any other language code
    """
    # Only download when the corpus is not already installed.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')

    if language == 'pt':
        return nltk.corpus.stopwords.words('portuguese')
    if language == 'en':
        return nltk.corpus.stopwords.words('english')

    # Unknown language: nothing to filter.
    return []


def remove_stopwords(tokens, stop_words):
    """Return the tokens that are not stop words, keeping their original order.

    :param tokens: iterable of tokens to filter
    :param stop_words: container supporting ``in`` with the words to drop
    :return: new list with the remaining tokens
    """
    return [palavra for palavra in tokens if palavra not in stop_words]


def remove_acentos(text):
    """
    Strip accents from ``text``.

    The text is decomposed with Unicode NFD and then encoded to ASCII with
    errors ignored, so combining marks, and any other non-ASCII character,
    are dropped. Letter case is left untouched.

    :param text: string to process
    :return: ``text`` reduced to its ASCII characters
    """
    import unicodedata

    decomposto = unicodedata.normalize('NFD', text)
    apenas_ascii = decomposto.encode('ascii', 'ignore')
    return apenas_ascii.decode("utf-8")







## Scratches

In [25]:
# Scratch data: read the CSV, using its first column as the frame's index
df_vale3_without_valestr = pd.read_csv('../assets/data/vale3_without_valestr.csv', index_col=0)
# Bare last expression so the notebook renders the frame
df_vale3_without_valestr

Unnamed: 0,topic,title,date,search_date,link,tags
0,Mercado,Ibovespa segue exterior e amplia alta; Marfrig...,19/07/2022 11:50,2022-08-04 00:12:08,https://www.suno.com.br/noticias/ibovespa-hoje...,"['Ibovespa', 'Embraer (EMBR3)', 'Fleury (FLRY3..."
19,Negócios,Radar: Mercado Livre (MELI34) é o BDR mais neg...,07/04/2022 21:23,2022-08-04 00:13:47,https://www.suno.com.br/noticias/radar-mercado...,"['Vale (VALE3)', 'Azul', 'AZUL4', 'Eletrobras ..."
33,Negócios,Radar: Arezzo (ARZZ3) pagará proventos de R$ 6...,10/01/2022 22:14,2022-08-04 00:14:32,https://www.suno.com.br/noticias/radar-arezzo-...,"['Vale (VALE3)', 'Arezzo (ARZZ3)', 'CSN Minera..."
38,Política,Prefeituras de MG pedem que moradores deixem a...,10/01/2022 15:44,2022-08-04 00:14:35,https://www.suno.com.br/noticias/mg-prefeitura...,"['Vale (VALE3)', 'Minas Gerais', 'Vale (VALE3)']"
42,Negócios,CSN (CSNA3) e CSN Mineração (CMIN3) interrompe...,10/01/2022 10:47,2022-08-04 00:14:38,https://www.suno.com.br/noticias/csn-mineracao...,"['Vale (VALE3)', 'CSN (CSNA3)', 'CSN Mineração..."
...,...,...,...,...,...,...
1183,Mercado,Cotação do minério de ferro sobe na China após...,11/02/2019 09:12,2022-05-03 16:48:11,https://www.suno.com.br/noticias/cotacao-miner...,"['Bolsa de Valores', 'China', 'minério de ferr..."
1190,Mercado,"Ibovespa fecha em queda de 0,41% após volta de...",06/03/2019 18:50,2022-05-03 16:48:17,https://www.suno.com.br/noticias/ibovespa-qued...,"['Banco Central (BC)', 'Boletim Focus', 'Bruma..."
1196,Política,Israel enviará equipamentos de resgates em Bru...,26/01/2019 21:33,2022-05-03 16:48:51,https://www.suno.com.br/noticias/israel-enviar...,"['Benjamin Netanyahu', 'Brumadinho', 'Israel',..."
1197,Mercado,"Ibovespa abre quarta-feira com alta de +0,29%;...",24/10/2018 10:55,2022-05-03 16:48:56,https://www.suno.com.br/noticias/ibovespa-abre...,"['Ações', 'Bolsa de Valores', 'China', 'dólar'..."


In [30]:
# Smoke-run of the full pipeline on the scratch frame's 'title' column
df_normalizado = normalizar_texto_coluna_df(
    df=df_vale3_without_valestr,
    column_name='title',
    language='pt',
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Dependencies

In [None]:
# Load the raw train / validation / test splits
train_raw, val_raw, test_raw = map(pd.read_csv, (
    '../assets/data/splits/train/raw.csv',
    '../assets/data/splits/val/raw.csv',
    '../assets/data/splits/test/raw.csv',
))
# Stack the three splits so they can be previewed together
df_raw = pd.concat([train_raw, val_raw, test_raw])
# Reproducible 10-row preview of the title and label columns
df_raw.sample(10, random_state=42)[['title', 'label']].reset_index(drop=True)

## Preprocessing splits

In [None]:
# Normalise the 'title' column of every split with the Portuguese settings
train_pp, val_pp, test_pp = (
    normalizar_texto_coluna_df(split_raw, 'title', 'pt')
    for split_raw in (train_raw, val_raw, test_raw)
)
# Stack the preprocessed splits so they can be previewed together
df_pp = pd.concat([train_pp, val_pp, test_pp])
# Same seed as the raw preview, so the same row positions are sampled
df_pp.sample(10, random_state=42)[['title', 'label']].reset_index(drop=True)


## Outputs for EDA, word2vec and preprocessing

In [None]:
# Persist each preprocessed split next to its raw counterpart; index=False
# keeps the DataFrame index out of the written files.
train_pp.to_csv('../assets/data/splits/train/preprocessed.csv', index=False)
val_pp.to_csv('../assets/data/splits/val/preprocessed.csv', index=False)
test_pp.to_csv('../assets/data/splits/test/preprocessed.csv', index=False)