## Packages

In [1]:
import pandas as pd
import re
import string
import nltk
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

## Functions

In [2]:
def transformDocument(df, column_name, language):
    """Return a copy of ``df`` whose ``column_name`` text has been normalised.

    Each value of the column goes through the following steps, in order:

    1. ``'pt'`` / ``'en'`` only: stop words are removed, then symbols are
       spelled out. A ``-`` that directly precedes a digit (and is not glued
       to a preceding word character) becomes the language's minus word; any
       other ``-`` becomes a space; ``+`` and ``%`` become words. For ``'pt'``
       the weekday suffix ``-feira`` is dropped first.
    2. The currency markers ``R$``, ``U$`` and ``US$`` are removed and
       ``S&P 500`` is rewritten as ``spx``.
    3. ``normalizarString`` is applied, ``string.punctuation`` characters are
       deleted, ``removeEmojis`` is applied, newlines become spaces and
       double quotes are removed.
    4. ``removeValores`` is applied and surrounding whitespace is stripped.

    Any other ``language`` value skips step 1 entirely.

    :param df: DataFrame holding the text column; it is copied, not modified.
    :param column_name: name of the column to clean; its values must be ``str``.
    :param language: ``'pt'``, ``'en'``, or any other value to skip the
        language-specific step.
    :return: a new DataFrame with the cleaned column.
    """
    stop_words = usingStopwords(language)

    if language == 'pt':
        symbol_words = {'minus': 'menos ', 'plus': 'mais ', 'percent': ' por cento'}
    elif language == 'en':
        symbol_words = {'minus': 'less ', 'plus': 'plus ', 'percent': ' percent'}
    else:
        symbol_words = None

    # A hyphen only means "minus" when it introduces a number (e.g. "-5%").
    signed_number = re.compile(r'(?<!\w)-(?=\d)')
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean(text):
        if symbol_words is not None:
            # Stop words are filtered BEFORE symbols are spelled out, so the
            # words inserted below (e.g. Portuguese 'mais', which the stop-word
            # filter removes) are not immediately deleted again.
            text = removeStopwords(text, stop_words)
            if language == 'pt':
                text = text.replace('-feira', '')
            text = signed_number.sub(symbol_words['minus'], text)
            # Remaining hyphens join compound words; a space keeps the word
            # boundary instead of fusing a "minus" word onto the left token.
            text = text.replace('-', ' ')
            text = text.replace('+', symbol_words['plus'])
            text = text.replace('%', symbol_words['percent'])

        # Currency markers and index name (must run before lower-casing).
        text = text.replace('R$', '')
        text = text.replace('U$', '')
        text = text.replace('US$', '')
        text = text.replace('S&P 500', 'spx')

        # Strip accents / lower-case, then drop punctuation and emojis.
        text = normalizarString(text)
        text = text.translate(punctuation_table)
        text = removeEmojis(text)

        # Unnecessary line breaks and double quotes.
        text = text.replace('\n', ' ')
        for quote in ('\"', '“', '”'):
            text = text.replace(quote, '')

        # Replace numeric tokens and trim surrounding whitespace.
        return removeValores(text).strip()

    df_pp = df.copy()
    df_pp[column_name] = df_pp[column_name].map(_clean)
    return df_pp



def removeEmojis(sentence):
    """Delete emoji and other symbol code points from ``sentence``.

    Every run of characters that falls inside one of the Unicode ranges listed
    below is replaced by the empty string; all other characters are kept.

    NOTE(review): some ranges are much wider than emoji alone, e.g.
    ``U+24C2-U+1F251`` also contains CJK characters such as ``U+4E2D``, so
    those are removed as well.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u'\U00010000-\U0010ffff',
        u"\u200d",
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\u3030",
        u"\ufe0f",
    )
    # One character class matching one or more consecutive listed code points.
    character_class = "[" + "".join(emoji_ranges) + "]+"

    return re.sub(character_class, r'', sentence, flags=re.UNICODE)

def removeValores(sentence):
    """Replace every purely numeric token of ``sentence`` with ``<NUM>``.

    The sentence is split on whitespace; a token for which ``str.isdigit()``
    is true becomes ``<NUM>`` and every other token is kept as is. Tokens are
    re-joined with single spaces, so the result has no leading or trailing
    blank and runs of whitespace are collapsed.

    :param sentence: text to process.
    :return: the sentence with numeric tokens masked; ``''`` for empty input.
    """
    return ' '.join(
        '<NUM>' if token.isdigit() else token
        for token in sentence.split()
    )

def usingStopwords(language):
    """Return the NLTK stop-word list for ``language``.

    The ``stopwords`` corpus is downloaded only when it cannot be found
    locally, so repeated calls neither hit the network nor print the download
    log again. Unsupported languages return early without touching NLTK.

    :param language: ``'pt'`` for Portuguese or ``'en'`` for English.
    :return: list of stop words, or an empty list for any other language.
    """
    if language == 'pt':
        corpus_language = 'portuguese'
    elif language == 'en':
        corpus_language = 'english'
    else:
        return []

    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        # Corpus not installed yet: fetch it once.
        nltk.download('stopwords')

    return nltk.corpus.stopwords.words(corpus_language)

def removeStopwords(text, stop_words):
    """Drop the whitespace-separated words of ``text`` that are stop words.

    Matching is case-insensitive, so capitalised stop words (for example at
    the start of a headline) are removed too. Words that are kept retain
    their original casing and are re-joined with single spaces.

    :param text: text to filter.
    :param stop_words: iterable of stop words.
    :return: the filtered text; ``''`` when nothing remains.
    """
    # A set gives constant-time membership checks instead of a list scan.
    stop_set = {stop_word.lower() for stop_word in stop_words}
    return ' '.join(
        word for word in text.split() if word.lower() not in stop_set
    )

def normalizarString(text):
    """
    Strip accents from ``text`` and convert it to lower case.

    The text is NFD-normalised and then encoded to ASCII with unencodable
    characters ignored, so every non-ASCII character left after the
    normalisation is dropped.

    :param text: string to normalise
    :return: lower-cased, ASCII-only version of ``text``
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    lowered = ascii_bytes.decode("utf-8").lower()
    return str(lowered)

## Dependencies (raw train/val/test splits)

In [6]:
# Raw CSV of each split, listed in train / val / test order.
raw_paths = (
    '../assets/data/splits/train/raw.csv',
    '../assets/data/splits/val/raw.csv',
    '../assets/data/splits/test/raw.csv',
)
train_raw, val_raw, test_raw = (pd.read_csv(path) for path in raw_paths)

# Stack the three splits for a combined view; each split keeps its own index.
df_raw = pd.concat([train_raw, val_raw, test_raw])
df_raw

Unnamed: 0,title,tags,url,time,Open,High,Low,Close,Adj Close,Volume,profit,profit (%),label
0,Petrobras avalia funcionários de carreira para...,"['Empresas', 'General Silva e Luna', 'Jair Bol...",https://www.moneytimes.com.br/petrobras-avalia...,09:19:00,23.129999,23.240000,22.809999,22.950001,9.412467,59567300,-0.007782,-0.78,0
1,BR com Ferreira pode acelerar saída da Petrobr...,"['Ações', 'BR Distribuidora', 'BTG Pactual', '...",https://www.moneytimes.com.br/br-com-ferreira-...,12:26:00,27.250000,27.969999,26.820000,27.000000,10.712937,64514600,-0.009174,-0.92,0
2,Produtores de petróleo da América Latina suam ...,"['América Latina', 'Commodities', 'Economia', ...",https://www.moneytimes.com.br/produtores-de-pe...,15:41:00,12.570000,13.540000,12.280000,13.250000,5.256886,114259200,0.054097,5.41,2
3,Veja as duas ações mais indicadas da semana po...,"['Ações', 'Ativa Investimentos', 'BB Investime...",https://www.moneytimes.com.br/veja-as-duas-aco...,11:24:00,31.690001,32.250000,31.040001,31.950001,15.596828,79117100,0.008204,0.82,2
4,Veja as 11 principais notícias para operar na ...,"['Ações', 'Agronegócio', 'Ásia', 'Bancos', 'Br...",https://www.moneytimes.com.br/veja-as-11-princ...,09:35:00,20.090000,20.370001,19.969999,19.969999,7.923606,44584300,-0.005973,-0.60,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,Senado aprova projeto que cria conta de estabi...,"['Câmara dos Deputados', 'Combustíveis', 'Dies...",https://www.moneytimes.com.br/senado-aprova-pr...,16:26:00,32.599998,34.599998,32.520000,33.700001,16.451113,136437700,0.033742,3.37,2
440,Petrobras (PETR4): Gerente de RH suspeito de i...,['Petrobras (PETR4)'],https://www.suno.com.br/noticias/petrobras-pet...,08:59:00,24.129999,24.200001,23.680000,24.040001,9.538484,40454400,-0.003730,-0.37,0
441,"George Washington, Índias Orientais e Petrobra...","['Opinião', 'Petrobras', 'Terraço Econômico']",https://www.moneytimes.com.br/george-washingto...,11:30:00,16.100000,16.100000,16.100000,16.100000,5.926244,0,0.000000,0.00,1
442,ANP adia chamada pública para alocação de capa...,"['ANP', 'Bolívia', 'Brasil', 'Empresas', 'Gás'...",https://www.moneytimes.com.br/anp-adia-chamada...,14:56:00,13.150000,13.740000,12.830000,13.380000,5.308463,110947200,0.017490,1.75,2


## Preprocessing splits

In [7]:
# Clean the Portuguese 'title' column of every split (train, val, test order).
train_pp, val_pp, test_pp = (
    transformDocument(split_raw, 'title', 'pt')
    for split_raw in (train_raw, val_raw, test_raw)
)

# Stack the cleaned splits for a combined view.
df_pp = pd.concat([train_pp, val_pp, test_pp])
df_pp

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,tags,url,time,Open,High,Low,Close,Adj Close,Volume,profit,profit (%),label
0,petrobras avalia funcionarios carreira compor ...,"['Empresas', 'General Silva e Luna', 'Jair Bol...",https://www.moneytimes.com.br/petrobras-avalia...,09:19:00,23.129999,23.240000,22.809999,22.950001,9.412467,59567300,-0.007782,-0.78,0
1,br ferreira pode acelerar saida petrobras duvi...,"['Ações', 'BR Distribuidora', 'BTG Pactual', '...",https://www.moneytimes.com.br/br-com-ferreira-...,12:26:00,27.250000,27.969999,26.820000,27.000000,10.712937,64514600,-0.009174,-0.92,0
2,produtores petroleo america latina suam cobrir...,"['América Latina', 'Commodities', 'Economia', ...",https://www.moneytimes.com.br/produtores-de-pe...,15:41:00,12.570000,13.540000,12.280000,13.250000,5.256886,114259200,0.054097,5.41,2
3,veja duas acoes indicadas semana <NUM> analistas,"['Ações', 'Ativa Investimentos', 'BB Investime...",https://www.moneytimes.com.br/veja-as-duas-aco...,11:24:00,31.690001,32.250000,31.040001,31.950001,15.596828,79117100,0.008204,0.82,2
4,veja <NUM> principais noticias operar bolsa ne...,"['Ações', 'Agronegócio', 'Ásia', 'Bancos', 'Br...",https://www.moneytimes.com.br/veja-as-11-princ...,09:35:00,20.090000,20.370001,19.969999,19.969999,7.923606,44584300,-0.005973,-0.60,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,senado aprova projeto cria conta estabilizacao...,"['Câmara dos Deputados', 'Combustíveis', 'Dies...",https://www.moneytimes.com.br/senado-aprova-pr...,16:26:00,32.599998,34.599998,32.520000,33.700001,16.451113,136437700,0.033742,3.37,2
440,petrobras petr4 gerente rh suspeito insider re...,['Petrobras (PETR4)'],https://www.suno.com.br/noticias/petrobras-pet...,08:59:00,24.129999,24.200001,23.680000,24.040001,9.538484,40454400,-0.003730,-0.37,0
441,george washington indias orientais petrobras r...,"['Opinião', 'Petrobras', 'Terraço Econômico']",https://www.moneytimes.com.br/george-washingto...,11:30:00,16.100000,16.100000,16.100000,16.100000,5.926244,0,0.000000,0.00,1
442,anp adia chamada publica alocacao capacidade g...,"['ANP', 'Bolívia', 'Brasil', 'Empresas', 'Gás'...",https://www.moneytimes.com.br/anp-adia-chamada...,14:56:00,13.150000,13.740000,12.830000,13.380000,5.308463,110947200,0.017490,1.75,2


## Outputs for EDA, word2vec and preprocessing

In [9]:
# Write each cleaned split next to its raw file, without the DataFrame index.
for split_pp, output_path in (
    (train_pp, '../assets/data/splits/train/preprocessed.csv'),
    (val_pp, '../assets/data/splits/val/preprocessed.csv'),
    (test_pp, '../assets/data/splits/test/preprocessed.csv'),
):
    split_pp.to_csv(output_path, index=False)