# Categorização de notícias do mercado financeiro nas classes Petrobras, Vale ou Itaú (Parte 1): Pré-processamento

## Packages and Assets

In [23]:
import pandas as pd
import string
import nltk
import numpy as np

In [24]:
train = pd.read_csv('../assets/train/raw.csv')
val = pd.read_csv('../assets/val/raw.csv')
test = pd.read_csv('../assets/test/raw.csv')

train

Unnamed: 0,title,tags,link,label
0,"IFIX fecha aos 2.811,53 pontos com alta de 0,40%","['cotação do ifix', 'FII', 'FIIs', 'fundo de i...",https://www.sunoresearch.com.br/noticias/ifix-...,0
1,Nissan é acusada e Ghosn tem indiciamento por ...,"['Fraudes', 'internacional']",https://www.sunoresearch.com.br/noticias/ghosn...,0
2,Votação de projetos que miram alta dos combust...,"['Combustíveis', 'ICMS', 'Inflação', 'Petrobra...",https://www.moneytimes.com.br/votacao-de-proje...,1
3,"Vendas dos títulos do Tesouro Direto caem 27,6...","['Tesouro Direto', 'Tesouro Nacional']",https://www.sunoresearch.com.br/noticias/venda...,0
4,Agenda do Dia: Petrobras; Yduqs; Oi; AES Tietê...,"['AES Tietê (TIET3/TIET4/TIET11)', 'agenda do ...",https://www.suno.com.br/noticias/agenda-do-dia...,1
...,...,...,...,...
14404,MyCap tem três novas ações na carteira recomen...,"['Ações', 'B3', 'BR Malls', 'Carteira Recomend...",https://www.moneytimes.com.br/mycap-tem-tres-n...,1
14405,Futuros do minério de ferro registram perdas d...,"['China', 'Commodities', 'Minério de ferro', '...",https://www.moneytimes.com.br/futuros-do-miner...,2
14406,Ações para investir em dezembro: Modalmais esc...,"['ABC Brasil', 'Ações', 'Banco do Brasil', 'Ca...",https://www.moneytimes.com.br/acoes-para-inves...,2
14407,Indexados do Tesouro Direto apresentam queda n...,"['Tesouro Direto', 'Tesouro Nacional', 'títulos']",https://www.sunoresearch.com.br/noticias/index...,0


## Normalização de textos

### Transformação de texto

#### Funções auxiliares

In [25]:
def transformar_texto_coluna_df(df: pd.DataFrame, column_name: str):

    df_transformado = df.copy()
    
    # Remove acentos do texto - "CAFÉ" -> "CAFE"
    df_transformado[column_name] = df_transformado[column_name].map(lambda s:
                                                                    remove_acentos(s))
    # converte texto para lower case - "CAFE" -> "cafe"
    df_transformado[column_name] = df_transformado[column_name].map(lambda s: s.lower())

    # Remove pontuações
    df_transformado[column_name] = df_transformado[column_name].map(
        lambda s: s.translate(str.maketrans('', '', string.punctuation)))
    
    df_transformado[column_name] = df_transformado[column_name].map(lambda s: s.replace('r$', ''))

    # Remove números
    df_transformado[column_name] = df_transformado[column_name].map(lambda s: remove_numeros(s))

    # Remove espaços desnecessários
    df_transformado[column_name] = df_transformado[column_name].map(lambda s: s.strip())
    return df_transformado

In [26]:
def remove_numeros(sentence):
    new_sentece = ''

    for token in sentence.split():
        if token.isdigit():
            token = '<NUM>'
        new_sentece += ' {}'.format(token)

    return new_sentece

def remove_acentos(text):
    """
    Função para retirar acentuações e converter para minúscula
    :param text:
    :return text_normalizado
    """
    import unicodedata

    text = unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode("utf-8")
    return text

#### Etapa

In [27]:
train_transformado = transformar_texto_coluna_df(train, 'title')
val_transformado = transformar_texto_coluna_df(val, 'title')
test_transformado = transformar_texto_coluna_df(test, 'title')

train_transformado

Unnamed: 0,title,tags,link,label
0,ifix fecha aos <NUM> pontos com alta de <NUM>,"['cotação do ifix', 'FII', 'FIIs', 'fundo de i...",https://www.sunoresearch.com.br/noticias/ifix-...,0
1,nissan e acusada e ghosn tem indiciamento por ...,"['Fraudes', 'internacional']",https://www.sunoresearch.com.br/noticias/ghosn...,0
2,votacao de projetos que miram alta dos combust...,"['Combustíveis', 'ICMS', 'Inflação', 'Petrobra...",https://www.moneytimes.com.br/votacao-de-proje...,1
3,vendas dos titulos do tesouro direto caem <NUM...,"['Tesouro Direto', 'Tesouro Nacional']",https://www.sunoresearch.com.br/noticias/venda...,0
4,agenda do dia petrobras yduqs oi aes tiete car...,"['AES Tietê (TIET3/TIET4/TIET11)', 'agenda do ...",https://www.suno.com.br/noticias/agenda-do-dia...,1
...,...,...,...,...
14404,mycap tem tres novas acoes na carteira recomen...,"['Ações', 'B3', 'BR Malls', 'Carteira Recomend...",https://www.moneytimes.com.br/mycap-tem-tres-n...,1
14405,futuros do minerio de ferro registram perdas d...,"['China', 'Commodities', 'Minério de ferro', '...",https://www.moneytimes.com.br/futuros-do-miner...,2
14406,acoes para investir em dezembro modalmais esca...,"['ABC Brasil', 'Ações', 'Banco do Brasil', 'Ca...",https://www.moneytimes.com.br/acoes-para-inves...,2
14407,indexados do tesouro direto apresentam queda n...,"['Tesouro Direto', 'Tesouro Nacional', 'títulos']",https://www.sunoresearch.com.br/noticias/index...,0


### Tokenização

#### Funções auxiliares

In [28]:
def tokenizar_texto_coluna_df(df: pd.DataFrame, column_name: str):
    df_tokenized = df.copy()
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    df_tokenized[column_name] = df_tokenized[column_name].map(lambda s: tokenizer.tokenize(s))
    return df_tokenized

#### Etapa



In [29]:
train_tokenizado = tokenizar_texto_coluna_df(train_transformado, 'title')
val_tokenizado = tokenizar_texto_coluna_df(val_transformado, 'title')
test_tokenizado = tokenizar_texto_coluna_df(test_transformado, 'title')
train_tokenizado

Unnamed: 0,title,tags,link,label
0,"[ifix, fecha, aos, NUM, pontos, com, alta, de,...","['cotação do ifix', 'FII', 'FIIs', 'fundo de i...",https://www.sunoresearch.com.br/noticias/ifix-...,0
1,"[nissan, e, acusada, e, ghosn, tem, indiciamen...","['Fraudes', 'internacional']",https://www.sunoresearch.com.br/noticias/ghosn...,0
2,"[votacao, de, projetos, que, miram, alta, dos,...","['Combustíveis', 'ICMS', 'Inflação', 'Petrobra...",https://www.moneytimes.com.br/votacao-de-proje...,1
3,"[vendas, dos, titulos, do, tesouro, direto, ca...","['Tesouro Direto', 'Tesouro Nacional']",https://www.sunoresearch.com.br/noticias/venda...,0
4,"[agenda, do, dia, petrobras, yduqs, oi, aes, t...","['AES Tietê (TIET3/TIET4/TIET11)', 'agenda do ...",https://www.suno.com.br/noticias/agenda-do-dia...,1
...,...,...,...,...
14404,"[mycap, tem, tres, novas, acoes, na, carteira,...","['Ações', 'B3', 'BR Malls', 'Carteira Recomend...",https://www.moneytimes.com.br/mycap-tem-tres-n...,1
14405,"[futuros, do, minerio, de, ferro, registram, p...","['China', 'Commodities', 'Minério de ferro', '...",https://www.moneytimes.com.br/futuros-do-miner...,2
14406,"[acoes, para, investir, em, dezembro, modalmai...","['ABC Brasil', 'Ações', 'Banco do Brasil', 'Ca...",https://www.moneytimes.com.br/acoes-para-inves...,2
14407,"[indexados, do, tesouro, direto, apresentam, q...","['Tesouro Direto', 'Tesouro Nacional', 'títulos']",https://www.sunoresearch.com.br/noticias/index...,0


### Remoção de stopwords

#### Funções auxiliares

In [30]:
def remover_stopwords_coluna_df(df: pd.DataFrame, column_name: str, language: str):
    df_without_stopwords = df.copy()
    stop_words = get_stopwords(language)
    df_without_stopwords[column_name] = df_without_stopwords[column_name].map(
        lambda tokens: remove_stopwords(tokens, stop_words))
    return df_without_stopwords

In [31]:
nltk.download('stopwords')
def get_stopwords(language):
    stop_words = []

    if language == 'pt':
        stop_words = nltk.corpus.stopwords.words('portuguese')
    elif language == 'en':
        stop_words = nltk.corpus.stopwords.words('english')

    return stop_words

def remove_stopwords(tokens, stop_words):
    tokens_without_sw = []
    for word in tokens:
        if word not in stop_words:
            tokens_without_sw.append(word)

    return tokens_without_sw

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Guilherme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Etapa

In [32]:
train_sem_stopwords = remover_stopwords_coluna_df(train_tokenizado, 'title', 'pt')
val_sem_stopwords = remover_stopwords_coluna_df(val_tokenizado, 'title', 'pt')
test_sem_stopwords = remover_stopwords_coluna_df(test_tokenizado, 'title', 'pt')
train_sem_stopwords

Unnamed: 0,title,tags,link,label
0,"[ifix, fecha, NUM, pontos, alta, NUM]","['cotação do ifix', 'FII', 'FIIs', 'fundo de i...",https://www.sunoresearch.com.br/noticias/ifix-...,0
1,"[nissan, acusada, ghosn, indiciamento, fraude,...","['Fraudes', 'internacional']",https://www.sunoresearch.com.br/noticias/ghosn...,0
2,"[votacao, projetos, miram, alta, combustiveis,...","['Combustíveis', 'ICMS', 'Inflação', 'Petrobra...",https://www.moneytimes.com.br/votacao-de-proje...,1
3,"[vendas, titulos, tesouro, direto, caem, NUM, ...","['Tesouro Direto', 'Tesouro Nacional']",https://www.sunoresearch.com.br/noticias/venda...,0
4,"[agenda, dia, petrobras, yduqs, oi, aes, tiete...","['AES Tietê (TIET3/TIET4/TIET11)', 'agenda do ...",https://www.suno.com.br/noticias/agenda-do-dia...,1
...,...,...,...,...
14404,"[mycap, tres, novas, acoes, carteira, recomend...","['Ações', 'B3', 'BR Malls', 'Carteira Recomend...",https://www.moneytimes.com.br/mycap-tem-tres-n...,1
14405,"[futuros, minerio, ferro, registram, perdas, N...","['China', 'Commodities', 'Minério de ferro', '...",https://www.moneytimes.com.br/futuros-do-miner...,2
14406,"[acoes, investir, dezembro, modalmais, escala,...","['ABC Brasil', 'Ações', 'Banco do Brasil', 'Ca...",https://www.moneytimes.com.br/acoes-para-inves...,2
14407,"[indexados, tesouro, direto, apresentam, queda...","['Tesouro Direto', 'Tesouro Nacional', 'títulos']",https://www.sunoresearch.com.br/noticias/index...,0


### Stemização

#### Funções auxiliares

In [33]:
def stemizar_tokens_coluna_df(df: pd.DataFrame, column_name: str):
    
    df_stemized = df.copy()
    from nltk.stem import SnowballStemmer
    stemmer = SnowballStemmer('portuguese')
    df_stemized[column_name] = df_stemized[column_name].map(lambda tokens: [stemmer.stem(token) for token in tokens])
    return df_stemized

#### Etapa

In [34]:
train_stemizado = stemizar_tokens_coluna_df(train_sem_stopwords, 'title')
val_stemizado = stemizar_tokens_coluna_df(val_sem_stopwords, 'title')
test_stemizado = stemizar_tokens_coluna_df(test_sem_stopwords, 'title')
train_stemizado

Unnamed: 0,title,tags,link,label
0,"[ifix, fech, num, pont, alta, num]","['cotação do ifix', 'FII', 'FIIs', 'fundo de i...",https://www.sunoresearch.com.br/noticias/ifix-...,0
1,"[nissan, acus, ghosn, indic, fraud, oficializ]","['Fraudes', 'internacional']",https://www.sunoresearch.com.br/noticias/ghosn...,0
2,"[votaca, projet, mir, alta, combustiv, dev, fi...","['Combustíveis', 'ICMS', 'Inflação', 'Petrobra...",https://www.moneytimes.com.br/votacao-de-proje...,1
3,"[vend, titul, tesour, diret, caem, num, janeir]","['Tesouro Direto', 'Tesouro Nacional']",https://www.sunoresearch.com.br/noticias/venda...,0
4,"[agend, dia, petrobr, yduqs, oi, aes, tiet, ca...","['AES Tietê (TIET3/TIET4/TIET11)', 'agenda do ...",https://www.suno.com.br/noticias/agenda-do-dia...,1
...,...,...,...,...
14404,"[mycap, tres, nov, aco, carteir, recomend, men...","['Ações', 'B3', 'BR Malls', 'Carteira Recomend...",https://www.moneytimes.com.br/mycap-tem-tres-n...,1
14405,"[futur, mineri, ferr, registr, perd, num, dalian]","['China', 'Commodities', 'Minério de ferro', '...",https://www.moneytimes.com.br/futuros-do-miner...,2
14406,"[aco, invest, dezembr, modalm, escal, num, mel...","['ABC Brasil', 'Ações', 'Banco do Brasil', 'Ca...",https://www.moneytimes.com.br/acoes-para-inves...,2
14407,"[index, tesour, diret, apresent, qued, tax, re...","['Tesouro Direto', 'Tesouro Nacional', 'títulos']",https://www.sunoresearch.com.br/noticias/index...,0


### Resultado final da Normalização de textos

#### Funções auxiliares

In [35]:
def get_resultado_final(df: pd.DataFrame, column_name: str):
    df_resultado_final = df.copy()
    df_resultado_final[column_name] = df_resultado_final[column_name].map(lambda s: ' '.join(s))
    return df_resultado_final

#### Etapa

In [36]:
train_final = get_resultado_final(train_stemizado, 'title')
val_final = get_resultado_final(val_stemizado, 'title')
test_final = get_resultado_final(test_stemizado, 'title')
train_final

Unnamed: 0,title,tags,link,label
0,ifix fech num pont alta num,"['cotação do ifix', 'FII', 'FIIs', 'fundo de i...",https://www.sunoresearch.com.br/noticias/ifix-...,0
1,nissan acus ghosn indic fraud oficializ,"['Fraudes', 'internacional']",https://www.sunoresearch.com.br/noticias/ghosn...,0
2,votaca projet mir alta combustiv dev fic caranval,"['Combustíveis', 'ICMS', 'Inflação', 'Petrobra...",https://www.moneytimes.com.br/votacao-de-proje...,1
3,vend titul tesour diret caem num janeir,"['Tesouro Direto', 'Tesouro Nacional']",https://www.sunoresearch.com.br/noticias/venda...,0
4,agend dia petrobr yduqs oi aes tiet carrefour ...,"['AES Tietê (TIET3/TIET4/TIET11)', 'agenda do ...",https://www.suno.com.br/noticias/agenda-do-dia...,1
...,...,...,...,...
14404,mycap tres nov aco carteir recomend mensal vej,"['Ações', 'B3', 'BR Malls', 'Carteira Recomend...",https://www.moneytimes.com.br/mycap-tem-tres-n...,1
14405,futur mineri ferr registr perd num dalian,"['China', 'Commodities', 'Minério de ferro', '...",https://www.moneytimes.com.br/futuros-do-miner...,2
14406,aco invest dezembr modalm escal num melhor ind...,"['ABC Brasil', 'Ações', 'Banco do Brasil', 'Ca...",https://www.moneytimes.com.br/acoes-para-inves...,2
14407,index tesour diret apresent qued tax rentabil,"['Tesouro Direto', 'Tesouro Nacional', 'títulos']",https://www.sunoresearch.com.br/noticias/index...,0


## Extração de Caracteríosticas (Bag of Words)

In [37]:
train_final_sample = train_final.iloc[1096:1500]

### Count Vectorizer (Tipo contagem)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

cv_vec = CountVectorizer(ngram_range=(1, 1),  analyzer='word')
X_train_cv = cv_vec.fit_transform(train_final_sample['title']).toarray()

X_train_names_cv = pd.DataFrame(X_train_cv, columns=cv_vec.get_feature_names_out())
X_train_names_cv

Unnamed: 0,2t20,2t22,3t21,4t19,4t20,4t21,5g,abaix,abastec,abertur,...,whatsapp,wirecard,wis,wood,xi,xp,xpbr31,yduq3,yduqs,zon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
X_train_cv_vis = X_train_names_cv.copy()
X_train_cv_vis = X_train_cv_vis.loc[0:1]
X_train_cv_vis = X_train_cv_vis.replace(0, np.nan).dropna(axis=1, how='all')
X_train_cv_vis = X_train_cv_vis.replace(np.nan, 0)
X_train_cv_vis.astype(int)

Unnamed: 0,bilha,camp,chuv,comparec,dav,diz,econom,fiemg,forum,ger,min,mundial,nao,net,num,perd,pod
0,1,0,1,0,0,1,0,1,0,2,1,0,0,0,1,1,1
1,0,1,0,1,1,0,1,0,1,0,0,1,1,1,0,0,0


[0] "Chuvas em Minas Gerais podem gerar perdas de R$ 1,1 bilhão, diz Fiemg"
[1] "Davos: Campos Neto não comparecerá ao Fórum Econômico Mundial"

### TF-IDF Vectorizer (Tipo frequência)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), analyzer='word')
X_train_tfidf = tfidf_vec.fit_transform(train_final_sample['title']).toarray()

X_train_names_tfidf = pd.DataFrame(X_train_tfidf, columns=tfidf_vec.get_feature_names_out())
X_train_names_tfidf

Unnamed: 0,2t20,2t22,3t21,4t19,4t20,4t21,5g,abaix,abastec,abertur,...,whatsapp,wirecard,wis,wood,xi,xp,xpbr31,yduq3,yduqs,zon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
X_train_tfidf_vis = X_train_names_tfidf.copy()
X_train_tfidf_vis = X_train_tfidf_vis.loc[0:1]
X_train_tfidf_vis = X_train_tfidf_vis.replace(0, np.nan).dropna(axis=1, how='all')
X_train_tfidf_vis = X_train_tfidf_vis.replace(np.nan, 0)
X_train_tfidf_vis

Unnamed: 0,bilha,camp,chuv,comparec,dav,diz,econom,fiemg,forum,ger,min,mundial,nao,net,num,perd,pod
0,0.319008,0.0,0.319008,0.0,0.0,0.177897,0.0,0.35837,0.0,0.638016,0.261562,0.0,0.0,0.0,0.104763,0.306336,0.236842
1,0.0,0.299555,0.0,0.383883,0.383883,0.0,0.317055,0.0,0.383883,0.0,0.0,0.383883,0.270021,0.383883,0.0,0.0,0.0


[0] "Chuvas em Minas Gerais podem gerar perdas de R$ 1,1 bilhão, diz Fiemg"
[1] "Davos: Campos Neto não comparecerá ao Fórum Econômico Mundial"