In [2]:
import json
import numpy as np
import pandas as pd
import unidecode
from pandas.io.json import json_normalize
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import KeyedVectors

word_vec_model = KeyedVectors.load_word2vec_format('../datasets/word-vectors/wiki.pt/wiki.pt.vec')

word_vec_model

Slow version of gensim.models.doc2vec is being used


<gensim.models.keyedvectors.KeyedVectors at 0x11c0fae48>

## Creating the news dataframe
The first step is import the news dataset and creating a Pandas Dataframe

In [12]:
news_data_file = open('../datasets/raw-data/folha_de_sao_paulo_news.json', 'r')
news_json = json.load(news_data_file)

news_df = json_normalize(news_json['news'])

news_df.head(5)

Unnamed: 0,authors,categories,date_published,link,locations,news_body,sub_title,title
0,[thiago resende],"[mercado, indústria_4.0, tec, folhainvest, mer...",2019-06-27 21:01:00,https://www1.folha.uol.com.br/mercado/2019/06/...,[brasília],"uma conjunção de fatores, como de categorias, ...","perto da reta final em comissão na câmara, pro...","desarticulação, pressão de partidos e lobby de..."
1,"[camila mattoso, ranier bragon]","[poder, governo_bolsonaro, lava_jato, legislat...",2019-06-27 07:54:00,https://www1.folha.uol.com.br/poder/2019/06/pf...,[brasília],a polícia federal prendeu na manhã desta quint...,ação deflagrada mira candidaturas laranjas pat...,pf prende assessores de ministro do turismo em...
2,[NO AUTHOR],"[mercado, indústria_4.0, tec, folhainvest, mer...",2019-06-27 10:41:00,https://www1.folha.uol.com.br/mercado/2019/06/...,"[paris, reuters]","o presidente francês, emmanuel macron, afirmou...",frança está preocupada com impacto sobre sua i...,macron diz que não terá acordo com mercosul se...
3,[júlia moura],"[mercado, indústria_4.0, tec, folhainvest, mer...",2019-06-27 17:33:00,https://www1.folha.uol.com.br/mercado/2019/06/...,[são_paulo],o e a indecisão sobre a inclusão de estados e ...,"ibovespa chegou a recuar 1,26% nesta quinta, m...",maia contém apreensão com atraso na previdênci...
4,[daniel carvalho],"[mercado, indústria_4.0, tec, folhainvest, mer...",2019-06-27 16:58:00,https://www1.folha.uol.com.br/mercado/2019/06/...,[brasília],"o ministro da economia, paulo guedes, disse ne...",presidente do senado disse que haverá conversa...,guedes defende estados na previdência para que...


In [13]:
news_df = news_df[['date_published', 'title']]

news_df.head(5)

Unnamed: 0,date_published,title
0,2019-06-27 21:01:00,"desarticulação, pressão de partidos e lobby de..."
1,2019-06-27 07:54:00,pf prende assessores de ministro do turismo em...
2,2019-06-27 10:41:00,macron diz que não terá acordo com mercosul se...
3,2019-06-27 17:33:00,maia contém apreensão com atraso na previdênci...
4,2019-06-27 16:58:00,guedes defende estados na previdência para que...


In [14]:
def embed_text(text):
    if not text:
        return 0
    
    sentences = [t for t in sent_tokenize(text, language='portuguese')]
    vectors = []
    for sentence in sentences:
        for token in sentence:
            if token in word_vec_model.vocab:
                vectors.append(word_vec_model[token])
                
    news_title_centroid = np.sum(vectors, axis=0) / len(vectors[0])
    return news_title_centroid

news_df['news_title_embedded'] = news_df.title.apply(embed_text)


In [31]:
for i in range(300):
    column_name = 'dimension_' + str(i)
    news_df[column_name] = news_df.apply(lambda row: format(row['news_title_embedded'][i], '.15f'), axis=1)
#     value = news_df['news_title_embedded'][i]
#     print(value)
#     news_df[column_name] = value

news_df.head(5)

Unnamed: 0,date_published,title,news_title_embedded,dimension_0,dimension_1,dimension_2,dimension_3,dimension_4,dimension_5,dimension_6,...,dimension_290,dimension_291,dimension_292,dimension_293,dimension_294,dimension_295,dimension_296,dimension_297,dimension_298,dimension_299
0,2019-06-27 21:01:00,"desarticulação, pressão de partidos e lobby de...","[0.0024119124, 0.017807769, -0.0808521, -0.020...",0.002411912428215,0.017807768657804,-0.080852098762989,-0.020943040028214,0.00996917206794,0.011117429472506,0.017982227727771,...,-0.005528440698981,0.09079746901989,0.018090328201652,0.054718483239412,-0.051992207765579,0.016649156808853,0.021403668448329,-0.010561862029135,-0.047357089817524,-0.028788531199098
1,2019-06-27 07:54:00,pf prende assessores de ministro do turismo em...,"[0.010801417, 0.0071813343, -0.064497106, -0.0...",0.010801416821778,0.00718133430928,-0.064497105777264,-0.020461045205593,0.016587099060416,0.014774425886571,0.023906799033284,...,-0.022600866854191,0.07350942492485,0.020824460312724,0.040061727166176,-0.031220989301801,0.005208480171859,0.016351629048586,-0.013720291666687,-0.033963970839977,-0.024018267169595
2,2019-06-27 10:41:00,macron diz que não terá acordo com mercosul se...,"[0.01132366, 0.013236635, -0.06111137, -0.0217...",0.011323659680784,0.013236635364592,-0.061111368238926,-0.021744018420577,0.006587916985154,0.015891492366791,0.012549749575555,...,0.008947509340942,0.070754364132881,0.013162839226425,0.045384187251329,-0.048811923712492,0.010775921866298,0.015192896127701,-0.008698048070073,-0.045283153653145,-0.023791687563062
3,2019-06-27 17:33:00,maia contém apreensão com atraso na previdênci...,"[-0.0017408186, 0.0095127635, -0.064294055, -0...",-0.001740818610415,0.009512763470411,-0.064294055104256,-0.025135856121778,0.007809324190021,0.014931591227651,0.014146867208183,...,-0.002240774454549,0.06767699867487,0.01073313318193,0.038554009050131,-0.043319083750248,0.012472147122025,0.009228114970028,-0.012373716570437,-0.038383673876524,-0.017972139641643
4,2019-06-27 16:58:00,guedes defende estados na previdência para que...,"[-0.003856063, 0.013680325, -0.09695509, -0.03...",-0.003856062889099,0.013680324889719,-0.096955090761185,-0.031713150441647,0.00848706625402,0.01936960965395,0.024848597124219,...,-0.016506308689713,0.10003536939621,0.021710079163313,0.052848432213068,-0.055315069854259,0.014186220243573,0.014087217859924,-0.015086040832102,-0.045277182012796,-0.026450065895915


In [24]:

dimensions_lables = ['dimension_' + str(i) for i in range(300)]

embedded_news_titles = 
embedded_dataframe = pd.DataFrame(data=dict(zip(dimensions_lables,embedded_news_titles)))
embedded_dataframe.head(5)

Unnamed: 0,dimension_0,dimension_1,dimension_2,dimension_3,dimension_4,dimension_5,dimension_6,dimension_7,dimension_8,dimension_9,...,dimension_290,dimension_291,dimension_292,dimension_293,dimension_294,dimension_295,dimension_296,dimension_297,dimension_298,dimension_299
0,0.002412,0.010801,0.011324,-0.001741,-0.003856,0.012267,0.007209,0.001701,-0.000498,0.013692,...,4.9e-05,0.001691,-0.002287,-0.003851,0.000327,-0.003968,0.010142,0.006111,0.002777,0.006914
1,0.017808,0.007181,0.013237,0.009513,0.01368,0.013769,0.016749,0.013176,0.015717,0.010651,...,0.01068,0.017439,0.00802,0.019904,0.017428,0.011945,0.015189,0.019287,0.014585,0.019124
2,-0.080852,-0.064497,-0.061111,-0.064294,-0.096955,-0.065313,-0.064896,-0.063416,-0.068635,-0.065386,...,-0.062848,-0.083465,-0.075255,-0.075312,-0.06685,-0.063402,-0.064433,-0.081898,-0.062563,-0.077577
3,-0.020943,-0.020461,-0.021744,-0.025136,-0.031713,-0.023776,-0.015341,-0.025896,-0.029268,-0.025495,...,-0.021473,-0.030983,-0.030823,-0.023702,-0.027272,-0.026234,-0.022236,-0.03073,-0.02024,-0.023136
4,0.009969,0.016587,0.006588,0.007809,0.008487,0.011306,-0.000405,0.008934,0.012294,0.009364,...,0.012307,0.011502,0.009913,0.014571,0.005957,0.006928,0.012491,0.009496,0.011444,0.008603


In [6]:
news_df = news_df.set_index("date_published")
news_df.head(5)

Unnamed: 0_level_0,title,news_title_embedded
date_published,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-06-27 21:01:00,"desarticulação, pressão de partidos e lobby de...","[0.0024119124, 0.017807769, -0.0808521, -0.020..."
2019-06-27 07:54:00,pf prende assessores de ministro do turismo em...,"[0.010801417, 0.0071813343, -0.064497106, -0.0..."
2019-06-27 10:41:00,macron diz que não terá acordo com mercosul se...,"[0.01132366, 0.013236635, -0.06111137, -0.0217..."
2019-06-27 17:33:00,maia contém apreensão com atraso na previdênci...,"[-0.0017408186, 0.0095127635, -0.064294055, -0..."
2019-06-27 16:58:00,guedes defende estados na previdência para que...,"[-0.003856063, 0.013680325, -0.09695509, -0.03..."


AttributeError: 'Series' object has no attribute 'news_title_embedded'

In [78]:
#news_df.to_csv('../datasets/pre-processed/news.csv', sep=';', index= False, encoding='utf-8')
dolar_rates_df = pd.read_csv('../datasets/pre-processed/dolar_rates.csv')

dolar_rates_df = dolar_rates_df.set_index("quote_timestamp")

dolar_rates_df.tail(5)

Unnamed: 0_level_0,variation
quote_timestamp,Unnamed: 1_level_1
2019-06-28 10:02:17.893,down
2019-06-28 11:05:18.692,up
2019-06-28 12:06:28.871,down
2019-06-28 13:06:29.675,up
2019-06-28 13:06:29.684,down


In [79]:
dolar_rates_df = dolar_rates_df.reindex(news_df.index, method='backfill')

dolar_rates_df = dolar_rates_df.loc[~dolar_rates_df.index.duplicated(keep='first')]

dolar_rates_df.head(10)

Unnamed: 0_level_0,variation
date_published,Unnamed: 1_level_1
2019-06-27 21:01:00,down
2019-06-27 07:54:00,up
2019-06-27 10:41:00,up
2019-06-27 17:33:00,down
2019-06-27 16:58:00,down
2019-06-27 10:11:00,up
2019-06-27 14:50:00,down
2019-06-27 18:11:00,down
2019-06-27 15:03:00,down
2019-06-27 11:34:00,down


In [80]:
news_df = news_df.join(dolar_rates_df)
news_df = news_df.reset_index()
news_df = news_df.drop(['date_published', 'title'], axis=1)
news_df = news_df.dropna(subset=['variation'])

news_df.tail(5)

Unnamed: 0,news_title_embedded,variation
4209,"[[0.058721, 0.26045, -0.48566, -0.030724, -0.1...",down
4210,"[[-0.42465, -0.086821, -0.50489, 0.042979, -0....",down
4211,"[[0.42724, -0.31685, -0.25367, -0.14777, 0.170...",down
4212,"[[0.018589, 0.0042079, -0.39838, 0.0037556, 0....",down
4213,"[[0.018589, 0.0042079, -0.39838, 0.0037556, 0....",down


In [96]:
#features = news_df.drop(['variation'], axis = 1)
features = news_df['news_title_embedded']
#features = np_utils.normalize(features, axis=-1, order=2)

features[0]

[array([ 0.42724  , -0.31685  , -0.25367  , -0.14777  ,  0.17041  ,
         0.26962  ,  0.070469 ,  0.090935 , -0.28564  , -0.19999  ,
         0.24735  , -0.029271 ,  0.033065 , -0.13963  , -0.12631  ,
         0.34787  , -0.20253  , -0.12504  ,  0.13448  , -0.19695  ,
         0.27438  , -0.16255  ,  0.23316  ,  0.24294  ,  0.18979  ,
         0.1358   , -0.21664  , -0.0085868,  0.064912 , -0.58563  ,
        -0.082001 ,  0.2252   , -0.17097  , -0.54094  ,  0.64383  ,
        -0.067617 , -0.18942  ,  0.076536 , -0.15135  , -0.36522  ,
         0.079728 ,  0.26588  ,  0.21666  , -0.2796   , -0.37752  ,
         0.23482  ,  0.38395  , -0.22039  ,  0.22961  , -0.12516  ,
         0.32822  , -0.25515  ,  0.23504  ,  0.54664  ,  0.13122  ,
        -0.6237   ,  0.23645  , -0.2938   , -0.4775   , -0.62636  ,
         0.024095 , -0.036731 , -0.067262 , -0.40248  , -0.16506  ,
         0.050363 , -0.12291  , -0.21365  , -0.58058  ,  0.083504 ,
        -0.24812  , -0.3013   ,  0.35229  , -0.4

In [97]:
news_df['variation'] = news_df['variation'].apply(lambda variation: 2 if variation == 'up' else (0 if variation == 'down' else 1))
labels = news_df[['variation']].to_numpy()

labels

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [98]:
np.savetxt("../datasets/final-data/features.csv", features, delimiter=",")
#np.savetxt("../datasets/final-data/labels.csv", labels, delimiter=",")
#features.dtype

TypeError: Mismatch between array dtype ('object') and format specifier ('%.18e')