# Labeling the news with the dollar variation

Now we ave to label each news with the respective dollar variation

## Importing and indexing the dollar dataframe

In [8]:
import numpy as np
import pandas as pd
from tensorflow import keras

from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt 

dolar_rates_df = pd.read_csv('../datasets/pre-processed/dolar_rates.csv')

dolar_rates_df = dolar_rates_df.set_index("quote_timestamp")

dolar_rates_df.tail(5)

Unnamed: 0_level_0,variation
quote_timestamp,Unnamed: 1_level_1
2019-06-28 10:02:17.893,down
2019-06-28 11:05:18.692,up
2019-06-28 12:06:28.871,down
2019-06-28 13:06:29.675,up
2019-06-28 13:06:29.684,down


## Importing and indexing the news dataframe

In [31]:
news_df = pd.read_csv('../datasets/pre-processed/news.csv')
news_df = news_df.set_index("date_published")

news_df['news_title_embedded'].to_list()

['[[ 0.018589   0.0042079 -0.39838   ...  0.13387   -0.82486   -0.38152  ]\n [ 0.089803   0.14945   -0.33631   ... -0.062494  -0.13707   -0.013667 ]\n [ 0.26299   -0.13228   -0.22075   ... -0.17962    0.085051  -0.045975 ]\n ...\n [-0.018302   0.039233  -0.12613   ...  0.039955  -0.57875    0.0014487]\n [-0.036049   0.33402   -0.24751   ... -0.032348  -0.52316   -0.30314  ]\n [-0.20458   -0.0065224 -0.28982   ... -0.074011  -0.071294   0.039239 ]]',
 '[[-0.74519    0.025798  -0.97006   ... -0.026199  -0.05939    0.094255 ]\n [ 0.0099654  0.27158   -0.61198   ... -0.2139    -0.79553   -0.34838  ]\n [-0.74519    0.025798  -0.97006   ... -0.026199  -0.05939    0.094255 ]\n ...\n [-0.74519    0.025798  -0.97006   ... -0.026199  -0.05939    0.094255 ]\n [ 0.26299   -0.13228   -0.22075   ... -0.17962    0.085051  -0.045975 ]\n [ 0.10742    0.21373   -0.013338  ... -0.25815   -0.10925   -0.23489  ]]',
 '[[ 0.42724   -0.31685   -0.25367   ... -0.31065   -0.078432   0.044734 ]\n [-0.20458   -0.

## Reindexing the dollar dataframe by the nearest news timestamp

In [10]:
dolar_rates_df = dolar_rates_df.reindex(news_df.index, method='backfill')

dolar_rates_df = dolar_rates_df.loc[~dolar_rates_df.index.duplicated(keep='first')]

dolar_rates_df.head(10)

Unnamed: 0_level_0,variation
date_published,Unnamed: 1_level_1
2019-06-27 21:01:00,down
2019-06-27 07:54:00,up
2019-06-27 10:41:00,up
2019-06-27 17:33:00,down
2019-06-27 16:58:00,down
2019-06-27 10:11:00,up
2019-06-27 14:50:00,down
2019-06-27 18:11:00,down
2019-06-27 15:03:00,down
2019-06-27 11:34:00,down


## Joining the two dataframes by their indexes

In [11]:
news_df = news_df.join(dolar_rates_df)
news_df = news_df.reset_index()
news_df = news_df.drop(['date_published', 'title'], axis=1)
news_df = news_df.dropna(subset=['variation'])

news_df.tail(5)

Unnamed: 0,news_title_embedded,variation
4209,[[ 0.058721 0.26045 -0.48566 ... 0.044547 ...,down
4210,[[-0.42465 -0.086821 -0.50489 ... -0.0996...,down
4211,[[ 0.42724 -0.31685 -0.25367 ... -0.3106...,down
4212,[[ 0.018589 0.0042079 -0.39838 ... 0.1338...,down
4213,[[ 0.018589 0.0042079 -0.39838 ... 0.1338...,down


In [26]:
features = news_df.drop(['variation'], axis = 1)
features = news_df['news_title_embedded'].to_list()
#features = np_utils.normalize(features, axis=-1, order=2)

features

['[[ 0.42724   -0.31685   -0.25367   ... -0.31065   -0.078432   0.044734 ]\n [ 0.089803   0.14945   -0.33631   ... -0.062494  -0.13707   -0.013667 ]\n [-0.036049   0.33402   -0.24751   ... -0.032348  -0.52316   -0.30314  ]\n ...\n [ 0.018589   0.0042079 -0.39838   ...  0.13387   -0.82486   -0.38152  ]\n [ 0.27388    0.11287   -0.090481  ...  0.032811  -0.025585  -0.21247  ]\n [ 0.083144  -0.085586  -0.37444   ...  0.4959     0.41419   -0.27082  ]]',
 '[[-0.018302   0.039233  -0.12613   ...  0.039955  -0.57875    0.0014487]\n [-0.20458   -0.0065224 -0.28982   ... -0.074011  -0.071294   0.039239 ]\n [-0.090633  -0.042767  -0.32836   ... -0.20081   -0.35889    0.11824  ]\n ...\n [-0.090633  -0.042767  -0.32836   ... -0.20081   -0.35889    0.11824  ]\n [ 0.27388    0.11287   -0.090481  ...  0.032811  -0.025585  -0.21247  ]\n [ 0.058721   0.26045   -0.48566   ...  0.044547  -0.27553   -0.32517  ]]',
 '[[-3.4224e-01 -5.8811e-04 -2.7114e-01 ... -7.8284e-03 -1.5071e-01\n  -1.0188e-01]\n [-2.04

In [24]:
news_df['variation'] = news_df['variation'].apply(lambda variation: 2 if variation == 'up' else (0 if variation == 'down' else 1))
labels = news_df[['variation']].to_numpy()
# encoder = LabelEncoder()
# encoder.fit(labels)
# labels = encoder.transform(labels)
# labels = np_utils.to_categorical(labels)
labels
#plt.hist(labels)

array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [23]:
np.savetxt("../datasets/final-data/features.csv", features, delimiter=",")
np.savetxt("../datasets/final-data/labels.csv", labels, delimiter=",")

TypeError: Mismatch between array dtype ('object') and format specifier ('%.18e,%.18e')