# Library import

In [57]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

# Global Variables

In [58]:
# Ticker whose price history and news are turned into model features.
target = "MSFT"
# Date window passed to every yfinance download in this notebook.
stock_date_start, stock_date_end = "2009-11-29", "2019-12-31"

# 1. Data Preprocessing

In [59]:
# Daily price/volume history of the target ticker over the configured window.
stock_raw = yf.download(tickers=target, start=stock_date_start, end=stock_date_end)

[*********************100%***********************]  1 of 1 completed


In [60]:
# Dates across, price fields down: a quick look at the first and last trading days.
stock_raw.T

Date,2009-11-30,2009-12-01,2009-12-02,2009-12-03,2009-12-04,2009-12-07,2009-12-08,2009-12-09,2009-12-10,2009-12-11,...,2019-12-16,2019-12-17,2019-12-18,2019-12-19,2019-12-20,2019-12-23,2019-12-24,2019-12-26,2019-12-27,2019-12-30
Open,29.15,29.52,29.9,29.84,30.05,29.78,29.52,29.47,29.71,29.97,...,155.11,155.45,154.3,154.0,157.35,158.12,157.48,157.56,159.45,158.99
High,29.45,30.05,29.99,30.2,30.37,30.08,29.74,29.81,29.96,30.0,...,155.9,155.71,155.48,155.77,158.49,158.12,157.71,158.73,159.55,159.02
Low,29.0,29.41,29.65,29.76,29.83,29.68,29.38,29.25,29.66,29.79,...,154.82,154.45,154.18,153.75,156.29,157.27,157.12,157.4,158.22,156.73
Close,29.41,30.01,29.78,29.83,29.98,29.79,29.57,29.71,29.87,29.85,...,155.53,154.69,154.37,155.71,157.41,157.41,157.38,158.67,158.96,157.59
Adj Close,23.08554,23.55651,23.37597,23.41522,23.53296,23.38382,23.21113,23.32102,23.44662,23.43092,...,155.1063,154.2686,153.9495,155.2859,156.9812,156.9812,156.9513,158.2378,158.527,157.1607
Volume,44172000.0,49904200.0,36308600.0,43095200.0,58810700.0,38082700.0,37402200.0,44713300.0,45940200.0,43744200.0,...,24144200.0,25425600.0,24129200.0,24958900.0,53477500.0,17718200.0,8989200.0,14520600.0,18412800.0,16348400.0


## Financial Indicators Features

In [61]:
# Build the per-day feature tables for the target stock, plus the prediction labels.
# Columns of the original download: ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

# Work on a copy. The columns added below (and whatever StockDataFrame.retype
# does to the frame it is given) must not touch `stock_raw`, otherwise this
# cell cannot be re-run without downloading the data again.
stock_raw_data = stock_raw.copy()

# Intraday range and open-to-close move, in price units and relative to the close.
stock_raw_data['high_low_diff'] = stock_raw_data['High'] - stock_raw_data['Low']
stock_raw_data['open_close_diff'] = stock_raw_data['Open'] - stock_raw_data['Close']
stock_raw_data['high_low_diff_ratio'] = stock_raw_data['high_low_diff'] / stock_raw_data['Close']
stock_raw_data['open_close_diff_ratio'] = stock_raw_data['open_close_diff'] / stock_raw_data['Close']

# Technical indicators. The lookup below is evaluated only for its side effect:
# requesting the columns is what makes stockstats add them to the frame.
indicator_columns = [
    'close_1_d', 'close_7_d', 'close_30_d', 'change', 'open_delta', 'close_delta', 'volume_delta',
    'close_-2_r', 'close_-6_r', 'boll', 'boll_ub', 'boll_lb', 'boll_-1_d', 'boll_ub_-1_d', 'boll_lb_-1_d',
    'kdjk', 'kdjd', 'kdjj', 'macd', 'macds', 'macdh', 'rsi_6', 'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr',
]
stock_stats_data = stockstats.StockDataFrame.retype(stock_raw_data)
stock_stats_data[indicator_columns]

# Plain DataFrame without any row that contains a NaN, plus the distance
# between the Bollinger middle band and the close.
stock_data = (
    pd.DataFrame(stock_stats_data)
    .dropna()
    .assign(boll_k_diff=lambda frame: frame['boll'] - frame['close'])
)

# Binary labels: 1 where the close_N_d column is positive, else 0.
# NOTE(review): with stockstats' shift convention close_N_d looks like
# "close minus the close N rows later", which would make 1 mean the price
# fell. Confirm against the installed stockstats version.
label_abs_1d = stock_data['close_1_d'].gt(0).astype('int64')
label_abs_7d = stock_data['close_7_d'].gt(0).astype('int64')
label_abs_30d = stock_data['close_30_d'].gt(0).astype('int64')

# Real-valued labels. The close_N_s columns are not requested explicitly above;
# presumably stockstats creates them while computing close_N_d. TODO confirm.
label_value_1d = stock_data['close_1_s']
label_value_7d = stock_data['close_7_s']
label_value_30d = stock_data['close_30_s']

# Two training tables: one with only change/ratio/indicator features, and one
# that additionally carries the absolute price and volume columns.
relative_feature_columns = [
    'change', 'open_delta', 'close_delta', 'volume_delta', 'high_low_diff_ratio',
    'open_close_diff_ratio', 'close_-2_r', 'close_-6_r', 'kdjk', 'kdjd', 'kdjj', 'macd',
    'macds', 'macdh', 'rsi_6', 'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr',
    'boll_-1_d', 'boll_ub_-1_d', 'boll_lb_-1_d', 'boll_k_diff',
    'high_low_diff', 'open_close_diff',
]
absolute_price_columns = ['open', 'high', 'low', 'close', 'adj close', 'volume']

stock_without_absolute = stock_data[relative_feature_columns]
stock_with_absolute = stock_data[relative_feature_columns + absolute_price_columns]
print("Process data from: ", stock_with_absolute.index[0], " to ", stock_with_absolute.index[-1])
# NOTE(review): despite its name this holds the whole DataFrame, not its index.
final_index = stock_with_absolute

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
NOTE: Behavior of MACDH calculation has changed as of July 2017 - it is now 1/2 of previous calculated values


Process data from:  2009-12-08 00:00:00  to  2019-11-14 00:00:00


In [62]:
# Summary statistics of the stock-only feature table (before news and external features are added).
stock_with_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,boll_lb_-1_d,boll_k_diff,high_low_diff,open_close_diff,open,high,low,close,adj close,volume
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,...,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,0.074439,0.046859,0.04727,-7335.292,0.017341,-0.000235,0.147704,0.436701,57.119822,57.10607,...,0.042618,-0.437239,0.942222,-0.01026,54.937342,55.391143,54.448921,54.947602,50.905797,40165840.0
std,1.438169,0.879032,0.911218,20207800.0,0.009083,0.01136,1.99477,3.286971,22.227605,19.199233,...,0.2738,1.7246,0.811377,0.738079,32.262246,32.498733,31.957023,32.245552,33.439694,22986030.0
min,-11.399546,-6.469994,-6.099998,-229492300.0,0.00412,-0.053898,-12.380344,-14.475906,5.849243,11.387073,...,-2.261275,-8.076999,0.15,-5.419998,23.09,23.32,22.73,23.01,18.228165,7425600.0
25%,-0.674071,-0.290001,-0.280005,-6549600.0,0.011445,-0.006787,-0.931561,-1.365578,40.394887,43.08502,...,-0.051839,-1.213751,0.459999,-0.310001,29.530001,29.74,29.18,29.5125,24.267555,24594550.0
50%,0.053514,0.030001,0.02,-419450.0,0.01523,-0.000389,0.192823,0.52926,59.434207,58.800491,...,0.014187,-0.337751,0.669998,-0.019999,43.875,44.195,43.5,43.92,39.584156,34547050.0
75%,0.819278,0.370001,0.360001,6349025.0,0.020423,0.005703,1.261678,2.351975,76.371931,72.923998,...,0.125899,0.4545,1.110001,0.25,68.907503,69.425001,68.479998,69.030001,65.731054,49851100.0
max,10.452235,5.170006,6.599998,239117900.0,0.085416,0.061138,12.950242,15.401249,95.333442,92.557744,...,2.654123,12.149503,7.07,6.090004,147.020004,148.410004,147.0,148.059998,147.15596,319317900.0


## News Sentiment Features

In [63]:
# Prerequisite: run financial_news_data.ipynb first to scrape the news file loaded here.

# News items keyed by a "%m/%d/%Y" date string; each item is read through its
# 'news_title' and 'news_text' entries.
with open('./data/'+target+'/news_09-19.json') as json_file:
    news_data = json.load(json_file)

time_index = list(stock_with_absolute.index)
score = {}      # trading day -> mean polarity of the previous day's headlines
des_score = {}  # trading day -> mean polarity of the previous day's news texts
# The loop variable is deliberately not called `time`: that name would replace
# the `time` module imported at the top of the notebook.
for trading_day in time_index:
    # Calendar day before the trading day, formatted like the keys of news_data.
    # NOTE(review): for a Monday this is Sunday, so news dated Friday or
    # Saturday is not picked up by any trading day. Confirm this is intended.
    yesterday = (trading_day - timedelta(days=1)).strftime("%m/%d/%Y")
    day_news = news_data.get(yesterday)
    if not day_news:
        # No entry for that date, or an empty one: use a neutral score.
        # The emptiness check also prevents a division by zero below.
        score[trading_day] = 0.0
        des_score[trading_day] = 0.0
        continue

    sentiment = 0
    des_sentiment = 0
    for news in day_news:
        news_title = news['news_title'].replace('...', '')
        # Non-ASCII characters are stripped from the text before it is scored.
        news_des = news['news_text'].encode("ascii", "ignore").decode("ascii").replace('...', '')
        sentiment += TextBlob(news_title).sentiment.polarity
        des_sentiment += TextBlob(news_des).sentiment.polarity
    score[trading_day] = sentiment / len(day_news)
    des_score[trading_day] = des_sentiment / len(day_news)

des_score_series = pd.Series(des_score)
des_score_series.name = "news_des_score"
title_score_series = pd.Series(score)
title_score_series.name = "news_title_score"

# Append both score columns to the two feature tables (aligned on the date index).
stock_without_absolute = stock_without_absolute.join([title_score_series, des_score_series])
stock_with_absolute = stock_with_absolute.join([title_score_series, des_score_series])

In [64]:
# Summary statistics again, now including the news_title_score and news_des_score columns.
stock_with_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,high_low_diff,open_close_diff,open,high,low,close,adj close,volume,news_title_score,news_des_score
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,...,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,0.074439,0.046859,0.04727,-7335.292,0.017341,-0.000235,0.147704,0.436701,57.119822,57.10607,...,0.942222,-0.01026,54.937342,55.391143,54.448921,54.947602,50.905797,40165840.0,0.075608,0.087938
std,1.438169,0.879032,0.911218,20207800.0,0.009083,0.01136,1.99477,3.286971,22.227605,19.199233,...,0.811377,0.738079,32.262246,32.498733,31.957023,32.245552,33.439694,22986030.0,0.146826,0.127471
min,-11.399546,-6.469994,-6.099998,-229492300.0,0.00412,-0.053898,-12.380344,-14.475906,5.849243,11.387073,...,0.15,-5.419998,23.09,23.32,22.73,23.01,18.228165,7425600.0,-0.714286,-0.75
25%,-0.674071,-0.290001,-0.280005,-6549600.0,0.011445,-0.006787,-0.931561,-1.365578,40.394887,43.08502,...,0.459999,-0.310001,29.530001,29.74,29.18,29.5125,24.267555,24594550.0,0.0,0.0
50%,0.053514,0.030001,0.02,-419450.0,0.01523,-0.000389,0.192823,0.52926,59.434207,58.800491,...,0.669998,-0.019999,43.875,44.195,43.5,43.92,39.584156,34547050.0,0.034091,0.06721
75%,0.819278,0.370001,0.360001,6349025.0,0.020423,0.005703,1.261678,2.351975,76.371931,72.923998,...,1.110001,0.25,68.907503,69.425001,68.479998,69.030001,65.731054,49851100.0,0.14,0.157197
max,10.452235,5.170006,6.599998,239117900.0,0.085416,0.061138,12.950242,15.401249,95.333442,92.557744,...,7.07,6.090004,147.020004,148.410004,147.0,148.059998,147.15596,319317900.0,1.0,1.0


## External Features

### a) S&P 500

In [65]:
# S&P 500 index history over the same window as the target stock.
SP500_raw = yf.download(tickers="^GSPC", start=stock_date_start, end=stock_date_end)
# Same object as SP500_raw (no copy), so the columns added below appear under both names.
SP500_raw_data = SP500_raw

# Daily range and open-to-close move, in index points and relative to the close.
sp500_range = SP500_raw['High'] - SP500_raw['Low']
sp500_move = SP500_raw['Open'] - SP500_raw['Close']
SP500_raw_data['high_low_diff'] = sp500_range
SP500_raw_data['open_close_diff'] = sp500_move
SP500_raw_data['high_low_diff_ratio'] = sp500_range / SP500_raw['Close']
SP500_raw_data['open_close_diff_ratio'] = sp500_move / SP500_raw['Close']

# Change-based indicators from stockstats; the lookup is evaluated only so
# that stockstats adds these columns to the frame.
sp500_indicator_columns = ['change','close_delta','volume_delta', 'close_-2_r','close_-6_r']
SP500_stats_data = stockstats.StockDataFrame.retype(SP500_raw_data)
SP500_stats_data[sp500_indicator_columns]

# Plain DataFrame with every column name prefixed by 'sp500_'.
SP500_stock = pd.DataFrame(SP500_stats_data).add_prefix('sp500_')

[*********************100%***********************]  1 of 1 completed


In [66]:
# S&P 500 columns that are differences, ratios or percentage changes (no absolute index levels).
sp500_relative_columns = [
    'sp500_high_low_diff', 'sp500_open_close_diff',
    'sp500_high_low_diff_ratio', 'sp500_open_close_diff_ratio',
    'sp500_change', 'sp500_close_delta', 'sp500_volume_delta',
    'sp500_close_-2_r', 'sp500_close_-6_r',
]
# Restrict the S&P 500 rows to the days present in the stock feature table.
sp500_days = stock_with_absolute.index
sp500_relative_part = SP500_stock.loc[sp500_days, sp500_relative_columns]
sp500_full_part = SP500_stock.loc[sp500_days]

# The relative-only table gets the relative columns; the absolute table gets every sp500_ column.
stock_without_absolute = stock_without_absolute.join(sp500_relative_part)
stock_with_absolute = stock_with_absolute.join(sp500_full_part)

In [67]:
# Summary statistics of the relative-feature table after the sp500_ columns were joined.
stock_without_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,news_des_score,sp500_high_low_diff,sp500_open_close_diff,sp500_high_low_diff_ratio,sp500_open_close_diff_ratio,sp500_change,sp500_close_delta,sp500_volume_delta,sp500_close_-2_r,sp500_close_-6_r
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,...,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,0.074439,0.046859,0.04727,-7335.292,0.017341,-0.000235,0.147704,0.436701,57.119822,57.10607,...,0.087938,19.113069,-0.443317,0.010407,-0.000232,0.04562,0.796715,-330651.5,0.090709,0.269849
std,1.438169,0.879032,0.911218,20207800.0,0.009083,0.01136,1.99477,3.286971,22.227605,19.199233,...,0.127471,13.318713,15.47623,0.007214,0.008641,0.93313,17.339129,689555400.0,1.286836,2.116332
min,-11.399546,-6.469994,-6.099998,-229492300.0,0.00412,-0.053898,-12.380344,-14.475906,5.849243,11.387073,...,-0.75,3.680054,-104.579834,0.001456,-0.044604,-6.663446,-113.189941,-4995080000.0,-7.000929,-13.373268
25%,-0.674071,-0.290001,-0.280005,-6549600.0,0.011445,-0.006787,-0.931561,-1.365578,40.394887,43.08502,...,0.0,10.73999,-8.179932,0.005631,-0.004394,-0.328627,-6.202454,-297782500.0,-0.496023,-0.700018
50%,0.053514,0.030001,0.02,-419450.0,0.01523,-0.000389,0.192823,0.52926,59.434207,58.800491,...,0.06721,15.419922,-1.01001,0.008488,-0.000533,0.060568,1.054993,-10565000.0,0.156177,0.45918
75%,0.819278,0.370001,0.360001,6349025.0,0.020423,0.005703,1.261678,2.351975,76.371931,72.923998,...,0.157197,23.122498,5.91748,0.012834,0.003075,0.50668,9.25,311447500.0,0.773983,1.494375
max,10.452235,5.170006,6.599998,239117900.0,0.085416,0.061138,12.950242,15.401249,95.333442,92.557744,...,1.0,125.219971,104.01001,0.090227,0.070588,4.959374,116.599854,4299510000.0,6.428004,8.761593


### b) Gold

In [68]:
# History of the "GLD" ticker (used here as the gold series) over the same window.
gold_raw = yf.download(tickers="GLD", start=stock_date_start, end=stock_date_end)
# Same object as gold_raw (no copy), so the columns added below appear under both names.
gold_raw_data = gold_raw

# Daily range and open-to-close move, in price units and relative to the close.
gold_range = gold_raw['High'] - gold_raw['Low']
gold_move = gold_raw['Open'] - gold_raw['Close']
gold_raw_data['high_low_diff'] = gold_range
gold_raw_data['open_close_diff'] = gold_move
gold_raw_data['high_low_diff_ratio'] = gold_range / gold_raw['Close']
gold_raw_data['open_close_diff_ratio'] = gold_move / gold_raw['Close']

# Change-based indicators from stockstats; the lookup is evaluated only so
# that stockstats adds these columns to the frame.
gold_indicator_columns = ['change','close_delta','volume_delta', 'close_-2_r','close_-6_r']
gold_stats_data = stockstats.StockDataFrame.retype(gold_raw_data)
gold_stats_data[gold_indicator_columns]

# Plain DataFrame with every column name prefixed by 'gold_'.
gold_stock = pd.DataFrame(gold_stats_data).add_prefix('gold_')

[*********************100%***********************]  1 of 1 completed


In [69]:
# Gold columns that are differences, ratios or percentage changes (no absolute price levels).
gold_relative_columns = [
    'gold_high_low_diff', 'gold_open_close_diff',
    'gold_high_low_diff_ratio', 'gold_open_close_diff_ratio', 'gold_change',
    'gold_close_delta', 'gold_volume_delta', 'gold_close_-2_r',
    'gold_close_-6_r',
]
# Restrict the gold rows to the days present in the stock feature table.
gold_days = stock_with_absolute.index
gold_relative_part = gold_stock.loc[gold_days, gold_relative_columns]
gold_full_part = gold_stock.loc[gold_days]

# The relative-only table gets the relative columns; the absolute table gets every gold_ column.
stock_without_absolute = stock_without_absolute.join(gold_relative_part)
stock_with_absolute = stock_with_absolute.join(gold_full_part)

In [70]:
# Summary statistics of the relative-feature table after the gold_ columns were joined.
stock_without_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,sp500_close_-6_r,gold_high_low_diff,gold_open_close_diff,gold_high_low_diff_ratio,gold_open_close_diff_ratio,gold_change,gold_close_delta,gold_volume_delta,gold_close_-2_r,gold_close_-6_r
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,...,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,0.074439,0.046859,0.04727,-7335.292,0.017341,-0.000235,0.147704,0.436701,57.119822,57.10607,...,0.269849,1.203102,0.00546,0.009169,6.6e-05,0.012975,0.010172,-14915.35,0.025194,0.068703
std,1.438169,0.879032,0.911218,20207800.0,0.009083,0.01136,1.99477,3.286971,22.227605,19.199233,...,2.116332,0.840345,0.889768,0.005766,0.006542,0.984758,1.331456,5665652.0,1.364206,2.355535
min,-11.399546,-6.469994,-6.099998,-229492300.0,0.00412,-0.053898,-12.380344,-14.475906,5.849243,11.387073,...,-13.373268,0.209999,-6.689995,0.001707,-0.050678,-8.780826,-12.639999,-48153400.0,-13.068524,-14.06976
25%,-0.674071,-0.290001,-0.280005,-6549600.0,0.011445,-0.006787,-0.931561,-1.365578,40.394887,43.08502,...,-0.700018,0.670006,-0.377499,0.005415,-0.002941,-0.490806,-0.620001,-2728850.0,-0.722545,-1.320687
50%,0.053514,0.030001,0.02,-419450.0,0.01523,-0.000389,0.192823,0.52926,59.434207,58.800491,...,0.45918,0.980003,0.004997,0.007678,2.8e-05,0.036447,0.049992,-260050.0,0.057978,0.127849
75%,0.819278,0.370001,0.360001,6349025.0,0.020423,0.005703,1.261678,2.351975,76.371931,72.923998,...,1.494375,1.43,0.379997,0.010896,0.00301,0.518419,0.660004,2519625.0,0.827276,1.611773
max,10.452235,5.170006,6.599998,239117900.0,0.085416,0.061138,12.950242,15.401249,95.333442,92.557744,...,8.761593,9.589996,8.900009,0.058372,0.054173,4.903838,5.970001,47538600.0,5.469986,8.979403


### c) Bonds

In [71]:
def _bond_features(ticker, prefix):
    """Download one bond series and derive its prefixed feature table.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download`` (for example "^FVX").
    prefix : str
        Text prepended to every column name of the returned feature table.

    Returns
    -------
    tuple
        ``(raw, stats, features)``: the downloaded frame (modified in place
        with the four diff/ratio columns), the stockstats wrapper built from
        it, and a plain DataFrame without the 'volume' column whose column
        names carry ``prefix``.
    """
    raw = yf.download(ticker, start=stock_date_start, end=stock_date_end)

    # Daily range and open-to-close move, in absolute terms and relative to the close.
    raw['high_low_diff'] = raw['High'] - raw['Low']
    raw['open_close_diff'] = raw['Open'] - raw['Close']
    raw['high_low_diff_ratio'] = (raw['High'] - raw['Low']) / raw['Close']
    raw['open_close_diff_ratio'] = (raw['Open'] - raw['Close']) / raw['Close']

    # Change-based indicators; the lookup is evaluated only so that stockstats
    # adds the columns. No volume-based indicator is requested for bonds.
    stats = stockstats.StockDataFrame.retype(raw)
    stats[['change', 'close_delta', 'close_-2_r', 'close_-6_r']]

    features = pd.DataFrame(stats).drop(columns='volume').add_prefix(prefix)
    return raw, stats, features


# 5 year bonds
y5bond_raw, y5bond_stats_data, y5bond_stock = _bond_features("^FVX", "y5bond_")
y5bond_raw_data = y5bond_raw  # second name for the same (modified) frame

# 10 year bonds
y10bond_raw, y10bond_stats_data, y10bond_stock = _bond_features("^TNX", "y10bond_")
y10bond_raw_data = y10bond_raw  # second name for the same (modified) frame



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [72]:
# Bond columns that are differences, ratios or percentage changes (no absolute yield levels).
y10bond_relative_columns = [
    'y10bond_high_low_diff', 'y10bond_open_close_diff',
    'y10bond_high_low_diff_ratio', 'y10bond_open_close_diff_ratio',
    'y10bond_change', 'y10bond_close_delta', 'y10bond_close_-2_r',
    'y10bond_close_-6_r',
]
y5bond_relative_columns = [
    'y5bond_high_low_diff', 'y5bond_open_close_diff',
    'y5bond_high_low_diff_ratio', 'y5bond_open_close_diff_ratio',
    'y5bond_change', 'y5bond_close_delta', 'y5bond_close_-2_r',
    'y5bond_close_-6_r',
]

# Remove bond columns left over from an earlier run of this cell. Without this,
# re-running the cell makes pd.merge keep both copies under '_x'/'_y' suffixes
# instead of failing, which silently corrupts the feature tables.
stock_without_absolute = stock_without_absolute.drop(
    columns=y10bond_relative_columns + y5bond_relative_columns, errors="ignore"
)
stock_with_absolute = stock_with_absolute.drop(
    columns=list(y10bond_stock.columns) + list(y5bond_stock.columns), errors="ignore"
)

# Inner merges on the date index: only days present in the stock table and in
# both bond tables are kept, so the feature tables can lose rows here.
stock_without_absolute = pd.merge(stock_without_absolute, y10bond_stock[y10bond_relative_columns],
                                  how="inner", left_index=True, right_index=True)
stock_with_absolute = pd.merge(stock_with_absolute, y10bond_stock,
                               how="inner", left_index=True, right_index=True)

stock_without_absolute = pd.merge(stock_without_absolute, y5bond_stock[y5bond_relative_columns],
                                  how="inner", left_index=True, right_index=True)
stock_with_absolute = pd.merge(stock_with_absolute, y5bond_stock,
                               how="inner", left_index=True, right_index=True)

In [73]:
# True means at least one missing value is present in the corresponding feature table.
without_absolute_has_nan = stock_without_absolute.isna().to_numpy().any()
with_absolute_has_nan = stock_with_absolute.isna().to_numpy().any()
print("stock_without_absolute Null values? ", without_absolute_has_nan)
print("stock_with_absolute Null values? ", with_absolute_has_nan)

stock_without_absolute Null values?  False
stock_with_absolute Null values?  False


In [74]:
# Summary statistics of the relative-feature table after the bond columns were merged in.
stock_without_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,y10bond_close_-2_r,y10bond_close_-6_r,y5bond_high_low_diff,y5bond_open_close_diff,y5bond_high_low_diff_ratio,y5bond_open_close_diff_ratio,y5bond_change,y5bond_close_delta,y5bond_close_-2_r,y5bond_close_-6_r
count,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,...,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0,2481.0
mean,0.075545,0.049073,0.048956,143437.6,0.017359,-0.000237,0.149953,0.440537,57.208638,57.20853,...,-0.005599,-0.003106,0.051446,0.002287,0.035191,0.001803,0.040268,-0.000228,0.076973,0.244167
std,1.439234,0.875447,0.910422,20146340.0,0.009092,0.01135,1.993869,3.282888,22.197408,19.166271,...,3.016018,5.207423,0.042168,0.03711,0.029581,0.026286,3.242781,0.045671,4.506002,7.791991
min,-11.399546,-6.469994,-6.099998,-229492300.0,0.00412,-0.053898,-12.380344,-14.475906,8.463379,11.387073,...,-16.043699,-20.364967,0.0,-0.181,0.0,-0.147514,-14.468864,-0.188,-25.458899,-28.91933
25%,-0.674506,-0.290001,-0.280006,-6460000.0,0.011464,-0.006757,-0.932067,-1.370401,40.438621,43.223542,...,-1.777059,-3.22829,0.03,-0.017,0.018944,-0.010936,-1.752579,-0.028,-2.44127,-4.326332
50%,0.050863,0.030001,0.02,-357900.0,0.015238,-0.000386,0.186441,0.52848,59.456322,59.019476,...,-0.175977,-0.269266,0.043,0.003,0.02893,0.001488,0.0,0.0,-0.109708,-0.220266
75%,0.82248,0.370001,0.360001,6376400.0,0.020433,0.005704,1.267888,2.351033,76.430105,73.041141,...,1.603056,2.858476,0.063,0.023,0.043931,0.014925,1.706774,0.027,2.275193,3.931844
max,10.452235,5.170006,6.599998,239117900.0,0.085416,0.061138,12.950242,15.401249,95.333442,92.557744,...,13.694958,25.518784,1.349,0.234,0.84418,0.250535,16.634976,0.23,24.809877,41.527915


In [53]:
# Summary statistics of the table that also carries the absolute price/volume columns.
# NOTE(review): the saved output of this cell has execution count 53 and shows
# 11 rows, unlike the neighbouring cells. It looks stale; re-run the notebook
# top to bottom before relying on it.
stock_with_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,y5bond_close,y5bond_adj close,y5bond_high_low_diff,y5bond_open_close_diff,y5bond_high_low_diff_ratio,y5bond_open_close_diff_ratio,y5bond_change,y5bond_close_delta,y5bond_close_-2_r,y5bond_close_-6_r
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,1.051018,2.198181,2.932728,-878590.9,0.023524,-0.00579,1.716835,2.512464,65.011284,59.653905,...,0.367909,0.367909,0.029091,-0.003182,0.079142,-0.008474,1.255311,0.003909,1.701198,1.196854
std,1.743904,3.261737,4.887672,11690450.0,0.008188,0.01156,1.975065,4.227669,16.566217,12.880865,...,0.013678,0.013678,0.006818,0.016005,0.01875,0.043121,5.754515,0.020964,5.913426,7.587011
min,-1.620944,-3.709991,-4.72998,-26762200.0,0.014536,-0.020391,-1.551406,-3.81466,37.534866,42.776564,...,0.345,0.345,0.02,-0.029,0.0542,-0.078591,-7.980051,-0.032,-9.725687,-13.126493
25%,-0.15843,0.490021,-0.434998,-1854850.0,0.020366,-0.011748,0.123942,-0.583363,55.5857,48.179759,...,0.363,0.363,0.0245,-0.015,0.064012,-0.039018,-2.439021,-0.009,0.0,-3.434063
50%,1.414881,2.920013,4.089996,423600.0,0.021239,-0.009019,2.48165,3.601089,70.345502,59.174988,...,0.369,0.369,0.027,0.0,0.075362,0.0,1.373625,0.005,1.933702,2.168026
75%,2.494982,4.914978,6.899994,4932400.0,0.025078,-0.002702,2.742549,5.757807,78.199478,70.676407,...,0.369,0.369,0.035,0.0065,0.094851,0.017708,5.014569,0.0175,4.53769,7.564051
max,3.284523,5.889984,9.150024,14388200.0,0.045491,0.023333,5.463422,7.915148,83.538848,77.660894,...,0.401,0.401,0.039,0.028,0.107143,0.075881,10.164836,0.037,10.479041,10.164836


In [75]:
# Final list of feature names in the relative-feature table.
stock_without_absolute.columns

Index(['change', 'open_delta', 'close_delta', 'volume_delta',
       'high_low_diff_ratio', 'open_close_diff_ratio', 'close_-2_r',
       'close_-6_r', 'kdjk', 'kdjd', 'kdjj', 'macd', 'macds', 'macdh', 'rsi_6',
       'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr', 'boll_-1_d',
       'boll_ub_-1_d', 'boll_lb_-1_d', 'boll_k_diff', 'high_low_diff',
       'open_close_diff', 'news_title_score', 'news_des_score',
       'sp500_high_low_diff', 'sp500_open_close_diff',
       'sp500_high_low_diff_ratio', 'sp500_open_close_diff_ratio',
       'sp500_change', 'sp500_close_delta', 'sp500_volume_delta',
       'sp500_close_-2_r', 'sp500_close_-6_r', 'gold_high_low_diff',
       'gold_open_close_diff', 'gold_high_low_diff_ratio',
       'gold_open_close_diff_ratio', 'gold_change', 'gold_close_delta',
       'gold_volume_delta', 'gold_close_-2_r', 'gold_close_-6_r',
       'y10bond_high_low_diff', 'y10bond_open_close_diff',
       'y10bond_high_low_diff_ratio', 'y10bond_open_close_diff_rati

In [21]:
# Persist both feature tables as pickles in the target's data directory.
pd.to_pickle(stock_without_absolute, './data/'+target+'/stock_without_absolute.pkl')
pd.to_pickle(stock_with_absolute, './data/'+target+'/stock_with_absolute.pkl')

In [22]:
# Keep only the label rows whose dates are still present in the final feature table.
kept_days = stock_with_absolute.index

label_abs_1d, label_abs_7d, label_abs_30d = (
    labels.loc[kept_days] for labels in (label_abs_1d, label_abs_7d, label_abs_30d)
)

label_value_1d, label_value_7d, label_value_30d = (
    labels.loc[kept_days] for labels in (label_value_1d, label_value_7d, label_value_30d)
)

In [23]:
# Persist the binary labels (1/7/30 rows ahead) next to the feature tables.
pd.to_pickle(label_abs_1d, './data/'+target+'/label_abs_1d.pkl')
pd.to_pickle(label_abs_7d, './data/'+target+'/label_abs_7d.pkl')
pd.to_pickle(label_abs_30d, './data/'+target+'/label_abs_30d.pkl')

# Persist the real-valued labels for the same three horizons.
pd.to_pickle(label_value_1d, './data/'+target+'/label_value_1d.pkl')
pd.to_pickle(label_value_7d, './data/'+target+'/label_value_7d.pkl')
pd.to_pickle(label_value_30d, './data/'+target+'/label_value_30d.pkl')