# Library import

In [26]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

# Global Variables

In [27]:
# Date window ("YYYY-MM-DD" strings) passed as start=/end= to every yf.download call in this notebook.
stock_date_start = "2009-11-29"
stock_date_end = "2019-12-31"
# Ticker whose features and labels are built; also used as the sub-directory name under ./data/.
target = "MSFT"

# 1. Data Preprocessing

In [39]:
# Download the price history of `target` for the configured date window.
stock_raw = yf.download(target, start=stock_date_start, end=stock_date_end)

[*********************100%***********************]  1 of 1 completed


In [40]:
# Show the downloaded frame with dates as columns and price fields as rows.
stock_raw.T

Date,2020-04-14,2020-04-15,2020-04-16,2020-04-17,2020-04-20,2020-04-21,2020-04-22,2020-04-23,2020-04-24,2020-04-27,2020-04-28,2020-04-29,2020-04-30,2020-05-01,2020-05-04,2020-05-05,2020-05-06,2020-05-07
Open,280.0,282.4,287.38,284.69,277.95,276.28,273.61,275.87,277.2,281.8,285.08,284.73,289.96,286.25,289.17,295.06,300.46,303.22
High,288.25,286.33,288.2,286.95,281.68,277.25,277.9,281.75,283.01,284.54,285.83,289.67,294.53,299.0,293.69,301.0,303.24,305.17
Low,278.05,280.63,282.35,276.86,276.85,265.43,272.2,274.87,277.0,279.95,278.2,283.89,288.35,285.85,286.32,294.46,298.87,301.97
Close,287.05,284.43,286.69,282.8,276.93,268.37,276.1,275.03,282.97,283.17,278.58,287.73,293.8,289.07,293.16,297.56,300.63,303.74
Adj Close,287.05,284.43,286.69,282.8,276.93,268.37,276.1,275.03,282.97,283.17,278.58,287.73,293.8,289.07,293.16,297.56,300.63,303.74
Volume,48748700.0,32788600.0,39281300.0,53812500.0,32503800.0,45247900.0,29264300.0,31203600.0,31627200.0,29271900.0,28001200.0,34320200.0,45766000.0,60154200.0,33392000.0,36937800.0,35583400.0,28690300.0


## Financial Indicators Features

In [41]:
# Build the feature tables and the prediction labels for `target` from the downloaded prices.
# origin data features: ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

# Work on a copy: the column assignments below (and possibly retype(), which may rename
# columns in place -- TODO confirm for the installed stockstats) would otherwise alter
# `stock_raw`, so re-running this cell without re-downloading could fail on 'High'/'Low'.
stock_raw_data = stock_raw.copy()

# Add high_low_diff, open_close_diff, high_low_diff_ratio, open_close_diff_ratio
stock_raw_data['high_low_diff'] = stock_raw_data['High'] - stock_raw_data['Low']
stock_raw_data['open_close_diff'] = stock_raw_data['Open'] - stock_raw_data['Close']
stock_raw_data['high_low_diff_ratio'] = stock_raw_data['high_low_diff'] / stock_raw_data['Close']
stock_raw_data['open_close_diff_ratio'] = stock_raw_data['open_close_diff'] / stock_raw_data['Close']

# Add financial indicators: indexing the StockDataFrame with indicator names makes
# stockstats compute those columns and keep them on the frame. The selection result
# itself is intentionally discarded.
stock_stats_data = stockstats.StockDataFrame.retype(stock_raw_data)
stock_stats_data[['close_1_d', 'close_7_d', 'close_30_d','change', 'open_delta','close_delta','volume_delta',
       'close_-2_r','close_-6_r', 'boll', 'boll_ub', 'boll_lb', 'boll_-1_d', 'boll_ub_-1_d', 'boll_lb_-1_d' ,
       'kdjk','kdjd','kdjj', 'macd','macds','macdh', 'rsi_6', 'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr']]

# Convert to a plain DataFrame, drop every row that has a missing value in any column,
# then add the gap between the `boll` column and the close.
stock_data = pd.DataFrame(stock_stats_data).dropna()
stock_data['boll_k_diff'] = stock_data['boll'] - stock_data['close']

# Generate absolute label (namely 0,1) and non-absolute label (real price).
# An absolute label is 1 when the matching `close_N_d` value is strictly positive, else 0.
# NOTE(review): confirm the sign convention of stockstats' `close_N_d` (current close
# minus shifted close, or the reverse) so that 1 really encodes the intended direction.
label_abs_1d = (stock_data['close_1_d'] > 0).astype('int64')
label_abs_7d = (stock_data['close_7_d'] > 0).astype('int64')
label_abs_30d = (stock_data['close_30_d'] > 0).astype('int64')

# The `close_N_s` columns are not requested explicitly above; presumably stockstats
# creates them as a by-product of `close_N_d` -- verify if the library is upgraded.
label_value_1d = stock_data['close_1_s']
label_value_7d = stock_data['close_7_s']
label_value_30d = stock_data['close_30_s']

# Training datasets with or without absolute data. The shared feature list is written
# once so the two tables cannot drift apart.
relative_features = ['change', 'open_delta','close_delta','volume_delta', 'high_low_diff_ratio',
                     'open_close_diff_ratio','close_-2_r','close_-6_r','kdjk','kdjd','kdjj', 'macd',
                     'macds', 'macdh', 'rsi_6', 'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr',
                     'boll_-1_d','boll_ub_-1_d', 'boll_lb_-1_d', 'boll_k_diff',
                     'high_low_diff', 'open_close_diff']
absolute_features = ['open', 'high', 'low', 'close', 'adj close', 'volume']

stock_without_absolute = stock_data[relative_features]
stock_with_absolute = stock_data[relative_features + absolute_features]
print("Process data from: ", stock_with_absolute.index[0], " to ", stock_with_absolute.index[-1])
# NOTE(review): despite its name this binds the whole DataFrame, not its index; left
# unchanged because code outside this cell may depend on it.
final_index = stock_with_absolute

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
NOTE: Behavior of MACDH calculation has changed as of July 2017 - it is now 1/2 of previous calculated values


                  open        high         low       close   adj close  \
Date                                                                     
2020-04-22  273.609985  277.899994  272.200012  276.100006  276.100006   
2020-04-23  275.869995  281.750000  274.869995  275.029999  275.029999   
2020-04-24  277.200012  283.010010  277.000000  282.970001  282.970001   
2020-04-27  281.799988  284.540009  279.950012  283.170013  283.170013   
2020-04-28  285.079987  285.829987  278.200012  278.579987  278.579987   
2020-04-29  284.730011  289.670013  283.890015  287.730011  287.730011   
2020-04-30  289.959991  294.529999  288.350006  293.799988  293.799988   
2020-05-01  286.250000  299.000000  285.850006  289.070007  289.070007   
2020-05-04  289.170013  293.690002  286.320007  293.160004  293.160004   
2020-05-05  295.059998  301.000000  294.459991  297.559998  297.559998   
2020-05-06  300.459991  303.239990  298.869995  300.630005  300.630005   

              volume  high_low_diff  

In [42]:
# Summary statistics of the feature table that includes the absolute price/volume columns.
stock_with_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,boll_lb_-1_d,boll_k_diff,high_low_diff,open_close_diff,open,high,low,close,adj close,volume
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,1.051018,2.198181,2.932728,-878590.9,0.023524,-0.00579,1.716835,2.512464,65.011284,59.653905,...,0.123543,-5.477497,6.74545,-1.691822,285.380907,290.378182,283.632732,287.072729,287.072729,35956530.0
std,1.743904,3.261737,4.887672,11690450.0,0.008188,0.01156,1.975065,4.227669,16.566217,12.880865,...,0.553017,7.057592,2.35426,3.267072,8.163458,8.488246,8.236644,8.661021,8.661021,9413822.0
min,-1.620944,-3.709991,-4.72998,-26762200.0,0.014536,-0.020391,-1.551406,-3.81466,37.534866,42.776564,...,-0.609704,-15.684712,4.369995,-5.769989,273.609985,277.899994,272.200012,275.029999,275.029999,28001200.0
25%,-0.15843,0.490021,-0.434998,-1854850.0,0.020366,-0.011748,0.123942,-0.583363,55.5857,48.179759,...,-0.311132,-11.041049,5.73999,-3.419998,279.5,283.775009,277.600006,280.774994,280.774994,30237750.0
50%,1.414881,2.920013,4.089996,423600.0,0.021239,-0.009019,2.48165,3.601089,70.345502,59.174988,...,0.051098,-6.732867,6.179993,-2.5,285.079987,289.670013,283.890015,287.730011,287.730011,33392000.0
75%,2.494982,4.914978,6.899994,4932400.0,0.025078,-0.002702,2.742549,5.757807,78.199478,70.676407,...,0.436645,-0.601639,7.125,-0.77002,289.565002,296.764999,287.335007,293.479996,293.479996,36260600.0
max,3.284523,5.889984,9.150024,14388200.0,0.045491,0.023333,5.463422,7.915148,83.538848,77.660894,...,1.02605,4.644997,13.149994,6.5,300.459991,303.23999,298.869995,300.630005,300.630005,60154200.0


## News Sentiment Features

In [7]:
# First run financial_news_data.ipynb to scrape financial news

# Load the scraped news. The lookups below use it as a mapping from "MM/DD/YYYY" date
# strings to lists of items that each carry 'news_title' and 'news_text'.
with open('./data/'+target+'/news_09-19.json') as json_file:
    news_data = json.load(json_file)

# For every row date of the feature table, average the TextBlob polarity of the news
# published on the previous calendar day (titles and texts scored separately).
# NOTE(review): if the index holds trading days only, Monday rows use Sunday's news and
# Friday/Saturday news is never used -- confirm this lag is intended.
time_index = list(stock_with_absolute.index)
score = {}
des_score = {}
# The loop variable is deliberately not called `time`, which would rebind the `time`
# module imported at the top of the notebook.
for trade_day in time_index:
    yesterday = (trade_day - timedelta(days=1)).strftime("%m/%d/%Y")
    day_news = news_data.get(yesterday)
    if not day_news:
        # No entry, or an empty list, for that date: neutral score instead of dividing by zero.
        score[trade_day] = 0.0
        des_score[trade_day] = 0.0
        continue

    sentiment = 0
    des_sentiment = 0
    for news in day_news:
        news_title = news['news_title'].replace('...', '')
        # Non-ASCII characters are dropped from the text before scoring.
        news_des = news['news_text'].encode("ascii", "ignore").decode("ascii").replace('...', '')
        sentiment += TextBlob(news_title).sentiment.polarity
        des_sentiment += TextBlob(news_des).sentiment.polarity
    num_news = len(day_news)
    score[trade_day] = sentiment / num_news
    des_score[trade_day] = des_sentiment / num_news

des_score_series = pd.Series(des_score)
des_score_series.name = "news_des_score"
title_score_series = pd.Series(score)
title_score_series.name = "news_title_score"
# Append both score columns to the two feature tables, aligned on the date index.
stock_without_absolute = stock_without_absolute.join([title_score_series, des_score_series])
stock_with_absolute = stock_with_absolute.join([title_score_series, des_score_series])

In [8]:
# Summary statistics after the two news score columns were joined in.
stock_with_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,high_low_diff,open_close_diff,open,high,low,close,adj close,volume,news_title_score,news_des_score
count,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,...,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0,2502.0
mean,0.074439,0.046859,0.04727,-7335.292,0.017341,-0.000235,0.147704,0.436701,57.119822,57.10607,...,0.942222,-0.01026,54.937342,55.391143,54.448921,54.947602,50.905797,40165840.0,0.075608,0.087938
std,1.438169,0.879032,0.911218,20207800.0,0.009083,0.01136,1.99477,3.286971,22.227605,19.199233,...,0.811377,0.738079,32.262246,32.498733,31.957023,32.245552,33.439694,22986030.0,0.146826,0.127471
min,-11.399546,-6.469994,-6.099998,-229492300.0,0.00412,-0.053898,-12.380344,-14.475906,5.849243,11.387073,...,0.15,-5.419998,23.09,23.32,22.73,23.01,18.228165,7425600.0,-0.714286,-0.75
25%,-0.674071,-0.290001,-0.280005,-6549600.0,0.011445,-0.006787,-0.931561,-1.365578,40.394887,43.08502,...,0.459999,-0.310001,29.530001,29.74,29.18,29.5125,24.267555,24594550.0,0.0,0.0
50%,0.053514,0.030001,0.02,-419450.0,0.01523,-0.000389,0.192823,0.52926,59.434207,58.800491,...,0.669998,-0.019999,43.875,44.195,43.5,43.92,39.584156,34547050.0,0.034091,0.06721
75%,0.819278,0.370001,0.360001,6349025.0,0.020423,0.005703,1.261678,2.351975,76.371931,72.923998,...,1.110001,0.25,68.907503,69.425001,68.479998,69.030001,65.731054,49851100.0,0.14,0.157197
max,10.452235,5.170006,6.599998,239117900.0,0.085416,0.061138,12.950242,15.401249,95.333442,92.557744,...,7.07,6.090004,147.020004,148.410004,147.0,148.059998,147.15596,319317900.0,1.0,1.0


## External Features

### a) S&P 500

In [43]:
# Fetch the S&P 500 index ("^GSPC") over the same date window as the target stock.
SP500_raw = yf.download("^GSPC", start=stock_date_start, end=stock_date_end)
# Alias, not a copy: columns written through SP500_raw_data also appear on SP500_raw.
SP500_raw_data = SP500_raw

# Add high_low_diff, open_close_diff, high_low_diff_ratio, open_close_diff_ratio
sp500_range = SP500_raw['High'] - SP500_raw['Low']
sp500_open_minus_close = SP500_raw['Open'] - SP500_raw['Close']
SP500_raw_data['high_low_diff'] = sp500_range
SP500_raw_data['open_close_diff'] = sp500_open_minus_close
SP500_raw_data['high_low_diff_ratio'] = sp500_range / SP500_raw['Close']
SP500_raw_data['open_close_diff_ratio'] = sp500_open_minus_close / SP500_raw['Close']

# Add financial indicators: selecting these names makes stockstats compute the columns;
# the selection result itself is not used.
sp500_indicator_names = ['change', 'close_delta', 'volume_delta', 'close_-2_r', 'close_-6_r']
SP500_stats_data = stockstats.StockDataFrame.retype(SP500_raw_data)
SP500_stats_data[sp500_indicator_names]

# Plain DataFrame with every column name prefixed by 'sp500_'.
SP500_stock = pd.DataFrame(SP500_stats_data).add_prefix('sp500_')

[*********************100%***********************]  1 of 1 completed


In [44]:
# S&P 500 columns added to the table without absolute data (no raw price/volume levels).
sp500_relative_cols = [
    'sp500_high_low_diff',
    'sp500_open_close_diff',
    'sp500_high_low_diff_ratio',
    'sp500_open_close_diff_ratio',
    'sp500_change',
    'sp500_close_delta',
    'sp500_volume_delta',
    'sp500_close_-2_r',
    'sp500_close_-6_r',
]
# Restrict the S&P 500 frame to the dates of the stock feature table, then join on the index.
sp500_on_stock_dates = SP500_stock.loc[stock_with_absolute.index]
stock_without_absolute = stock_without_absolute.join(sp500_on_stock_dates[sp500_relative_cols])
stock_with_absolute = stock_with_absolute.join(sp500_on_stock_dates)

In [45]:
# Summary statistics after the S&P 500 columns were joined in.
stock_without_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,open_close_diff,sp500_high_low_diff,sp500_open_close_diff,sp500_high_low_diff_ratio,sp500_open_close_diff_ratio,sp500_change,sp500_close_delta,sp500_volume_delta,sp500_close_-2_r,sp500_close_-6_r
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,1.051018,2.198181,2.932728,-878590.9,0.023524,-0.00579,1.716835,2.512464,65.011284,59.653905,...,-1.691822,44.469105,3.916326,0.015584,0.001386,0.3764,10.169078,-19446360.0,0.529521,1.249035
std,1.743904,3.261737,4.887672,11690450.0,0.008188,0.01156,1.975065,4.227669,16.566217,12.880865,...,3.267072,7.78094,27.696022,0.002797,0.009696,1.592272,45.497642,719785400.0,2.056545,2.505473
min,-1.620944,-3.709991,-4.72998,-26762200.0,0.014536,-0.020391,-1.551406,-3.81466,37.534866,42.776564,...,-5.769989,34.679932,-27.72998,0.01209,-0.009755,-2.805903,-81.719971,-1769960000.0,-3.701299,-1.642622
25%,-0.15843,0.490021,-0.434998,-1854850.0,0.020366,-0.011748,0.123942,-0.583363,55.5857,48.179759,...,-3.419998,38.795044,-22.440063,0.013592,-0.00772,-0.61109,-17.555054,-223945000.0,-0.322494,-0.106215
50%,1.414881,2.920013,4.089996,423600.0,0.021239,-0.009019,2.48165,3.601089,70.345502,59.174988,...,-2.5,43.460205,0.439941,0.015258,0.000153,0.424983,12.030029,-30020000.0,1.33288,0.518795
75%,2.494982,4.914978,6.899994,4932400.0,0.025078,-0.002702,2.742549,5.757807,78.199478,70.676407,...,-0.77002,49.059937,26.599976,0.017367,0.009267,1.431606,40.339966,442535000.0,1.916437,1.376712
max,3.284523,5.889984,9.150024,14388200.0,0.045491,0.023333,5.463422,7.915148,83.538848,77.660894,...,6.5,60.439941,46.570068,0.021108,0.016264,2.658392,76.120117,947260000.0,2.883692,7.416243


### b) Gold

In [46]:
# Fetch the "GLD" ticker (used here as the gold series) over the same date window.
gold_raw = yf.download("GLD", start=stock_date_start, end=stock_date_end)
# Alias, not a copy: columns written through gold_raw_data also appear on gold_raw.
gold_raw_data = gold_raw

# Add high_low_diff, open_close_diff, high_low_diff_ratio, open_close_diff_ratio
gold_range = gold_raw['High'] - gold_raw['Low']
gold_open_minus_close = gold_raw['Open'] - gold_raw['Close']
gold_raw_data['high_low_diff'] = gold_range
gold_raw_data['open_close_diff'] = gold_open_minus_close
gold_raw_data['high_low_diff_ratio'] = gold_range / gold_raw['Close']
gold_raw_data['open_close_diff_ratio'] = gold_open_minus_close / gold_raw['Close']

# Add financial indicators: selecting these names makes stockstats compute the columns;
# the selection result itself is not used.
gold_indicator_names = ['change', 'close_delta', 'volume_delta', 'close_-2_r', 'close_-6_r']
gold_stats_data = stockstats.StockDataFrame.retype(gold_raw_data)
gold_stats_data[gold_indicator_names]

# Plain DataFrame with every column name prefixed by 'gold_'.
gold_stock = pd.DataFrame(gold_stats_data).add_prefix('gold_')

[*********************100%***********************]  1 of 1 completed


In [47]:
# Gold columns added to the table without absolute data (no raw price/volume levels).
gold_relative_cols = [
    'gold_high_low_diff',
    'gold_open_close_diff',
    'gold_high_low_diff_ratio',
    'gold_open_close_diff_ratio',
    'gold_change',
    'gold_close_delta',
    'gold_volume_delta',
    'gold_close_-2_r',
    'gold_close_-6_r',
]
# Restrict the gold frame to the dates of the stock feature table, then join on the index.
gold_on_stock_dates = gold_stock.loc[stock_with_absolute.index]
stock_without_absolute = stock_without_absolute.join(gold_on_stock_dates[gold_relative_cols])
stock_with_absolute = stock_with_absolute.join(gold_on_stock_dates)

In [48]:
# Summary statistics after the gold columns were joined in.
stock_without_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,sp500_close_-6_r,gold_high_low_diff,gold_open_close_diff,gold_high_low_diff_ratio,gold_open_close_diff_ratio,gold_change,gold_close_delta,gold_volume_delta,gold_close_-2_r,gold_close_-6_r
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,1.051018,2.198181,2.932728,-878590.9,0.023524,-0.00579,1.716835,2.512464,65.011284,59.653905,...,1.249035,1.896364,-0.376359,0.011787,-0.002322,0.024742,0.030909,79445.45,0.102688,-0.130581
std,1.743904,3.261737,4.887672,11690450.0,0.008188,0.01156,1.975065,4.227669,16.566217,12.880865,...,2.505473,0.523838,0.989204,0.003292,0.006162,1.077098,1.728845,2455234.0,1.359773,1.441905
min,-1.620944,-3.709991,-4.72998,-26762200.0,0.014536,-0.020391,-1.551406,-3.81466,37.534866,42.776564,...,-1.642622,1.039993,-1.779999,0.006486,-0.01114,-1.811657,-2.929993,-3724900.0,-1.268337,-2.179501
25%,-0.15843,0.490021,-0.434998,-1854850.0,0.020366,-0.011748,0.123942,-0.583363,55.5857,48.179759,...,-0.106215,1.649994,-1.089996,0.0103,-0.006711,-0.55485,-0.900002,-2444650.0,-1.098246,-1.294624
50%,1.414881,2.920013,4.089996,423600.0,0.021239,-0.009019,2.48165,3.601089,70.345502,59.174988,...,0.518795,1.770004,-0.009995,0.010992,-6.2e-05,0.35048,0.559998,1349500.0,0.105223,-0.334237
75%,2.494982,4.914978,6.899994,4932400.0,0.025078,-0.002702,2.742549,5.757807,78.199478,70.676407,...,1.376712,2.255005,0.185005,0.013901,0.001146,0.585235,0.934998,1920350.0,0.87292,0.817219
max,3.284523,5.889984,9.150024,14388200.0,0.045491,0.023333,5.463422,7.915148,83.538848,77.660894,...,7.416243,2.690002,1.419998,0.01694,0.008942,1.967086,3.119995,2788900.0,2.982155,1.967086


### c) Bonds

In [49]:
def _load_bond_features(ticker, prefix):
    """Download one yield ticker and derive its spread and change features.

    The same pipeline is applied to both bond tickers below, so it lives in one place.

    Args:
        ticker: Yahoo Finance symbol passed to yf.download.
        prefix: String prepended to every column name of the returned feature frame.

    Returns:
        A tuple (raw, stats, stock):
        raw   -- the downloaded frame, with the four diff/ratio columns added to it;
        stats -- the stockstats StockDataFrame built from `raw`;
        stock -- a plain DataFrame of `stats` without 'volume', columns prefixed.
    """
    raw = yf.download(ticker, start=stock_date_start, end=stock_date_end)

    # Add high_low_diff, open_close_diff, high_low_diff_ratio, open_close_diff_ratio
    raw['high_low_diff'] = (raw['High'] - raw['Low'])
    raw['open_close_diff'] = (raw['Open'] - raw['Close'])
    raw['high_low_diff_ratio'] = (raw['High'] - raw['Low']) / raw['Close']
    raw['open_close_diff_ratio'] = (raw['Open'] - raw['Close']) / raw['Close']

    # Add financial indicators: selecting the names makes stockstats compute them.
    # No volume-based indicator is requested, and the volume column is dropped below.
    stats = stockstats.StockDataFrame.retype(raw)
    stats[['change','close_delta', 'close_-2_r','close_-6_r']]

    stock = pd.DataFrame(stats).drop(columns='volume').add_prefix(prefix)
    return raw, stats, stock


# 5 year bonds
y5bond_raw, y5bond_stats_data, y5bond_stock = _load_bond_features("^FVX", 'y5bond_')
# Kept as an alias of the downloaded frame, as before.
y5bond_raw_data = y5bond_raw

# 10 year bonds
y10bond_raw, y10bond_stats_data, y10bond_stock = _load_bond_features("^TNX", 'y10bond_')
# Kept as an alias of the downloaded frame, as before.
y10bond_raw_data = y10bond_raw



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [50]:
# Bond feature names (without prefix) that go into the table without absolute data.
bond_relative_suffixes = ['high_low_diff', 'open_close_diff',
                          'high_low_diff_ratio', 'open_close_diff_ratio',
                          'change', 'close_delta', 'close_-2_r', 'close_-6_r']

# Inner-merge on the date index, 10-year first and then 5-year: only dates present in
# both the feature table and the bond frame are kept.
for bond_prefix, bond_frame in (('y10bond_', y10bond_stock), ('y5bond_', y5bond_stock)):
    bond_relative_cols = [bond_prefix + suffix for suffix in bond_relative_suffixes]
    stock_without_absolute = pd.merge(stock_without_absolute, bond_frame[bond_relative_cols],
                                      how="inner", left_index=True, right_index=True)
    stock_with_absolute = pd.merge(stock_with_absolute, bond_frame,
                                   how="inner", left_index=True, right_index=True)

In [51]:
# Report, for each feature table, whether any cell is null.
for table_name, table in (("stock_without_absolute", stock_without_absolute),
                          ("stock_with_absolute", stock_with_absolute)):
    print(table_name + " Null values? ", table.isnull().values.any())

stock_without_absolute Null values?  False
stock_with_absolute Null values?  False


In [52]:
# Summary statistics of the table without absolute data after the bond columns were merged in.
stock_without_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,y10bond_close_-2_r,y10bond_close_-6_r,y5bond_high_low_diff,y5bond_open_close_diff,y5bond_high_low_diff_ratio,y5bond_open_close_diff_ratio,y5bond_change,y5bond_close_delta,y5bond_close_-2_r,y5bond_close_-6_r
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,1.051018,2.198181,2.932728,-878590.9,0.023524,-0.00579,1.716835,2.512464,65.011284,59.653905,...,2.562648,1.147495,0.029091,-0.003182,0.079142,-0.008474,1.255311,0.003909,1.701198,1.196854
std,1.743904,3.261737,4.887672,11690450.0,0.008188,0.01156,1.975065,4.227669,16.566217,12.880865,...,4.764552,8.747269,0.006818,0.016005,0.01875,0.043121,5.754515,0.020964,5.913426,7.587011
min,-1.620944,-3.709991,-4.72998,-26762200.0,0.014536,-0.020391,-1.551406,-3.81466,37.534866,42.776564,...,-4.420738,-17.686165,0.02,-0.029,0.0542,-0.078591,-7.980051,-0.032,-9.725687,-13.126493
25%,-0.15843,0.490021,-0.434998,-1854850.0,0.020366,-0.011748,0.123942,-0.583363,55.5857,48.179759,...,0.424501,-2.345278,0.0245,-0.015,0.064012,-0.039018,-2.439021,-0.009,0.0,-3.434063
50%,1.414881,2.920013,4.089996,423600.0,0.021239,-0.009019,2.48165,3.601089,70.345502,59.174988,...,2.348993,0.305816,0.027,0.0,0.075362,0.0,1.373625,0.005,1.933702,2.168026
75%,2.494982,4.914978,6.899994,4932400.0,0.025078,-0.002702,2.742549,5.757807,78.199478,70.676407,...,4.713136,5.805018,0.035,0.0065,0.094851,0.017708,5.014569,0.0175,4.53769,7.564051
max,3.284523,5.889984,9.150024,14388200.0,0.045491,0.023333,5.463422,7.915148,83.538848,77.660894,...,11.616954,16.557378,0.039,0.028,0.107143,0.075881,10.164836,0.037,10.479041,10.164836


In [53]:
# Summary statistics of the table with absolute data after the bond columns were merged in.
stock_with_absolute.describe()

Unnamed: 0,change,open_delta,close_delta,volume_delta,high_low_diff_ratio,open_close_diff_ratio,close_-2_r,close_-6_r,kdjk,kdjd,...,y5bond_close,y5bond_adj close,y5bond_high_low_diff,y5bond_open_close_diff,y5bond_high_low_diff_ratio,y5bond_open_close_diff_ratio,y5bond_change,y5bond_close_delta,y5bond_close_-2_r,y5bond_close_-6_r
count,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,...,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0,11.0
mean,1.051018,2.198181,2.932728,-878590.9,0.023524,-0.00579,1.716835,2.512464,65.011284,59.653905,...,0.367909,0.367909,0.029091,-0.003182,0.079142,-0.008474,1.255311,0.003909,1.701198,1.196854
std,1.743904,3.261737,4.887672,11690450.0,0.008188,0.01156,1.975065,4.227669,16.566217,12.880865,...,0.013678,0.013678,0.006818,0.016005,0.01875,0.043121,5.754515,0.020964,5.913426,7.587011
min,-1.620944,-3.709991,-4.72998,-26762200.0,0.014536,-0.020391,-1.551406,-3.81466,37.534866,42.776564,...,0.345,0.345,0.02,-0.029,0.0542,-0.078591,-7.980051,-0.032,-9.725687,-13.126493
25%,-0.15843,0.490021,-0.434998,-1854850.0,0.020366,-0.011748,0.123942,-0.583363,55.5857,48.179759,...,0.363,0.363,0.0245,-0.015,0.064012,-0.039018,-2.439021,-0.009,0.0,-3.434063
50%,1.414881,2.920013,4.089996,423600.0,0.021239,-0.009019,2.48165,3.601089,70.345502,59.174988,...,0.369,0.369,0.027,0.0,0.075362,0.0,1.373625,0.005,1.933702,2.168026
75%,2.494982,4.914978,6.899994,4932400.0,0.025078,-0.002702,2.742549,5.757807,78.199478,70.676407,...,0.369,0.369,0.035,0.0065,0.094851,0.017708,5.014569,0.0175,4.53769,7.564051
max,3.284523,5.889984,9.150024,14388200.0,0.045491,0.023333,5.463422,7.915148,83.538848,77.660894,...,0.401,0.401,0.039,0.028,0.107143,0.075881,10.164836,0.037,10.479041,10.164836


In [54]:
# List every column of the table with absolute data.
stock_with_absolute.columns

Index(['change', 'open_delta', 'close_delta', 'volume_delta',
       'high_low_diff_ratio', 'open_close_diff_ratio', 'close_-2_r',
       'close_-6_r', 'kdjk', 'kdjd', 'kdjj', 'macd', 'macds', 'macdh', 'rsi_6',
       'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr', 'boll_-1_d',
       'boll_ub_-1_d', 'boll_lb_-1_d', 'boll_k_diff', 'high_low_diff',
       'open_close_diff', 'open', 'high', 'low', 'close', 'adj close',
       'volume', 'sp500_open', 'sp500_high', 'sp500_low', 'sp500_close',
       'sp500_adj close', 'sp500_volume', 'sp500_high_low_diff',
       'sp500_open_close_diff', 'sp500_high_low_diff_ratio',
       'sp500_open_close_diff_ratio', 'sp500_change', 'sp500_close_delta',
       'sp500_volume_delta', 'sp500_close_-2_r', 'sp500_close_-6_r',
       'gold_open', 'gold_high', 'gold_low', 'gold_close', 'gold_adj close',
       'gold_volume', 'gold_high_low_diff', 'gold_open_close_diff',
       'gold_high_low_diff_ratio', 'gold_open_close_diff_ratio', 'gold_change',
     

In [21]:
# store training: one pickle per feature table under ./data/<target>/
training_dir = './data/'+target+'/'
for table_name, table in (('stock_without_absolute', stock_without_absolute),
                          ('stock_with_absolute', stock_with_absolute)):
    table.to_pickle(training_dir + table_name + '.pkl')

In [22]:
# Keep only the label rows whose dates are present in the feature table.
kept_dates = stock_with_absolute.index

label_abs_1d, label_abs_7d, label_abs_30d = [
    labels.loc[kept_dates] for labels in (label_abs_1d, label_abs_7d, label_abs_30d)
]

label_value_1d, label_value_7d, label_value_30d = [
    labels.loc[kept_dates] for labels in (label_value_1d, label_value_7d, label_value_30d)
]

In [23]:
# store labels: one pickle per label series under ./data/<target>/, named after the variable
label_dir = './data/'+target+'/'
labels_to_store = {
    'label_abs_1d': label_abs_1d,
    'label_abs_7d': label_abs_7d,
    'label_abs_30d': label_abs_30d,
    'label_value_1d': label_value_1d,
    'label_value_7d': label_value_7d,
    'label_value_30d': label_value_30d,
}
for label_name, label_series in labels_to_store.items():
    label_series.to_pickle(label_dir + label_name + '.pkl')