In [1]:
import os
import sys

import numpy as np
import pandas as pd
import tushare as ts
from datetime import datetime
from tqdm.notebook import tqdm
#from sklearn.linear_model import LinearRegression
from scipy.stats import linregress
import plotly.graph_objects as go
#import matplotlib.pyplot as plt

from utils import plot_k

sys.path.insert(0, './../tushare_code/')
%matplotlib inline

In [47]:
data_path = '../data/stockData/allstock/'
cols = [0, 1, 2, 3, 4, 5]
names = ['stock', 'date', 'open', 'high', 'low', 'close']

In [2]:
df_gldq = ts.get_k_data('000651', start='2000-01-01', end='2020-01-01')
df_gldq = df_gldq.set_index(pd.to_datetime(df_gldq["date"]))
df_gldq = df_gldq.drop(['date'], axis=1)
df_gldq.head()

Unnamed: 0_level_0,open,close,high,low,volume,code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,0.544,0.555,0.557,0.539,3549.8,651
2000-01-05,0.555,0.548,0.563,0.547,4366.1,651
2000-01-06,0.537,0.588,0.602,0.537,11334.15,651
2000-01-07,0.599,0.629,0.631,0.596,28765.47,651
2000-01-10,0.636,0.628,0.647,0.615,19977.43,651


In [8]:
prices = df_gldq.close

In [4]:
def momentum(closes):
    returns = np.log(closes)
    x = np.arange(len(returns))
    slope, _, rvalue, _, _ = linregress(x, returns)
    return ((1 + slope) ** 252) * (rvalue ** 2)  # annualize slope and multiply by R^2

In [5]:
momentums = df_gldq.close.rolling(90).apply(momentum)
t = momentums.describe()[-2] # 75%
momentums_np = np.nan_to_num(momentums.values)

In [20]:
indices = np.where(momentums_np > t)[0]
indices = np.sort(indices)[::-1][::90]

In [21]:
fig = go.Figure()
for end in idices[:3]:
    rets = np.log(prices[end - 90 : end])
    x = np.arange(len(rets))
    slope, intercept, r_value, p_value, std_err = linregress(x, rets)
    fig.add_trace(go.Scatter(x=np.arange(180), y=prices[end-90:end+90],
                        mode='lines',
                        name='lines'))
    fig.add_trace(go.Scatter(x=x, y=np.e ** (intercept + slope*x),
                        mode='lines',
                        name='lines'))
fig.update_layout(showlegend=False,
                 xaxis_title="Days",
                 yaxis_title="Stock price")
fig.show()

In [28]:
def get_price_diff(prices, idices, lag=30):
    inc = []
    dec = []
    ret = []
    for idx in idices:
        next_d = idx + lag
        diff =  prices[next_d] / prices[idx] - 1
        if diff > 0:
            inc.append(diff)
        else:
            dec.append(diff)
        ret.append(diff)
    return inc, dec, ret  

In [41]:
lag = 30
indices = np.where(momentums_np > t)[0]
indices = np.sort(indices)[::90]
inc, dec, ret = get_price_diff(prices, indices, lag)
print("there are {} times price increase after {} days".format(len(inc), lag))
print("there are {} times price decrease after {} days".format(len(dec), lag))

there are 9 times price increase after 30 days
there are 4 times price decrease after 30 days


In [39]:
pd.Series(ret).describe()

count    13.000000
mean      0.090057
std       0.165951
min      -0.217571
25%      -0.007167
50%       0.054695
75%       0.217288
max       0.414286
dtype: float64

In [45]:
files = os.listdir(data_path)
print("There are total {} stocks".format(len(files)))

There are total 3905 stocks


In [48]:
total_inc = 0
total_dec = 0
rets = []
for f in tqdm(files):
    if f.endswith('.xls'):
        #print('Processing file {}'.format(f))
        inc = []
        dec = []
        ret = []
        try:
            data_f = os.path.join(data_path, f)
            df = pd.read_excel(data_f, usecols=cols, names=names)
            df = df.set_index(pd.to_datetime(df["date"], format='%Y%m%d'))
            df = df.drop(['date'], axis=1)
            df = df[::-1]
            prices = df.close
            momentums = df.close.rolling(90).apply(momentum)
            t = momentums.describe()[-2] # 75%
            momentums_np = np.nan_to_num(momentums.values)
            indices = np.where(momentums_np > t)[0]
            indices = np.sort(indices)[::90]
            inc, dec, ret = get_price_diff(prices, indices, lag=30)
            if len(ret):
                rets.append(np.mean(ret))
            else:
                rets.append(0)
        except:
            pass
            #print('An error occurred for file {}'.format(f))
    total_inc += len(inc)
    total_dec += len(dec)
print('total inc {}'.format(total_inc))
print('total dec {}'.format(total_dec))

  0%|          | 0/3905 [00:00<?, ?it/s]


invalid value encountered in greater



total inc 12437
total dec 14141


In [49]:
pd.Series(rets).describe()

count    3598.000000
mean       -0.008519
std         0.099662
min        -0.436112
25%        -0.061626
50%        -0.000264
75%         0.045417
max         0.768108
dtype: float64