In [63]:
'''
import tushare as ts
df = ts.get_k_data('600642',ktype='5')
dt_cols = list(zip(*(df['date'].str.split(' '))))
df['date'] = dt_cols[0]
df['time'] = dt_cols[1]
df.head()
df.to_csv('data/600642_5min.csv')
'''

'''
import tushare as ts
df = ts.get_k_data('600642', start='2017-01-01', end='2018-06-30')
df.head()
df.to_csv('data/600642.csv')
'''

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/600642_5min.csv', index_col='date', parse_dates=True)
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,open,close,high,low,volume,code,time
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-03,5.2,5.19,5.2,5.18,1866.0,600642,14:55
2018-08-03,5.19,5.17,5.19,5.17,1418.0,600642,15:00
2018-08-06,5.18,5.18,5.19,5.17,1936.0,600642,09:35
2018-08-06,5.19,5.19,5.2,5.17,3586.0,600642,09:40
2018-08-06,5.19,5.19,5.2,5.18,1664.0,600642,09:45


In [3]:
def _valid_price(g):
    return (((g.max() - g.min()) / g.min()) < 0.223).all()

# 按照日期分组
days = df.groupby('date').agg(
    {'open': lambda g: _valid_price(g) and g[0] or 0,
     'high': lambda g: _valid_price(g) and np.max(g) or 0,
     'low': lambda g: _valid_price(g) and np.min(g) or 0,
     'close': lambda g: _valid_price(g) and g[-1] or 0,
     'volume': 'sum'})
days.head()

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-03,5.2,5.2,5.17,5.17,3284.0
2018-08-06,5.18,5.22,5.08,5.14,89350.0
2018-08-07,5.15,5.28,5.11,5.27,74158.0
2018-08-08,5.25,5.34,5.24,5.29,82297.0
2018-08-09,5.28,5.37,5.26,5.34,124683.0


In [4]:
df = pd.read_csv('data/600642.csv', index_col='date', parse_dates=True)
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,open,close,high,low,volume,code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,5.449,5.495,5.504,5.449,106965.0,600642
2017-01-04,5.486,5.523,5.532,5.486,99175.0,600642
2017-01-05,5.523,5.542,5.542,5.504,105144.0,600642
2017-01-06,5.542,5.616,5.625,5.532,287604.0,600642
2017-01-09,5.597,5.625,5.644,5.588,179594.0,600642


In [5]:
# 填充数据：生成日期索引
l = len(df)
start = df.iloc[0:1].index.tolist()[0]
end = df.iloc[l - 1: l].index.tolist()[0]
idx = pd.date_range(start=start, end=end)
idx

DatetimeIndex(['2017-01-03', '2017-01-04', '2017-01-05', '2017-01-06',
               '2017-01-07', '2017-01-08', '2017-01-09', '2017-01-10',
               '2017-01-11', '2017-01-12',
               ...
               '2018-06-20', '2018-06-21', '2018-06-22', '2018-06-23',
               '2018-06-24', '2018-06-25', '2018-06-26', '2018-06-27',
               '2018-06-28', '2018-06-29'],
              dtype='datetime64[ns]', length=543, freq='D')

In [6]:
# 填充数据，把缺失的交易数据用前一个交易数据来填充，但交易量设置为 0
data = df.reindex(idx)
zvalues = data.loc[~(data.volume > 0)].loc[:, 'volume']
data.update(zvalues.fillna(0))
data.fillna(method='ffill', inplace=True)
data.head()

Unnamed: 0,open,close,high,low,volume,code
2017-01-03,5.449,5.495,5.504,5.449,106965.0,600642.0
2017-01-04,5.486,5.523,5.532,5.486,99175.0,600642.0
2017-01-05,5.523,5.542,5.542,5.504,105144.0,600642.0
2017-01-06,5.542,5.616,5.625,5.532,287604.0,600642.0
2017-01-07,5.542,5.616,5.625,5.532,0.0,600642.0


In [7]:
def gen_item_group_index(total, group_len): # 分组函数
    """ generate an item group index array 
    
    suppose total = 10, unitlen = 2, then we will return array [0 0 1 1 2 2 3 3 4 4]
    """
    
    group_count = int(total / group_len)
    group_index = np.arange(total)
    for i in range(group_count):
        group_index[i * group_len: (i + 1) * group_len] = i
    group_index[(i + 1) * group_len : total] = i + 1
    return group_index.tolist()

gen_item_group_index(10, 3)

[0, 0, 0, 1, 1, 1, 2, 2, 2, 3]

In [8]:
period = 30
group_index = gen_item_group_index(len(data), period)
# 把分组索引数据添加到股票数据里
data['group_index'] = group_index
print(len(data))
data.head().append(data.tail())

543


Unnamed: 0,open,close,high,low,volume,code,group_index
2017-01-03,5.449,5.495,5.504,5.449,106965.0,600642.0,0
2017-01-04,5.486,5.523,5.532,5.486,99175.0,600642.0,0
2017-01-05,5.523,5.542,5.542,5.504,105144.0,600642.0,0
2017-01-06,5.542,5.616,5.625,5.532,287604.0,600642.0,0
2017-01-07,5.542,5.616,5.625,5.532,0.0,600642.0,0
2018-06-25,4.861,4.803,4.87,4.774,60818.0,600642.0,17
2018-06-26,4.793,4.803,4.822,4.745,50648.0,600642.0,17
2018-06-27,4.813,4.764,4.822,4.764,44063.0,600642.0,18
2018-06-28,4.764,4.784,4.822,4.755,36828.0,600642.0,18
2018-06-29,4.793,4.832,4.851,4.774,46016.0,600642.0,18


In [9]:
# 针对下跌的波动，我们把最高价设置为负数
def _ceiling_price(g):
    return g.idxmin() < g.idxmax() and np.max(g) or (-np.max(g))
    

# 根据索引分组计算
group = data.groupby('group_index').agg({
                                        'volume': 'sum', 
                                        'low': 'min', 
                                        'high': _ceiling_price})
group.head()

Unnamed: 0_level_0,volume,low,high
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2847032.0,5.449,5.746
1,4104564.0,5.625,5.951
2,2369294.0,5.709,-5.895
3,3822968.0,5.504,-5.96
4,2486461.0,5.281,-5.681


In [10]:
# 添加每个分组的起始日期
date_col = pd.DataFrame({"group_index": group_index, "date": idx})
group['date'] = date_col.groupby('group_index').agg('first')
group.head()

Unnamed: 0_level_0,volume,low,high,date
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2847032.0,5.449,5.746,2017-01-03
1,4104564.0,5.625,5.951,2017-02-02
2,2369294.0,5.709,-5.895,2017-03-04
3,3822968.0,5.504,-5.96,2017-04-03
4,2486461.0,5.281,-5.681,2017-05-03


In [12]:
# 添加我们的波动指标 股票波动系数 = 最高价/最低价
group['ripples_radio'] = group.high / group.low
group.head()

Unnamed: 0_level_0,volume,low,high,date,ripples_radio
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2847032.0,5.449,5.746,2017-01-03,1.054505
1,4104564.0,5.625,5.951,2017-02-02,1.057956
2,2369294.0,5.709,-5.895,2017-03-04,-1.03258
3,3822968.0,5.504,-5.96,2017-04-03,-1.082849
4,2486461.0,5.281,-5.681,2017-05-03,-1.075743


In [13]:
# 降序排列。我们把分组的起始日期，交易量总和都列出来，也可以观察一下交易量和股票波动比的关系
ripples = group.sort_values('ripples_radio', ascending=False)
ripples.head()

Unnamed: 0_level_0,volume,low,high,date,ripples_radio
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11,3175975.0,5.352,5.775,2017-11-29,1.079036
6,3397988.0,5.727,6.16,2017-07-02,1.075607
5,2289722.0,5.542,5.913,2017-06-02,1.066943
1,4104564.0,5.625,5.951,2017-02-02,1.057956
0,2847032.0,5.449,5.746,2017-01-03,1.054505


In [16]:
# 我们算出前 10 个上涨的波动。作为这个股票的波动值。
# 最后，我们就可以根据所有股票的波动值来选择最优的股票了。
ripples.head(10).ripples_radio.mean()

0.639360894464561

In [17]:
# 我们也可以看一下前 10 个下跌的波动。
ripples.tail(10).ripples_radio.mean()

-1.070716470648314

In [19]:
# 计算涨跌幅
rise = df.close.diff()
rise.iloc[0] = 0
rise.head()

date
2017-01-03    0.000
2017-01-04    0.028
2017-01-05    0.019
2017-01-06    0.074
2017-01-09    0.009
Name: close, dtype: float64

In [20]:
df['rise'] = rise
df.head()

Unnamed: 0_level_0,open,close,high,low,volume,code,rise
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-03,5.449,5.495,5.504,5.449,106965.0,600642,0.0
2017-01-04,5.486,5.523,5.532,5.486,99175.0,600642,0.028
2017-01-05,5.523,5.542,5.542,5.504,105144.0,600642,0.019
2017-01-06,5.542,5.616,5.625,5.532,287604.0,600642,0.074
2017-01-09,5.597,5.625,5.644,5.588,179594.0,600642,0.009


In [22]:
def get_period_data(df, start_date, days):
    start_date = pd.Timestamp(start_date)
    end_date = start_date + pd.Timedelta(days=days)
    return df.loc[start_date:end_date]

get_period_data(df, '2017-06-01', 45)

Unnamed: 0_level_0,open,close,high,low,volume,code,rise
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-06-01,5.579,5.625,5.672,5.551,170454.0,600642,0.037
2017-06-02,5.597,5.616,5.634,5.542,139521.0,600642,-0.009
2017-06-05,5.625,5.634,5.644,5.607,89702.0,600642,0.018
2017-06-06,5.644,5.653,5.672,5.616,91717.0,600642,0.019
2017-06-07,5.653,5.7,5.718,5.653,146307.0,600642,0.047
2017-06-08,5.709,5.765,5.774,5.7,175597.0,600642,0.065
2017-06-09,5.765,5.765,5.783,5.727,173854.0,600642,0.0
2017-06-12,5.765,5.783,5.802,5.737,136762.0,600642,0.018
2017-06-13,5.765,5.765,5.783,5.718,103092.0,600642,-0.018
2017-06-14,5.755,5.746,5.774,5.718,95895.0,600642,-0.019


In [23]:
# 计算波动值
_ripple_radio = lambda data: data.high.max() / data.low.min()
ripple_radio = df.high.idxmin() < data.low.idxmax() and _ripple_radio(df) or -_ripple_radio(df)
ripple_radio

-1.298208640674394