# Import sqlcell

In [1]:
from sqlcell import SqlcellWidget

In [24]:
%%sql
SELECT *
-- Preview query: every column of data.csv, unfiltered. The feature cell further
-- down reads the columns code, date, O, H, L and C from this same file.
FROM 'data.csv'

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# Feature Engineering

以下sql代码实现了在当前股票以及日期下，计算相关指标，以及后一天按开盘价买入，分别以T+1，T+2日收盘价卖出的收益。  
可以以此来计算指标和未来收益的相关性，并且不存在未来函数。  
  
指标说明：  
LOW5：5日最低价  
HIGH5：5日最高价  
LH5：5日最低价除以5日最高价  
FA：5日均价  
FB：当日最高价  
FC：当日收盘价除以5日最低价  
FD：当日收盘价除以5日最高价  
FE：日最高价除以最低价的5日最大值  
FF：5日均价除以10日均价  
FG：10日窗口内按日期升序排列的第5个收盘价（窗口不足5行时为空）  
FH：10日收盘价的中位数 Median value  
FI：10日收盘价与其中位数之差的绝对值的中位数（MAD），即 MEDIAN(ABS(x-MEDIAN(x)))  
FJ：10日最高价和最低价之间的相关性  
canBuy：是否能在当前交易日的下一个交易日买入  
gain：在当前交易日的下一个交易日（记为T日）以开盘价买入，T+1日收盘价卖出的收益率  
gain3：同样在T日以开盘价买入，T+2日收盘价卖出的收益率  
  
WINDOW  
five：5日窗口，按code分区按date排序  
ten：10日窗口，按code分区按date排序  
norm：一般窗口，不限定数量，按code分区按date排序  
  
WHERE  
过滤gain绝对值不低于30%的记录（波动太大，有可能是脏数据）  
过滤无法买入的交易日，以及gain或gain3为空（缺少后续交易日数据）的记录  

In [17]:
%%sql mytable
SELECT *
FROM (
    -- One row per (code, date). The features use only the current row and
    -- earlier rows of the same code; canBuy / gain / gain3 are the only columns
    -- that read later rows (through lead). The cell result is bound to `mytable`.
    SELECT
        code,
        date,
        -- Rolling 5-row low and high; the aliases are reused by LH5, FC and FD.
        min(L) OVER five AS LOW5,
        max(H) OVER five AS HIGH5,
        LOW5 / HIGH5 AS LH5,
        -- Features
        avg(C) OVER five AS FA,
        H AS FB,
        C / LOW5 AS FC,
        C / HIGH5 AS FD,
        max(H / L) OVER five AS FE,
        avg(C) OVER five / avg(C) OVER ten AS FF,
        nth_value(C, 5) OVER ten AS FG,
        quantile_cont(C, 0.5) OVER ten AS FH,
        mad(C) OVER ten AS FI,
        corr(H, L) OVER ten AS FJ,
        -- Forward-looking labels. lead() without an explicit default yields NULL
        -- when the partition has no such later row.
        -- canBuy: the next row's low is strictly below its high
        -- (presumably rules out one-price days -- confirm).
        lead(L, 1) OVER norm < lead(H, 1) OVER norm AS canBuy,
        -- Buy at the next row's open; sell at the close one / two rows after that.
        lead(C, 2) OVER norm / lead(O, 1) OVER norm - 1 AS gain,
        lead(C, 3) OVER norm / lead(O, 1) OVER norm - 1 AS gain3
    FROM 'data.csv'
    WINDOW
        five AS (PARTITION BY code ORDER BY date ROWS BETWEEN 4 PRECEDING AND CURRENT ROW),
        ten  AS (PARTITION BY code ORDER BY date ROWS BETWEEN 9 PRECEDING AND CURRENT ROW),
        norm AS (PARTITION BY code ORDER BY date)
) AS features
-- WHERE keeps a row only when the predicate is TRUE, so a NULL canBuy is
-- dropped together with a FALSE one.
WHERE canBuy
  AND gain IS NOT NULL
  AND gain3 IS NOT NULL
  AND abs(gain) < 0.30

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# Percentage Return Ratio
将指定特征排序，按日划分为十个分位，计算每个分位的平均收益  

In [23]:
%%sql df
SELECT rank, avg(gain) * 100 GA, avg(gain3) * 100 GA3, count(1) c
FROM
(
	-- Bucket every row into one of ten equal-width bands (0..9) of its per-date
	-- percent rank on FF.
	-- percent_rank() lies in [0, 1]; floor(x * 10) maps it to 0..10 and least(.., 9)
	-- folds the single top value (percent_rank = 1) into the last bucket.
	-- The previous cast(percent_rank() * 9 as int) rounded to nearest, which made
	-- buckets 0 and 9 only half as wide as buckets 1..8.
	SELECT least(cast(floor(percent_rank() OVER wds * 10) as int), 9) rank, gain, gain3
	FROM mytable
	WINDOW wds as (PARTITION BY date ORDER BY FF)
)
-- Per bucket: mean gain and gain3 scaled by 100 (percent), plus the row count.
GROUP BY rank
ORDER BY rank

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# Correlation Analysis
相关性分析显示，FF与gain呈负相关，换句话说，当FF指标越大时，这个股票短期内越可能跌  

In [22]:
%%sql df
SELECT corr(FA, gain),
		-- One correlation with gain per feature column of mytable, computed
		-- over all of its rows (single-row result).
		corr(FB, gain),
		corr(FC, gain),
		corr(FD, gain),
		corr(FE, gain),
		corr(FF, gain),
		corr(FG, gain),
		corr(FH, gain),
		corr(FI, gain),
		-- FJ is produced by the feature query but was missing from this list.
		-- NOTE(review): FJ is itself a windowed corr(H, L); confirm how rows with
		-- an undefined FJ (very short windows) affect this value.
		corr(FJ, gain)
FROM mytable

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

In [25]:
# One single-entry {column label: dtype name} dict per column of mytable, in column order.
[{label: kind.name} for label, kind in mytable.dtypes.items()]

[{'code': 'int32'},
 {'date': 'datetime64[ns]'},
 {'LOW5': 'float64'},
 {'HIGH5': 'float64'},
 {'LH5': 'float64'},
 {'FA': 'float64'},
 {'FB': 'float64'},
 {'FC': 'float64'},
 {'FD': 'float64'},
 {'FE': 'float64'},
 {'FF': 'float64'},
 {'canBuy': 'bool'},
 {'gain': 'float64'}]