# Import asqlcell

In [1]:
import asqlcell

# Data Preview
数据来自A股。
code：股票代码  
date：日期  
O：开盘价  
H：最高价  
C：收盘价  
L：最低价  

In [2]:
%%sql dataframe_name
SELECT *
-- Preview the first 100 rows of the raw CSV. The path is quoted, matching the
-- feature-engineering query, so it is an unambiguous file reference.
FROM 'data.csv'
LIMIT 100

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# Feature Engineering

以下sql代码实现了在当前股票以及日期下，计算相关指标，以及后一天按开盘价买入，分别以T+1，T+2日收盘价卖出的收益。  
可以以此来计算指标和未来收益的相关性，并且不存在未来函数。  
  
指标说明：  
LOW5：5日最低价  
HIGH5：5日最高价  
FA：5日均价  
FB：LOW5 / HIGH5  
FC：当日收盘价除以5日最低价  
FD：当日收盘价除以5日最高价  
FE：日最高价除以最低价的5日最大值  
FF：5日均价除以10日均价  
FG：当日收盘价除以10日窗口内按时间顺序的第5个收盘价  
FH：10日收盘价的中位数 Median value  
FI：10日收盘价与其中位数之差的绝对值的中位数 MEDIAN(ABS(x-MEDIAN(x)))  
FJ：10日最高价和最低价之间的相关性  
FK：10日窗口内最高价（因变量）对最低价（自变量）的线性回归斜率 regr_slope(H, L)  
canBuy：是否能在当前交易日的下一个交易日买入  
gain：当前交易日的下一个交易日开盘价买入，买入日后第1个交易日（即当前交易日后第2个交易日）收盘价卖出的收益  
gain3：当前交易日的下一个交易日开盘价买入，买入日后第2个交易日（即当前交易日后第3个交易日）收盘价卖出的收益  
  
WINDOW  
five：5日窗口，按code分区按date排序  
ten：10日窗口，按code分区按date排序  
norm：一般窗口，不限定数量，按code分区按date排序  
  
WHERE  
过滤gain波动太大的，有可能是脏数据  
过滤无法买入的交易日  

In [3]:
%%sql mytable
SELECT *
FROM
(
    -- One output row per (code, date): rolling-window features computed from
    -- the current and preceding rows, plus returns taken from following rows.
    SELECT
        code,
        date,
        min(L) OVER five AS LOW5,                            -- lowest low in the 5-row window
        max(H) OVER five AS HIGH5,                           -- highest high in the 5-row window
        -- Features
        avg(C) OVER five AS FA,                              -- 5-row mean close
        LOW5 / HIGH5 AS FB,                                  -- reuses the aliases defined above
        C / LOW5 AS FC,
        C / HIGH5 AS FD,
        max(H / L) OVER five AS FE,                          -- largest high/low ratio in the window
        avg(C) OVER five / avg(C) OVER ten AS FF,            -- 5-row mean close over 10-row mean close
        C / nth_value(C, 5) OVER ten AS FG,                  -- close relative to the 5th close in the frame
        quantile_cont(C, 0.5) OVER ten AS FH,                -- median close
        mad(C) OVER ten AS FI,
        corr(H, L) OVER ten AS FJ,
        regr_slope(H, L) OVER ten AS FK,
        -- Forward-looking columns: the next row's low must be strictly below
        -- its high to count as buyable; returns are measured from the next
        -- row's open to the close 2 rows (gain) or 3 rows (gain3) ahead.
        lead(L, 1) OVER norm < lead(H, 1) OVER norm AS canBuy,
        lead(C, 2) OVER norm / lead(O, 1) OVER norm - 1 AS gain,
        lead(C, 3) OVER norm / lead(O, 1) OVER norm - 1 AS gain3
    FROM 'data.csv'
    WINDOW
        five AS (PARTITION BY code ORDER BY date ASC ROWS BETWEEN 4 PRECEDING AND CURRENT ROW),
        ten  AS (PARTITION BY code ORDER BY date ASC ROWS BETWEEN 9 PRECEDING AND CURRENT ROW),
        norm AS (PARTITION BY code ORDER BY date ASC)
) features
-- A NULL predicate never passes WHERE, so `canBuy` and `abs(gain) < 0.30`
-- already drop rows where canBuy or gain is NULL.
WHERE canBuy
  AND abs(gain) < 0.30
  AND gain3 IS NOT NULL

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

mytable各字段的类型

In [4]:
# One single-entry {column name: dtype name} dict per column, in column order.
[{col_name: col_dtype.name} for col_name, col_dtype in mytable.dtypes.items()]

[{'code': 'int32'},
 {'date': 'datetime64[ns]'},
 {'LOW5': 'float64'},
 {'HIGH5': 'float64'},
 {'FA': 'float64'},
 {'FB': 'float64'},
 {'FC': 'float64'},
 {'FD': 'float64'},
 {'FE': 'float64'},
 {'FF': 'float64'},
 {'FG': 'float64'},
 {'FH': 'float64'},
 {'FI': 'float64'},
 {'FJ': 'float64'},
 {'FK': 'float64'},
 {'canBuy': 'bool'},
 {'gain': 'float64'},
 {'gain3': 'float64'}]

# Percentage Return Ratio
将指定特征排序，按日划分为6个分位，计算每个分位的平均收益。  
 

In [5]:
%%sql df1
SELECT RA, avg(gain) * 100 AS GA, avg(gain3) * 100 AS GA3, count(*) AS c
FROM
(
    -- Rank every row within its date by FA. percent_rank() lies in [0, 1], so
    -- scaling by 5 and casting to int yields bucket ids 0..5.
    -- NOTE(review): if the int cast rounds to nearest, buckets 0 and 5 are
    -- half as wide as buckets 1..4 -- confirm this is intended.
    SELECT cast(percent_rank() OVER wa * 5 AS int) AS RA,
           gain,
           gain3
    FROM mytable
    WINDOW wa AS (PARTITION BY date ORDER BY FA)
)
GROUP BY RA
-- Without an explicit ORDER BY the bucket rows may come back in any order.
ORDER BY RA

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

这里用了GROUP BY CUBE的语法，直接生成括号里所有的特征子集，  
计算每个子CUBE中的平均收益。这个写法用在特征的联合网格分析中比较方便。 

In [6]:
%%sql df2
SELECT RA, RB, RC, avg(gain) * 100 AS GA, avg(gain3) * 100 AS GA3, count(*) AS c
FROM
(
    -- Bucket ids 0..5 for FA, FB and FC, each ranked within its date.
    SELECT cast(percent_rank() OVER wa * 5 AS int) AS RA,
           cast(percent_rank() OVER wb * 5 AS int) AS RB,
           cast(percent_rank() OVER wc * 5 AS int) AS RC,
           gain,
           gain3
    FROM mytable
    WINDOW
        wa AS (PARTITION BY date ORDER BY FA),
        wb AS (PARTITION BY date ORDER BY FB),
        wc AS (PARTITION BY date ORDER BY FC)
)
-- CUBE aggregates over every subset of (RA, RB, RC); a column that is rolled
-- up in a given grouping is returned as NULL.
GROUP BY CUBE (RA, RB, RC)
-- Explicit ordering keeps the grid output stable between runs.
ORDER BY RA, RB, RC

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# Correlation Analysis
相关性分析显示，FF与gain呈负相关，换句话说，当FF指标越大时，这个股票短期内越可能跌  

In [7]:
%%sql df3
SELECT
    -- Correlation of each feature column with gain, one output column per
    -- feature. Expressions are left unaliased so the result column names
    -- stay as they were.
      corr(FA, gain)
    , corr(FB, gain)
    , corr(FC, gain)
    , corr(FD, gain)
    , corr(FE, gain)
    , corr(FF, gain)
    , corr(FG, gain)
    , corr(FH, gain)
    , corr(FI, gain)
    , corr(FJ, gain)
    , corr(FK, gain)
FROM mytable

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# 性能优化
用以下语句将csv文件转换成parquet格式，将极大地提升数据存储效率和文件载入效率。

In [8]:
%%sql df4
COPY (
    -- Quoted path, matching the feature-engineering query, so the CSV is an
    -- unambiguous file reference.
    SELECT * FROM 'data.csv'
) TO 'data.parquet' (FORMAT PARQUET)

SqlcellWidget(data_range=(0, 10), index_sort=('', 0))

# 在sql语句里直接引用python变量

In [9]:
colunm1 = 'FA'
colunm2 = 'FB'
%sql SELECT corr({colunm1}, {colunm2}) from mytable

Unnamed: 0,"corr(""FA"", ""FB"")"
0,-0.118705
