## dataProcess
### 数据说明
- Trddt 交易日期 - 以yyyy-mm-dd表示
- Agmtcd 合约代码  
- Trdvar 交易品种  
- Fdt003 日开盘价  
- Fdt006 日收盘价 
- Fdt007 日结算价 
- Fdt010 成交量 
### TODO
- 数据标准化
- 去除存在无交易日的期货品种（流动性不足）
- 计算累积收益率

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the raw futures quotes table whose columns are described in the notes above.
data = pd.read_csv(filepath_or_buffer="data/FUT_Fdt.csv")

In [3]:
# Number of distinct traded varieties in the raw table (a missing value, if any, counts once).
data["Trdvar"].nunique(dropna=False)

64

In [3]:
# Order the rows by trading date and then by variety, ahead of the
# volume-based main-contract selection that follows.
data = data.sort_values(by=["Trddt", "Trdvar"])

In [4]:
# Preview the first rows of the table after sorting.
data.head()

Unnamed: 0,Trddt,Agmtcd,Trdvar,Fdt003,Fdt006,Fdt007,Fdt010
35,2017-01-03,L1701,LLDPE,9820.0,9600.0,9650.0,742.0
71,2017-01-03,L1702,LLDPE,9710.0,9710.0,9710.0,0.0
103,2017-01-03,L1703,LLDPE,9855.0,9855.0,9855.0,0.0
129,2017-01-03,L1704,LLDPE,9660.0,9860.0,9760.0,4.0
165,2017-01-03,L1705,LLDPE,9875.0,9710.0,9800.0,415780.0


In [5]:
def majorContract(group):
    '''
    @Description
    Attach the group's largest trading volume as a "max" column.
    ------------
    @Params
    group, pd.DataFrame, one chunk handed over by groupby(...).apply;
    must contain the volume column "Fdt010"
    ------------
    @Returns
    pd.DataFrame, a new frame equal to group plus a "max" column that holds
    the group's maximum "Fdt010" on every row (the input is left untouched)
    '''
    # assign() builds a new frame instead of writing into the object passed in
    # by groupby.apply; pandas documents mutating that object as unsupported.
    return group.assign(max=group["Fdt010"].max())

In [6]:
# Broadcast each (date, variety) group's largest volume onto its rows.
# transform("max") returns a Series aligned with data's own index, so the
# frame keeps its original index and columns regardless of how the installed
# pandas version handles group keys in groupby.apply.
data = data.assign(
    max=data.groupby(["Trddt", "Trdvar"])["Fdt010"].transform("max")
)

In [7]:
# Keep, for every (date, variety) pair, the contract whose volume equals the
# group maximum. If several contracts tie on that volume, keep only the first
# so each pair contributes exactly one row and the per-variety day counts
# computed later are not inflated by duplicates.
data = data[data["Fdt010"] == data["max"]].drop_duplicates(subset=["Trddt", "Trdvar"])

In [8]:
# Drop rows whose main-contract volume is 100 or less, i.e. thinly traded
# futures such as medium-density fibreboard and early indica rice.
data = data.loc[data["max"].gt(100)]

In [11]:
# Keep only the trading date, variety, daily close price and volume columns.
data = data.loc[:, ["Trddt", "Trdvar", "Fdt006", "Fdt010"]]

In [37]:
# Flag the varieties that have no missing trading day.
temp = data.groupby("Trdvar")["Trddt"]
count = temp.count()
tradvars = data["Trdvar"].unique()
# A variety counts as gap-free when it has as many rows as the variety with the most rows.
fullTrade = count.eq(count.max())

In [42]:
# Look up each row's variety in the gap-free flags and keep only the flagged rows.
data = data.loc[data["Trdvar"].map(fullTrade)]

In [46]:
# 35 varieties remain, 1075 trading days.
data["Trdvar"].unique(), data["Trdvar"].nunique(dropna=False)

(array(['LLDPE', 'PTA', '动力煤', '天然橡胶', '棉花', '棕榈油', '热轧卷板', '焦炭', '焦煤',
        '玉米', '玉米淀粉', '玻璃', '甲醇', '白糖', '白银', '石油沥青', '硅铁', '聚丙烯', '聚氯乙烯',
        '菜籽油', '菜籽粕', '螺纹钢', '豆油', '豆粕', '铁矿石', '铅', '铜', '铝', '锌', '锡',
        '锰硅', '镍', '鲜鸡蛋', '黄大豆1号', '黄金'], dtype=object),
 35)

In [48]:
# Switch from the vendor field codes to readable column names.
data = data.rename(
    columns={
        "Trddt": "date",
        "Trdvar": "asset",
        "Fdt006": "close",
        "Fdt010": "volume",
    }
)

In [50]:
# Write the cleaned table to disk without the row index.
data.to_csv(path_or_buf="data/preprocessed.csv", index=False)

## 计算累计收益率并导出

In [4]:
# Reload the cleaned table written by the preprocessing section.
data = pd.read_csv(filepath_or_buffer="data/preprocessed.csv")

In [5]:
# Convert the date strings to datetime values.
# Plain column assignment replaces the column together with its dtype, whereas
# writing through data.loc[:, "date"] may set the values in place on newer
# pandas versions and leave the column with object dtype.
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")

In [6]:
def _cumulative_return(close):
    """Turn a chronologically ordered close-price Series into cumulative returns.

    The first element has no previous close, so its simple return, and hence
    its cumulative return, is NaN.
    """
    previous = close.shift(1)
    # Simple one-period return.
    simple = (close - previous) / previous
    # Compound the one-period returns.
    return (1 + simple).cumprod()


assets = data["asset"].unique()
# Map each asset name to the cumulative-return Series of its date-sorted closes.
returns = {
    asset: _cumulative_return(
        data.loc[data["asset"] == asset].sort_values(by="date")["close"]
    )
    for asset in assets
}

In [7]:
import pickle

# Persist the per-asset cumulative-return mapping.
# The with-statement closes the file on exit, so no explicit close() call is needed.
with open("data/cumReturn.bin", "wb") as f:
    pickle.dump(returns, f)

In [8]:
# Read the pickle back to check the round trip.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open("data/cumReturn.bin", mode="rb") as f:
    test = pickle.loads(f.read())

In [12]:
# Check the type of the object stored under the "LLDPE" key after the round trip.
type(test["LLDPE"])

pandas.core.series.Series

In [13]:
# NOTE(review): f was opened in a with-statement, which already closed it on
# exit; this call looks redundant and could be removed.
f.close()