#### Set up
Run this section before any other section: the later cells rely on the imports and on the price data loaded here.

In [None]:
from datetime import datetime, timedelta
import dataloader_ricequant as dl

import pandas as pd
import rqdatac as rq
import missingno as msno

In [4]:
# Notebook-wide setup: data directory constant, Ricequant initialisation and price load.
# NOTE(review): DATA_PATH is not referenced by any of the cells visible here.
DATA_PATH = './data/' # TODO: Consider separating raw and processed data
# NOTE(review): presumably opens the rqdatac session required by the other dl.* calls -- confirm in dataloader_ricequant.
dl.rq_initialize()
# Price table used by every later section; the later cells treat its index as dates
# and its columns as stock codes.
price_data = dl.load_price_data()



#### Missing value analysis of price data

In [None]:
# Backtest window: 10 years, 2011-01-01 through 2020-12-31 (both days inclusive).
price_data.index = pd.to_datetime(price_data.index)
# Sort chronologically *before* slicing and slice with ascending bounds, so the
# window is selected correctly whatever row order the loader returned. Slicing
# with reversed bounds only selects rows on a descending index; on an ascending
# index it yields an empty frame.
# .copy() keeps the result independent of price_data for later in-place edits.
backtest_price_data = price_data.sort_index().loc['2011-01-01':'2020-12-31'].copy()

In [None]:
# Remove every column (i.e. every stock) that holds no data at all in the
# backtest window. axis=1 drops columns, not rows.
# NOTE(review): an earlier comment here said "row"; confirm that dropping empty
# columns rather than empty dates is what is intended.
backtest_price_data = backtest_price_data.dropna(axis=1, how='all')
backtest_price_data

In [None]:
# Per-stock missing-data summary.
#   missNum  - number of NaN entries in the stock's column
#   missRate - missNum divided by the number of rows in the backtest window
# Only stocks with missRate above 50% are kept, worst first.
nan_counts = backtest_price_data.isnull().sum()
# reset_index() moves the column labels into a regular column; with an unnamed
# column axis that column is called 'index', which the next cell reads.
missing = nan_counts.reset_index()
missing = missing.rename(columns={0: 'missNum'})
missing = missing.sort_values('missNum')
n_rows = backtest_price_data.shape[0]
missing['missRate'] = missing['missNum'] / n_rows
over_half = missing['missRate'] > 0.5
missing = missing[over_half].sort_values(by='missRate', ascending=False)
missing

In [None]:
# Drop the stocks flagged in `missing` (more than 50% of their data absent).
removed_code = missing['index'].to_list()
# Filter in the frame's own column order. A set difference would return the
# remaining codes in arbitrary order (and, for string labels, a different order
# in every Python session), silently reshuffling the columns of the stored data.
removed_lookup = set(removed_code)
backtest_codes = [code for code in backtest_price_data.columns if code not in removed_lookup]

# FIXME: dropping stock data at the end of 10 yrs testing period is a form of look ahead bias
backtest_price_data = backtest_price_data[backtest_codes]

In [None]:
# Persist both price tables as HDF5 files, each under the key 'price'.
# The file names are relative, so the files land in the current working directory.
hdf_outputs = {
    'full price data.h5': price_data,
    '10-year non-ST price data.h5': backtest_price_data,
}
for hdf_name, frame in hdf_outputs.items():
    frame.to_hdf(hdf_name, key='price')

#### Download factor data

In [None]:
# Bulk download of factor data.
# Fill in the factors to download plus the start and end dates below.
# Downloaded files are saved to './data/factor' (handled by dataloader_ricequant).
# Refer to Ricequant doc for all available factors:
# https://www.ricequant.com/welcome/rqdata


factor_list = ['OBOS']
startdate = '20201001'
enddate = '20211001'

# Downloading is opt-in so that re-running the whole notebook does not fetch the
# data again: set the flag to True to download every factor in factor_list for
# the configured date range.
download_factors = False

if download_factors:
    for factor in factor_list:
        dl.download_factor_data(price_data.columns, factor, startdate, enddate)

#### Missing value analysis of factor data

In [6]:
# Load the 'OBOS' factor table through the data loader. As the last expression of
# the cell, its value is what the notebook displays (rows keyed by order_book_id
# and date, with a single OBOS column, per the output shown below).
dl.load_factor_data('OBOS')

Unnamed: 0_level_0,Unnamed: 1_level_0,OBOS
order_book_id,date,Unnamed: 2_level_1
000001.XSHE,2021-08-31,-4.0
000001.XSHE,2021-09-01,-4.0
000001.XSHE,2021-09-02,-2.0
000001.XSHE,2021-09-03,-2.0
000001.XSHE,2021-09-06,0.0
000001.XSHE,2021-09-07,0.0
000001.XSHE,2021-09-08,0.0
000001.XSHE,2021-09-09,0.0
000001.XSHE,2021-09-10,2.0
000001.XSHE,2021-09-13,2.0
