### Pipeline ID : dataclean

### Input Description

RAW OHLC data.

### Output  

Clean OHLC data in an HDF5 store

### Operations

This code takes a financial market data file and runs it through a processing pipeline. The following operations are carried out:

- Localise the time data to market time
- Merge with existing RAW data based on datetime
- Save the resulting RAW data to HDF5

In [None]:
#!pip install --upgrade ../../quantutils
#git+https://github.com/cwilko/quantutils.git
    
import os
import json
import pandas
import numpy
    
import quantutils.dataset.pipeline as ppl
from quantutils.api.auth import CredentialsFileStore
from quantutils.api.bluemix import ObjectStore
from quantutils.api.marketinsights import MarketInsights

# Pipeline identifier constant.
# NOTE(review): the notebook header names this pipeline "dataclean" - confirm
# which ID is intended.
PIPELINE_ID = "marketdirection"


##############
## Pipeline ##
##############

# JSON configuration file; it supplies the "datasources" list read below and
# the "dataPath" root used when building per-datasource paths.
CONFIG_FILE = "../../marketinsights-datasets/rawConvert.json"

with open(CONFIG_FILE) as config_handle:
    config = json.load(config_handle)

# Datasource definitions iterated by the conversion loop.
DS = config["datasources"]

# Service clients, both constructed from the same credentials file store.
credStore = CredentialsFileStore('~/.marketinsights')
objStore = ObjectStore(credStore)
mi = MarketInsights(credStore)

markets = {}
## Loop over datasources...

for datasource in DS:

    # Per-datasource layout: <dataPath><name>/<name>.hdf, with the raw source
    # files under <dataPath><name>/raw/
    DS_path = config["dataPath"] + datasource["name"] + "/"
    SRC_path = DS_path + "raw/"

    hdfFile = DS_path + datasource["name"] + ".hdf"
    print(hdfFile)

    # Open the store as a context manager so that EVERY datasource's HDF file
    # is closed, including when processing raises. Closing once after the loop
    # would only release the last store opened (and would fail with a
    # NameError when there are no datasources at all).
    with pandas.HDFStore(hdfFile) as hdfStore:

        for timeseries in datasource["timeseries"]:

            # Start from whatever is already stored under this timeseries
            # name, or from an empty frame if nothing has been stored yet.
            if timeseries["name"] in hdfStore:
                tsData = hdfStore[timeseries["name"]]
            else:
                tsData = pandas.DataFrame()

            ## Loop over any source files...
            for infile in os.listdir(SRC_path):

                newData = ppl.loadRawData(datasource, timeseries, SRC_path, infile)
                if newData is None:
                    # Nothing was loaded from this file for this timeseries.
                    continue

                ### RAW PIPELINE #############################################

                newData = ppl.localize(newData, datasource["timezone"], timeseries["timezone"])

                tsData = ppl.merge(newData, tsData)

                ##############################################################

            # NOTE: saving is currently disabled, so the merged data is not
            # written back to the store.
            #ppl.save_hdf(tsData, timeseries["name"], hdfStore)
            # TODO : Back up to object storage
