# Processing

This notebook converts raw CSV files into binary Parquet files for portability.

If you'd like more symbols processed in this format, just drop me a note in the server.

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# Switch between the small demo dataset and the full 5-minute dataset.
demo = True

# folder_name selects the sub-folder under ../data/raw and ../data/processed;
# suffix is the trailing part of each file name (<symbol>_<suffix>).
if demo:
    folder_name = 'demo'
    suffix = 'demo'
else:
    folder_name = '5min'
    suffix = '5min_20160103_20190405'

# Symbols whose raw files get converted.
symbols = ['CL', 'GC', 'ES']

## Helper Methods for Cleaning and Processing Data

In [None]:
def clean_data(data):
    """Return a cleaned copy of ``data`` indexed by timestamp.

    The 'Date' and 'Time' string columns are joined and parsed into the
    index, 'Up' and 'Down' are summed into a single 'Volume' column, and the
    four source columns are dropped. The input frame is left unmodified.

    Format is:
        "Date","Time","Open","High","Low","Close","Up","Down"
        01/03/2016,18:05,52.18,52.90,52.18,52.69,2790,1543

    Args:
        data: DataFrame containing at least the 'Date', 'Time', 'Up' and
            'Down' columns; 'Date' and 'Time' must hold strings.

    Returns:
        A new DataFrame with the remaining columns plus 'Volume', indexed by
        the parsed date/time values.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    timestamps = pd.to_datetime(data['Date'] + ' ' + data['Time'])

    # drop() returns a new frame, so every later assignment lands on the
    # copy instead of leaking an index change and a 'Volume' column into the
    # caller's frame.
    cleaned = data.drop(['Date', 'Time', 'Up', 'Down'], axis=1)

    # Add Volume while ``cleaned`` still shares ``data``'s index so the sum
    # lines up row for row, then swap in the timestamp index.
    cleaned['Volume'] = data['Up'] + data['Down']
    cleaned.index = timestamps

    return cleaned

def process_data(data, fname_symbol, folder_name, suffix):
    """Write ``data`` to a parquet file under ``../data/processed``.

    The frame is converted to a pyarrow table with its index preserved and
    written to
    ``../data/processed/<folder_name>/<fname_symbol>_<suffix>.parquet``.
    The destination folder is created if it does not exist yet, so the
    write does not fail just because the folder is missing.

    Args:
        data: DataFrame to serialise.
        fname_symbol: Symbol used as the first part of the file name.
        folder_name: Sub-folder of ``../data/processed`` to write into.
        suffix: Trailing part of the file name, before ``.parquet``.

    Returns:
        None. The name of the written file is printed.
    """
    output = '{}_{}.parquet'.format(fname_symbol, suffix)
    output_dir = '../data/processed/{}'.format(folder_name)

    # A missing target folder would otherwise make write_table fail.
    os.makedirs(output_dir, exist_ok=True)

    table = pa.Table.from_pandas(data, preserve_index=True)
    pq.write_table(table, '{}/{}'.format(output_dir, output))
    print('Wrote {}'.format(output))

## Convert the CSVs to DataFrames, write them as Parquet, and read them back to validate the file paths

In [None]:
for fname_symbol in symbols:
    # Convert: load the raw text file, clean it, and write it out as parquet.
    raw_path = '../data/raw/{}/{}_{}.txt'.format(folder_name, fname_symbol, suffix)
    data = clean_data(pd.read_csv(raw_path))
    process_data(data, fname_symbol, folder_name, suffix)

    # Verify: reload the parquet file that was just written and show its shape.
    processed_dir = '../data/processed/{}/'.format(folder_name)
    parquet_name = '{}_{}.parquet'.format(fname_symbol, suffix)
    df = pd.read_parquet(os.path.join(processed_dir, parquet_name))
    print(df.shape)