In [1]:
import numpy as np
import csv

In [2]:
def collect_data(filename, delimiter):
    """Read a delimited text file and return each row as one space-joined string.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    delimiter : str
        One-character field separator handed to ``csv.reader``.

    Returns
    -------
    list of str
        One entry per row read by ``csv.reader`` (no row is skipped), with
        that row's fields joined by single spaces.
    """
    import sys

    # The csv module expects a binary-mode file on Python 2 but a text-mode
    # file opened with newline='' on Python 3; passing an 'rb' handle to
    # csv.reader on Python 3 raises an error on the first row.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'r', newline='')
    else:
        csvfile = open(filename, 'rb')
    with csvfile:
        rows = csv.reader(csvfile, delimiter=delimiter)
        return [' '.join(row) for row in rows]

def treatment(data_list):
    """Convert every entry after the first from a string into a list of floats.

    Parameters
    ----------
    data_list : list
        Entry 0 is left untouched; each later entry must be a string of
        whitespace-separated numbers.

    Returns
    -------
    list
        The same list object that was passed in, modified in place.

    Raises
    ------
    ValueError
        If a field in any converted entry cannot be parsed by ``float``.
    """
    # Start at index 1 so the first entry stays exactly as it was passed in.
    for i in range(1, len(data_list)):
        # A list comprehension yields a real list on both Python 2 and
        # Python 3, whereas map() is a lazy iterator on Python 3.
        data_list[i] = [float(field) for field in data_list[i].split()]
    return data_list

# Collect all data 

## Organising the data 

Two options are possible: either put everything in one big data frame, or split it into atomized units. 

Actually, keeping both is a good idea: the single huge data frame gives a global feel for the data, while one data frame per group of 8 entries allows more nuanced analysis. 

You also need fast reads and writes: use HDF for that. 



In [3]:
import pandas as pd

In [11]:
# Load the training and test inputs (read_csv's default comma separator).
training_data = pd.read_csv('training_input.csv')
test_data = pd.read_csv('testing_input.csv')
# The target file is read with ';' as the field separator.
target_data = pd.read_csv('challenge_output_data_training_file_prediction_of_trading_activity_within_the_order_book.csv',
                         delimiter=';')

In [12]:
# Preview the first 10 rows of the target frame.
target_data.head(n=10)

Unnamed: 0,ID,TARGET
0,1,1
1,2,0
2,3,1
3,4,0
4,5,0
5,6,1
6,7,1
7,8,0
8,9,1
9,10,0


In [5]:
from pandas import HDFStore

In [6]:
# Open the HDF5 store backing file 'challenge_data.h5'; the later cells write
# the loaded frames into it and close it.
hdf = HDFStore('challenge_data.h5')

In [7]:
# Store the training frame under the key 'training', using the table format
# with data_columns=True.
hdf.put('training', training_data, format='table', data_columns=True)

In [9]:
# Store the test frame under the key 'test', with the same options as the
# training frame.
hdf.put('test', test_data, format='table', data_columns=True)



In [None]:
# Read the 'test' entry back from the store and show its shape.
hdf['test'].shape

In [None]:
# Close the store once the frames have been written.
hdf.close()

In [5]:
# Report the shape of each loaded frame. A single-argument print(...) call
# produces the same output on Python 2 and Python 3, whereas the bare print
# statement is a syntax error on Python 3.
print(target_data.shape)
print(training_data.shape)
print(test_data.shape)

(587214, 1)
(4697712, 23)
(4697720, 23)


In [6]:
# Re-read the three files with the csv-based loader: each variable becomes a
# list holding one space-joined string per row.
# NOTE(review): test_data and target_data were bound to pandas DataFrames in
# an earlier cell; this cell rebinds those names to lists of strings.
train_data = collect_data('training_input.csv' , delimiter=',')
test_data = collect_data('testing_input.csv', delimiter=',')
target_data = collect_data('challenge_output_data_training_file_prediction_of_trading_activity_within_the_order_book.csv',
                           delimiter = ';')


In [7]:
# Number of rows read from each file, space-separated on one line. The values
# are formatted into a single string so the output is identical on Python 2
# and Python 3 (a multi-argument print statement is Python 2 only).
print('{} {} {}'.format(len(train_data), len(test_data), len(target_data)))

4697713 4697721 587215


In [17]:
# Convert every row after the first from a string to a list of floats.
# NOTE(review): treatment() modifies the list in place and calls .split() on
# each row, so running this cell a second time fails on the already-converted
# rows.
train_data = treatment(train_data)

In [None]:
# Same conversion for the test rows; like the training cell, it is not safe
# to run twice on the same list.
test_data  = treatment(test_data)

'1 -500 1443.0 1444.0 1442.0 1445.0 170 119 509 579 35 17 78 82 1.91590043224 1.76231471084 2.3281993832 2.45821894169 64.0 1.0 9.0 1.0 0.0'

In [21]:
# Take a copy (full slice) of the entry at index 1 of train_data for a spot
# check.
test = train_data[1][:]

In [23]:
# Convert each element to float and display the result. list(...) makes the
# result a real list on Python 3 (where map is lazy) and is a no-op change on
# Python 2; print(...) with one argument works on both.
test = list(map(float, test))
print(test)

[1.0, -1000.0, 1443.0, 1444.0, 1442.0, 1445.0, 170.0, 119.0, 509.0, 579.0, 35.0, 17.0, 78.0, 82.0, 1.91590043224, 1.76231471084, 2.3281993832, 2.45821894169, 64.0, 1.0, 9.0, 1.0, 0.0]


In [17]:
# Split the column names on whitespace.
# NOTE(review): tags2 is not assigned anywhere in the visible part of this
# notebook before this cell, and the cell rebinds tags2 to the result of
# .split(), so a second run would presumably fail -- confirm where tags2 is
# defined.
tags2 = tags2.split()

In [18]:
# Show the column names; single-argument print(...) works on both Python 2
# and Python 3.
print(tags2)

['ID', 'offset', 'bid_1', 'ask_1', 'bid_2', 'ask_2', 'bid_size_1', 'ask_size_1', 'bid_size_2', 'ask_size_2', 'bid_entry_1', 'ask_entry_1', 'bid_entry_2', 'ask_entry_2', 'bid_entropy_1', 'ask_entropy_1', 'bid_entropy_2', 'ask_entropy_2', 'bid_sqentry_1', 'ask_sqentry_1', 'bid_sqentry_2', 'ask_sqentry_2', 'nb_trade']


In [20]:
# Show the entry at index 1 of all_data.
# NOTE(review): all_data is not defined in the visible part of this notebook
# -- confirm which cell creates it.
print(all_data[1])

1 -1000 1443.0 1444.0 1442.0 1445.0 170 119 509 579 35 17 78 82 1.91590043224 1.76231471084 2.3281993832 2.45821894169 64.0 1.0 9.0 1.0 0.0


In [21]:
# Keep the entry at index 1 of all_data for the conversion experiment below.
# NOTE(review): all_data is not defined in the visible part of this notebook
# -- confirm which cell creates it.
test = all_data[1]
print(test)

1 -1000 1443.0 1444.0 1442.0 1445.0 170 119 509 579 35 17 78 82 1.91590043224 1.76231471084 2.3281993832 2.45821894169 64.0 1.0 9.0 1.0 0.0


In [22]:
# Split the row on whitespace into its individual fields and show them.
test = test.split()
print(test)

['1', '-1000', '1443.0', '1444.0', '1442.0', '1445.0', '170', '119', '509', '579', '35', '17', '78', '82', '1.91590043224', '1.76231471084', '2.3281993832', '2.45821894169', '64.0', '1.0', '9.0', '1.0', '0.0']


In [25]:
# Convert every field to float and show the result. list(...) materialises
# the values on Python 3, where map returns a lazy iterator, and changes
# nothing on Python 2.
test = list(map(float, test))
print(test)

[1.0, -1000.0, 1443.0, 1444.0, 1442.0, 1445.0, 170.0, 119.0, 509.0, 579.0, 35.0, 17.0, 78.0, 82.0, 1.91590043224, 1.76231471084, 2.3281993832, 2.45821894169, 64.0, 1.0, 9.0, 1.0, 0.0]


# Splitting the files might work better

In [3]:
from itertools import chain, islice
def chunks(iterable,n):
    """Lazily split ``iterable`` into consecutive chunks of at most ``n`` items.

    Parameters
    ----------
    iterable : iterable
        Source of items; it is consumed exactly once.
    n : int
        Maximum number of items per chunk.

    Yields
    ------
    iterator
        An iterator over the next (up to) ``n`` items. Every chunk reads from
        the same underlying iterator, so each chunk must be fully consumed
        before the next one is requested.
    """
    iterable = iter(iterable)
    while True:
        try:
            first = next(iterable)
        except StopIteration:
            # Source exhausted: end the generator explicitly. Letting
            # StopIteration escape a generator body is turned into a
            # RuntimeError on Python 3.7+ (PEP 479).
            return
        # Pulling the first item eagerly is what detects exhaustion; the rest
        # of the chunk is read lazily.
        yield chain([first], islice(iterable, n - 1))
# Maximum number of lines written to each split file.
# NOTE(review): the shape output earlier in this notebook reports roughly
# 4.7 million rows for training_input.csv, so 30 million lines per chunk
# would presumably produce a single output file -- confirm the intended size.
size = 30*10**6
# NOTE(review): despite its name, test_file points at the training input.
test_file = 'training_input.csv'
with open(test_file) as bigfile:
    # Each element of chunks(...) is a lazy iterator over up to `size` lines.
    for i, lines in enumerate(chunks(bigfile,size)):
        # Output name is '<input name>.<chunk index>', e.g. 'training_input.csv.0'.
        file_split = '{}.{}'.format(test_file,i)
        with open(file_split,'w') as f:
            # writelines consumes the whole chunk before the next one is requested.
            f.writelines(lines)