In [40]:
import pandas as pd
from itertools import combinations

In [13]:
# Load the raw training data from the CSV file (path is relative to the
# working directory).
data = pd.read_csv('train_data.csv') 
# One character per symbol; later cells iterate over this string and match
# each character against data['symbol'].
symbols = 'ABCDEFGHIJ'

## Data Cleaning

In [14]:
# Keep only rows timestamped 06:30:00 or later. 'time' is compared as a
# string, which orders correctly for zero-padded HH:MM:SS values.
data_sub = data.loc[data['time'].ge('06:30:00')]

In [15]:
# Sanity check: every (symbol, day) pair is expected to contribute exactly
# 4680 rows to data_sub; report each pair that does not.
for s in symbols:
    d = data_sub.loc[data_sub['symbol'] == s]
    for i in range(87):
        row_ct = len(d.loc[d['day'] == i])
        if row_ct == 4680:
            continue
        print(f"Inconsistent: {row_ct}, i: {i}, s: {s}!")

Inconsistent: 4665, i: 67, s: A!
Inconsistent: 4668, i: 11, s: B!
Inconsistent: 4679, i: 22, s: B!
Inconsistent: 4679, i: 45, s: B!
Inconsistent: 4667, i: 47, s: B!
Inconsistent: 4670, i: 52, s: B!
Inconsistent: 4320, i: 0, s: D!
Inconsistent: 3960, i: 31, s: D!
Inconsistent: 4669, i: 5, s: F!
Inconsistent: 3960, i: 23, s: F!
Inconsistent: 4678, i: 55, s: F!
Inconsistent: 4669, i: 66, s: F!
Inconsistent: 4320, i: 0, s: H!
Inconsistent: 3960, i: 23, s: I!
Inconsistent: 3960, i: 42, s: I!
Inconsistent: 3960, i: 23, s: J!


In [33]:
# Reference time grid: every timestamp recorded for symbol 'A' on day 2.
# Later cells use it as the template of times each day should contain.
# NOTE(review): presumably chosen because that day has no gaps -- confirm.
full_time = data.loc[(data['symbol'] == 'A') & (data['day'] == 2), 'time'].tolist()

In [63]:
# One day label per entry of the time grid, for each of the 87 days
# (0, 0, ..., 1, 1, ..., 86), so it lines up with full_time repeated 87 times.
full_days = [day for day in range(87) for _ in full_time]

In [66]:
# Complete (time, day) scaffold: the time grid tiled once per day, paired
# with the matching day labels.
full_data = pd.DataFrame({"time": full_time*87, "day": full_days})

### Impute missing data from the immediately following non-null value (backward fill)

In [76]:
# Build one gap-free frame per symbol: left-join the symbol's rows onto the
# complete (time, day) scaffold, then fill the holes from the next row.
full_ds = []
for s in symbols:
    d = data[data['symbol'] == s]
    # The left merge keeps every scaffold row; (time, day) pairs the symbol
    # lacks come back as NaN in the columns taken from `d`.
    d_full = full_data.merge(d, how='left', on=['time', 'day'])
    # Item assignment is the unambiguous way to overwrite a column; attribute
    # assignment only works when the column already exists.
    d_full['symbol'] = s
    # DataFrame.bfill() replaces fillna(method='bfill'), whose `method`
    # argument is deprecated in pandas 2.1 and later.
    # NOTE(review): the fill runs over the whole frame, so a gap at the end of
    # a day is filled from the first row of the following day, and a gap at
    # the very end of the frame stays NaN -- confirm this is intended.
    d_full = d_full.bfill()
    full_ds.append(d_full)

In [77]:
# Preview all per-symbol frames stacked into one. The result is only displayed
# here, not bound to a name, and each frame keeps its own row index (so the
# index restarts for every symbol).
pd.concat(full_ds)

Unnamed: 0,time,day,symbol,open,high,low,close,average
0,06:00:00,0,A,135.54,135.79,135.54,135.79,135.67
1,06:00:05,0,A,135.54,135.79,135.54,135.79,135.67
2,06:00:10,0,A,135.54,135.79,135.54,135.79,135.67
3,06:00:15,0,A,135.54,135.79,135.54,135.79,135.67
4,06:00:20,0,A,135.54,135.79,135.54,135.79,135.67
...,...,...,...,...,...,...,...,...
438475,12:59:35,86,J,186.37,186.40,186.37,186.39,186.40
438476,12:59:40,86,J,186.37,186.39,186.37,186.39,186.37
438477,12:59:45,86,J,186.40,186.43,186.40,186.43,186.42
438478,12:59:50,86,J,186.42,186.44,186.38,186.44,186.41


In [78]:

def split_train_test(dat, split_day=78):
    """Split a frame into train and test parts by its 'day' column.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame with a 'day' column comparable to ``split_day``.
    split_day : int, default 78
        First day that belongs to the test part. The default keeps the
        previously hard-coded cut-off.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``(train, test)``: rows with ``day < split_day`` and rows with
        ``day >= split_day``, each in their original order.
    """
    # Two explicit masks (rather than one mask and its negation) so rows with
    # a missing 'day' land in neither part.
    train_rows = dat['day'] < split_day
    test_rows = dat['day'] >= split_day
    return dat[train_rows], dat[test_rows]


### Build features

In [4]:
# features 
def get_change(l, s):
    """Return the relative change of ``l`` against itself shifted by ``s``.

    Computes ``l / l.shift(s) - 1`` element-wise; positions that have no
    shifted counterpart come out as NaN.
    """
    lagged = l.shift(s)
    ratio = l / lagged
    return ratio - 1

# Display the raw frame as this cell's output.
data

Unnamed: 0,symbol,open,high,low,close,average,time,day
0,B,101.72,101.72,101.72,101.72,101.72,06:00:00,0
1,B,101.72,101.72,101.72,101.72,101.72,06:00:05,0
2,B,101.72,101.72,101.72,101.72,101.72,06:00:10,0
3,B,101.72,101.72,101.72,101.72,101.72,06:00:15,0
4,B,101.72,101.72,101.72,101.72,101.72,06:00:20,0
...,...,...,...,...,...,...,...,...
4330249,H,78.26,78.29,78.25,78.29,78.28,12:59:35,86
4330250,H,78.28,78.29,78.28,78.29,78.28,12:59:40,86
4330251,H,78.29,78.30,78.28,78.30,78.29,12:59:45,86
4330252,H,78.29,78.30,78.26,78.26,78.29,12:59:50,86
