In [1]:
from pandas import read_csv
import numpy as np
import pandas as pd

In [2]:
#Function to add a prefix to all column headings
def rename_columns(prefix, dataset):
    """Prefix every column heading of ``dataset`` with ``prefix + '_'``, in place.

    All headings are renamed in one pass. Renaming them one at a time would
    re-match headings produced by an earlier step: a frame holding both
    ``'a'`` and ``'x_a'`` renamed with prefix ``'x'`` would end up with two
    ``'x_x_a'`` columns instead of ``'x_a'`` and ``'x_x_a'``.

    Args:
        prefix: String placed in front of each heading, followed by ``'_'``.
        dataset: pandas DataFrame whose columns are renamed in place.

    Returns:
        None. ``dataset`` itself is modified.
    """
    # str() lets non-string headings (e.g. integer column labels) be prefixed too.
    dataset.rename(columns=lambda heading: prefix + '_' + str(heading), inplace=True)

In [3]:
#Function to get the number of populated rows in the least populated feature
def get_shortest_feature(dataset, columns=None):
    """Return the smallest count of strictly positive values among feature columns.

    For each column the number of rows whose value is ``> 0`` is counted
    (zeros and NaN do not count); the minimum of those counts is returned.

    Args:
        dataset: pandas DataFrame containing the feature columns.
        columns: Optional iterable of column names to inspect. Defaults to
            the seven token feature columns this notebook works with.

    Returns:
        int: the minimum per-column count of positive values.

    Raises:
        KeyError: if a requested column is missing from ``dataset``.
        ValueError: if ``columns`` is empty.
    """
    if columns is None:
        columns = ['txVolume(USD)', 'txCount', 'marketcap(USD)', 'price(USD)',
                   'exchangeVolume(USD)', 'activeAddresses', 'medianTxValue(USD)']
    # int() keeps the return type a plain Python int, like len() would give.
    return min(int((dataset[col] > 0).sum()) for col in columns)

In [4]:
from matplotlib import pyplot

def get_float_headings_list(dataset):
    """Return the positional indices of the float columns of ``dataset``.

    A column counts as float when its dtype compares equal to the builtin
    ``float``. Only ``dataset.dtypes`` is inspected, so the result does not
    depend on the row index (no label-based row slicing is involved).

    Args:
        dataset: pandas DataFrame.

    Returns:
        list[int]: zero-based column positions, in column order.
    """
    return [position for position, dtype in enumerate(dataset.dtypes) if dtype == float]

def plot_timeseries_graphs(dataset, name=''):
    """Plot each float column of ``dataset`` as its own stacked subplot.

    Args:
        dataset: pandas DataFrame; only the columns reported by
            ``get_float_headings_list`` are drawn.
        name: Text prepended (followed by a space) to every subplot title.

    Returns:
        Whatever ``pyplot.show()`` returns.
    """
    float_positions = get_float_headings_list(dataset)
    data = dataset.values
    n_rows = len(float_positions)

    pyplot.figure()
    # One subplot row per float column; subplot numbering starts at 1.
    for subplot_number, column_position in enumerate(float_positions, start=1):
        pyplot.subplot(n_rows, 1, subplot_number)
        pyplot.plot(data[:, column_position])
        pyplot.title(name + ' ' + dataset.columns[column_position], y=0.5, loc='right')

    return pyplot.show()
    


In [5]:
#Function to slice a base dataset so it only keeps the dates of the dataset being analysed
def slice_base_asset_data(base_dataset, sliced_dataset, date_column_name='date'):
    """Return the rows of ``base_dataset`` whose date occurs in ``sliced_dataset['date']``.

    Args:
        base_dataset: pandas DataFrame of the base asset.
        sliced_dataset: pandas DataFrame with a ``'date'`` column holding the
            dates to keep.
        date_column_name: Name of the date column in ``base_dataset``. When
            ``base_dataset`` has no column of that name, ``'eth_date'`` is
            used instead; this keeps older calls working, because this
            argument used to be ignored and ``'eth_date'`` was always used.

    Returns:
        pandas DataFrame: the matching rows of ``base_dataset``.

    Raises:
        KeyError: if ``sliced_dataset`` has no ``'date'`` column, or neither
            ``date_column_name`` nor ``'eth_date'`` exists in ``base_dataset``.
    """
    if date_column_name in base_dataset.columns:
        base_date_column = date_column_name
    else:
        # Legacy fallback, see the docstring.
        base_date_column = 'eth_date'
    valid_dates = sliced_dataset['date'].values
    return base_dataset.loc[base_dataset[base_date_column].isin(valid_dates)]

## Describe the above data

In [6]:
#Function to left-join two datasets on their date columns
def merge_frames(dataset, base_dataset, dataset_column_name='date', base_dataset_column_name='eth_date'):
    """Left-join ``base_dataset`` onto ``dataset`` by date.

    Both date columns are renamed to ``'date'`` (on copies returned by
    ``rename``) before merging, so the result has one ``'date'`` column and
    keeps every row of ``dataset``.

    Args:
        dataset: Left-hand pandas DataFrame.
        base_dataset: Right-hand pandas DataFrame.
        dataset_column_name: Date column name in ``dataset``.
        base_dataset_column_name: Date column name in ``base_dataset``.

    Returns:
        pandas DataFrame: the merged frame.
    """
    left = dataset.rename(columns={dataset_column_name: 'date'})
    right = base_dataset.rename(columns={base_dataset_column_name: 'date'})
    return left.merge(right, how='left', on='date')

### Describe Problem with having empty cells in time series data

As a solution, I decided to replace each empty (0 or NaN) cell with the average of the previous and subsequent cells. This is not perfect, but it should not affect the results significantly given the very low number of zero-valued cells.

In [7]:
#Function to replace 0/NaN values
def replace_bad_values(dataset):
    """Return a copy of ``dataset`` with zeros and NaN filled by interpolation.

    Zeros are first turned into NaN, then ``DataFrame.interpolate()`` is
    applied with its default settings. The input frame is left untouched;
    callers must use the returned frame.

    NOTE(review): with pandas' default interpolation settings, gaps at the
    very start of a column may stay NaN — check the NaN count afterwards.

    Args:
        dataset: pandas DataFrame to clean.

    Returns:
        pandas DataFrame: the cleaned copy.
    """
    return dataset.replace(0, np.nan).interpolate()


### Add in Additional Columns

In [8]:
#Function to establish whether the token or ETH performed better for the time period. This is the label
def generate_label(price_percent_change, eth_price_percent_change):
    """Return 1 when the token's change is strictly greater than ETH's, else 0.

    Args:
        price_percent_change: Percentage price change of the token.
        eth_price_percent_change: Percentage price change of ETH.

    Returns:
        int: ``1`` if ``price_percent_change > eth_price_percent_change``,
        otherwise ``0`` (ties and NaN comparisons give ``0``).
    """
    return 1 if price_percent_change > eth_price_percent_change else 0

In [9]:
#Function to generate labels column
def generate_labels(dataset):
    """Build the per-row "token outperformed ETH" labels for ``dataset``.

    Applies the same rule as ``generate_label`` to whole columns at once
    instead of iterating row by row: a row is labelled ``1`` when
    ``price_percent_inrcease(USD)`` is strictly greater than
    ``eth_price_percent_inrcease(USD)``, otherwise ``0`` (NaN compares as
    not greater, so it yields ``0``).

    Args:
        dataset: pandas DataFrame containing both percentage-change columns.

    Returns:
        list[int]: one label per row, in row order.

    Raises:
        KeyError: if either column is missing.
    """
    token_change = dataset["price_percent_inrcease(USD)"]
    eth_change = dataset["eth_price_percent_inrcease(USD)"]
    return (token_change > eth_change).astype(int).tolist()

In [10]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
from pandas import concat

# convert series to supervised learning
def series_to_supervised(data, n_in=10, n_out=6, dropnan=False):
    """Frame a (multivariate) series as lagged-input / forecast-output columns.

    The result concatenates, side by side, copies of the data shifted by
    ``n_in`` down to 1 steps (named ``varK(t-i)``), the unshifted data
    (``varK(t)``) and copies shifted backwards by 1 to ``n_out - 1`` steps
    (``varK(t+i)``).

    Args:
        data: A list (treated as one variable) or a 2D array-like exposing
            ``shape[1]`` (one variable per column).
        n_in: Number of lag steps used as input.
        n_out: Number of steps, starting at ``t``, used as output.
        dropnan: When true, rows containing NaN (created by shifting) are
            dropped.

    Returns:
        pandas DataFrame with the shifted columns.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)

    def _suffix(step):
        # Negative steps are lags, zero is "now", positive steps are forecasts.
        if step < 0:
            return 't-%d' % -step
        if step == 0:
            return 't'
        return 't+%d' % step

    # Lags first (oldest to newest), then t, t+1, ... t+n_out-1.
    steps = list(range(-n_in, 0)) + list(range(n_out))
    shifted_frames = [frame.shift(-step) for step in steps]
    column_names = ['var%d(%s)' % (var + 1, _suffix(step))
                    for step in steps for var in range(n_vars)]

    agg = concat(shifted_frames, axis=1)
    agg.columns = column_names
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D
from keras.layers import Dropout
from keras.layers import LSTM

# fit network
# history = model.fit(train_X, train_y, epochs=400, batch_size=50, validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
# pyplot.plot(history.history['loss'], label='train')
# pyplot.plot(history.history['val_loss'], label='test')
# pyplot.legend()
# pyplot.show()

# pyplot.plot(history.history['acc'], label='train')
# pyplot.plot(history.history['val_acc'], label='test')
# pyplot.legend()
# pyplot.show()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [60]:
class DataProcessing():
    """Prepare ERC20-token vs. ETH datasets and train an LSTM classifier on them.

    Flow, as far as the methods show: the constructor loads the raw CSVs;
    ``merge_data`` / ``process_raw_dataset`` join each token with the ETH
    data; ``set_train_test_values`` and ``set_eval_values`` split processed
    arrays; ``build_model`` and ``fit_model`` create and train the network.
    """

    # Tokens loaded when the caller passes no ``erc20_names``. An immutable
    # tuple (without repeated names) so instances never share one mutable
    # default list.
    DEFAULT_ERC20_NAMES = ('ae', 'bat', 'fun', 'gno', 'gnt', 'loom', 'omg',
                           'rep', 'salt', 'snt', 'zrx')

    def __init__(self, erc20_names=None):
        """Create the processor and immediately load the raw CSV files.

        Args:
            erc20_names: Iterable of token names; ``<name>.csv`` is read for
                each one. Defaults to ``DEFAULT_ERC20_NAMES``.
        """
        if erc20_names is None:
            erc20_names = self.DEFAULT_ERC20_NAMES
        self.erc20_names = list(erc20_names) # List of all erc20 tokens to be evaluated
        self.raw_data = {} # raw DataFrames from csv
        self.merged_data = {} # erc20 DataFrames with missing data removed & join with eth dataset on date
        self.final_sets = {} # per-token supervised frames built by process_raw_dataset
        self.eval_sets = {} # per-token {'test_X', 'test_y'} built by set_eval_values
        self.train_X = None
        self.test_X = None
        self.train_y = None
        self.test_y = None
        self.history = None # Keras History returned by fit_model
        self.model = None # set by build_model
        self.get_raw_datasets(self.erc20_names)

    def get_raw_datasets(self, erc20_names):
        """Read ``eth.csv`` and ``<token>.csv`` for each name into ``self.raw_data``.

        The ETH columns are prefixed with ``eth_`` so they stay distinguishable
        after merging with a token frame.
        """
        eth_dataset = read_csv('eth.csv', index_col=False)
        rename_columns('eth', eth_dataset)
        self.raw_data['eth'] = eth_dataset
        for erc20 in erc20_names:
            csv = erc20 + '.csv'
            dataset = read_csv(csv, index_col=False)
            self.raw_data[erc20] = dataset

    def _slice_and_merge(self, dataset, eth_dataset, name):
        """Trim ``dataset`` to its populated tail and left-join the ETH data by date.

        The number of rows kept equals ``get_shortest_feature(dataset)``;
        they are taken from the end of the frame.
        NOTE(review): this presumes the populated rows are the most recent,
        contiguous ones — confirm against the data.
        """
        valid_record_count = len(dataset) - get_shortest_feature(dataset)
        sliced_data = dataset.iloc[valid_record_count:]
        sliced_eth_data = slice_base_asset_data(eth_dataset, sliced_data, 'eth_date')

        #sanity check sliced datasets are the same shape
        print('sliced eth data shape {}'.format(sliced_eth_data.shape))
        print('sliced {} data shape {}'.format(name, sliced_data.shape))
        # The left join keeps only the token's dates, so the full ETH frame can be passed.
        return merge_frames(sliced_data, eth_dataset)

    def merge_data(self):
        """Fill ``self.merged_data`` with one token+ETH frame per configured token."""
        for erc20 in self.erc20_names:
            dataset = self.raw_data[erc20].copy().fillna(0)
            eth_dataset = self.raw_data['eth'].copy()
            merged_data = self._slice_and_merge(dataset, eth_dataset, erc20)
            self.merged_data[erc20] = merged_data.copy()

    def process_raw_dataset(self, csv_file, name):
        """Run the full preparation pipeline for one token CSV.

        Reads ``eth.csv`` and ``csv_file`` from disk, plots them, merges them
        by date, interpolates zeros/NaN, derives price-change columns and the
        outperformance labels, scales everything to [0, 1] and stores the
        supervised frame in ``self.final_sets[name]``.

        Args:
            csv_file: Path of the token CSV.
            name: Key under which the result is stored; also used in plot
                titles and printed messages.

        Returns:
            str: a completion message mentioning ``csv_file``.
        """
        #get Eth dataset
        eth_dataset = read_csv('eth.csv', index_col=False)
        rename_columns('eth', eth_dataset)
        dataset = read_csv(csv_file, index_col=False)
        dataset.fillna(0, inplace=True)

        #plot graphs
        plot_timeseries_graphs(dataset, name)
        plot_timeseries_graphs(eth_dataset)

        merged_data = self._slice_and_merge(dataset, eth_dataset, name)

        #Plot Graphs
        plot_timeseries_graphs(merged_data)

        #remove 0's & NaN
        merged_data = replace_bad_values(merged_data)

        #Repeat Sanity Check for missing values
        print('NaN Cells: {}'.format(merged_data.isnull().values.sum()))

        #Repeat Sanity Check for 0 values
        print('Cells With a 0: {}'.format(merged_data.isin([0]).sum().sum()))

        # Placeholder columns at fixed positions; the real values are assigned below.
        merged_data.insert(5, 'price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(5, 'price_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(15, 'eth_price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        # NOTE(review): this placeholder is named 'eth_percent_inrcease(USD)', is never filled
        # and is dropped further down, while 'eth_price_inrcease(USD)' is appended as a new
        # column at the end. Left as is because changing it alters the column order the
        # rest of the pipeline sees.
        merged_data.insert(15, 'eth_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data['performance_vs_eth'] = np.nan

        #generate difference from previous day's price
        merged_data['price_inrcease(USD)'] = merged_data['price(USD)'].diff()
        merged_data['eth_price_inrcease(USD)'] = merged_data['eth_price(USD)'].diff()

        #generate percentage difference from previous day's price
        merged_data['price_percent_inrcease(USD)'] = merged_data['price(USD)'].pct_change()
        merged_data['eth_price_percent_inrcease(USD)'] = merged_data['eth_price(USD)'].pct_change()

        #Fill labels column, uses the next days price data to infer the target for the current day
        merged_data['current_performance_vs_eth'] = generate_labels(merged_data)
        merged_data['next_day_performance_vs_eth'] = merged_data['current_performance_vs_eth'].shift(-1)

        print('{} outperformed ETH on {} / {} days'.format(name, merged_data['current_performance_vs_eth'].sum(), merged_data['current_performance_vs_eth'].count()))
        print(merged_data['current_performance_vs_eth'].sum() / merged_data['current_performance_vs_eth'].count() * 100,  '% of the time')

        #remove irrelevant columns
        relevant_data = merged_data.drop(['date','eth_generatedCoins','eth_averageDifficulty','eth_blockCount','eth_percent_inrcease(USD)','current_performance_vs_eth','performance_vs_eth'], axis =1)

        #remove the first two rows (row 0 has no previous day, so diff/pct_change are NaN there)
        relevant_data = relevant_data.drop(0)
        relevant_data = relevant_data.drop(1)

        # the last row has no next-day label (shift(-1) leaves NaN there)
        relevant_data = relevant_data[:-1]

        # load dataset
        new_dataset = relevant_data
        values = new_dataset.values
        # integer encode direction
        # NOTE(review): column 4 is label-encoded by position; confirm that this column is
        # really meant to be categorical for this dataset.
        encoder = LabelEncoder()
        values[:,4] = encoder.fit_transform(values[:,4])
        # ensure all data is float
        values = values.astype('float32')

        # normalize features
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(values)
        # frame as supervised learning
        reframed = series_to_supervised(scaled, 1, 1)
        # the first row only holds NaN lags (nothing precedes it), so drop it
        reframed = reframed.drop(0)

        self.final_sets[name] = reframed

        return "upload for {} complete".format(csv_file)

    def build_model(self):
        """Create and compile the LSTM binary classifier in ``self.model``.

        Reads the input shape from ``self.train_X``, so
        ``set_train_test_values`` must have been called first.

        Returns:
            str: ``"build complete"``.
        """
        self.model = Sequential()
        self.model.add(LSTM(40, input_shape=(self.train_X.shape[1], self.train_X.shape[2])))
        # NOTE(review): a dropout rate of 0.8 after each hidden layer is unusually high — confirm.
        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))
        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))

        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))

        self.model.add(Dense(25))

        # single sigmoid unit: probability of the positive label
        self.model.add(Dense(1, activation = 'sigmoid'))

        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        return "build complete"

    @staticmethod
    def _split_features_target(values):
        """Split a 2D array into (every column but the last, the last column)."""
        return values[:, :-1], values[:, -1]

    @staticmethod
    def _as_lstm_input(features):
        """Reshape 2D features to 3D [samples, timesteps=1, features]."""
        return features.reshape((features.shape[0], 1, features.shape[1]))

    #use self.final_sets.get(rep) as the input variable
    def set_train_test_values(self, dataset, n_train_days=250):
        """Split ``dataset`` chronologically into the train/test arrays on ``self``.

        Args:
            dataset: pandas DataFrame whose last column is the target.
            n_train_days: Number of leading rows used for training; the
                remaining rows form the test set.
        """
        values = dataset.values
        train = values[:n_train_days, :]
        test = values[n_train_days:, :]

        # split into input and outputs
        self.train_X, self.train_y = self._split_features_target(train)
        print(len(self.train_X), len(self.train_y))
        self.test_X, self.test_y = self._split_features_target(test)
        print(len(self.test_X), len(self.test_y))

        # reshape input to be 3D [samples, timesteps, features]
        self.train_X = self._as_lstm_input(self.train_X)
        self.test_X = self._as_lstm_input(self.test_X)
        print(self.train_X.shape, self.train_y.shape, self.test_X.shape, self.test_y.shape)

    def set_eval_values(self, test_days=None):
        """Store the trailing ``test_days`` rows of every final set in ``self.eval_sets``.

        Args:
            test_days: Number of trailing rows per token used for evaluation.
                Defaults to ``len(self.test_y)``.

        Raises:
            ValueError: if ``test_days`` is omitted and ``self.test_y`` has
                not been set yet.
        """
        for erc20 in self.final_sets:
            if test_days is None:
                if self.test_y is None:
                    raise ValueError('test_days was not given and test_y is unset; '
                                     'call set_train_test_values first or pass test_days')
                test_days = len(self.test_y)

            values = self.final_sets.get(erc20).values
            days = len(values)
            # Clamp at 0: a negative split point would make the slices below count from the end.
            n_train_days = max(days - test_days, 0)
            train = values[:n_train_days, :]
            test = values[n_train_days:, :]

            # split into input and outputs
            train_X, train_y = self._split_features_target(train)
            print(len(train_X), len(train_y))
            test_X, test_y = self._split_features_target(test)
            print(len(test_X), len(test_y))

            # reshape input to be 3D [samples, timesteps, features]
            self.eval_sets[erc20] = {'test_X' : self._as_lstm_input(test_X), 'test_y' : test_y}

    def fit_model(self):
        """Train ``self.model`` and plot the loss and accuracy curves.

        The Keras ``History`` object is kept in ``self.history``.
        """
        self.history = self.model.fit(self.train_X, self.train_y, epochs=400, batch_size=50, validation_data=(self.test_X, self.test_y), verbose=2, shuffle=False)
        history = self.history.history

        # plot history
        pyplot.plot(history['loss'], label='train')
        pyplot.plot(history['val_loss'], label='test')
        pyplot.legend()
        pyplot.show()

        # Keras versions differ in the key used for the accuracy metric ('acc' vs 'accuracy').
        acc_key = 'acc' if 'acc' in history else 'accuracy'
        pyplot.plot(history[acc_key], label='train')
        pyplot.plot(history['val_' + acc_key], label='test')
        pyplot.legend()
        pyplot.show()



In [61]:
# Build the processor for a subset of tokens; the constructor reads eth.csv and one <token>.csv per name.
# NOTE(review): this rebinds the name `DataProcessing` from the class to the instance, so the class is
# no longer reachable by that name and this cell cannot be re-run without re-running the class cell.
DataProcessing = DataProcessing(erc20_names=['ae','bat','fun','gno','gnt','rep'])


In [64]:
# Trim each token's data to its populated rows and left-join the ETH data on date,
# then display the merged frame for 'ae'.
DataProcessing.merge_data()
DataProcessing.merged_data['ae']

sliced eth data shape (312, 16)
sliced ae data shape (312, 8)
sliced eth data shape (411, 16)
sliced bat data shape (411, 8)
sliced eth data shape (369, 16)
sliced fun data shape (369, 8)
sliced eth data shape (441, 16)
sliced gno data shape (441, 8)
sliced eth data shape (595, 16)
sliced gnt data shape (595, 8)
sliced eth data shape (346, 16)
sliced rep data shape (346, 8)


Unnamed: 0,date,txVolume(USD),txCount,marketcap(USD),price(USD),exchangeVolume(USD),activeAddresses,medianTxValue(USD),eth_txVolume(USD),eth_adjustedTxVolume(USD),...,eth_exchangeVolume(USD),eth_generatedCoins,eth_fees,eth_activeAddresses,eth_medianTxValue(USD),eth_medianFee,eth_averageDifficulty,eth_paymentCount,eth_blockSize,eth_blockCount
0,2017-09-09,2.999494e+06,443.0,101059000.0,0.433691,1451410.0,393.0,1214.977096,6.078558e+09,1.249127e+09,...,5.286210e+08,19039.53125,441.035120,166725.0,5.811137,0.000743,2.321405e+15,207777.0,61062245,3561
1,2017-09-10,2.650581e+06,337.0,101347000.0,0.434929,1295900.0,315.0,1598.364075,5.110929e+09,6.490751e+08,...,6.983260e+08,18988.75000,413.297569,150996.0,15.002251,0.000630,2.289896e+15,187046.0,52465083,3574
2,2017-09-11,1.763552e+06,341.0,89547600.0,0.384291,703492.0,311.0,1152.873000,4.463696e+09,8.171827e+08,...,5.713070e+08,18846.09375,393.332823,173476.0,28.779607,0.000478,2.312746e+15,215137.0,59529264,3551
3,2017-09-12,1.155435e+07,433.0,84851900.0,0.364139,1478770.0,360.0,1244.253131,4.630433e+09,8.667608e+08,...,7.531440e+08,19312.50000,519.977466,196918.0,29.480911,0.000525,2.375290e+15,245382.0,64986807,3603
4,2017-09-13,9.712114e+06,591.0,103432000.0,0.443873,3100890.0,400.0,1890.144396,2.996784e+09,8.568419e+08,...,9.216920e+08,18898.28125,463.058658,171052.0,26.584650,0.000473,2.324138e+15,213717.0,59875748,3536
5,2017-09-14,2.742420e+06,315.0,86771800.0,0.372378,688983.0,261.0,1228.847400,4.322451e+09,1.009428e+09,...,1.184590e+09,19289.68750,408.768330,191003.0,27.663792,0.000525,2.321976e+15,243658.0,62219159,3599
6,2017-09-15,3.403619e+06,301.0,69967800.0,0.300264,1194600.0,260.0,1277.171070,4.075869e+09,1.082351e+09,...,1.935700e+09,19536.40625,636.971537,205482.0,33.147405,0.000689,2.366097e+15,268318.0,69830597,3616
7,2017-09-16,1.367481e+06,272.0,80774600.0,0.346642,807397.0,224.0,1035.607470,2.513055e+09,6.147274e+08,...,7.117370e+08,19337.96875,660.859549,156651.0,24.876678,0.000894,2.350116e+15,226350.0,60261419,3609
8,2017-09-17,6.827055e+05,212.0,83651100.0,0.358986,478048.0,205.0,638.098693,3.001032e+09,4.292309e+08,...,4.274520e+08,19219.53125,371.183068,137378.0,12.397117,0.000840,2.390881e+15,174364.0,52089233,3630
9,2017-09-18,9.489240e+05,238.0,80023400.0,0.343418,680281.0,195.0,931.539643,4.785953e+09,6.147828e+08,...,1.127750e+09,19067.50000,403.163138,179198.0,13.969003,0.000628,2.351653e+15,208657.0,62377123,3557


In [None]:
# Split the processed 'rep' dataset into train/test arrays (default: first 250 rows for training).
# NOTE(review): final_sets is only filled by process_raw_dataset(); no visible cell calls it, so
# .get('rep') returns None here unless that step was run elsewhere — confirm.
DataProcessing.set_train_test_values(DataProcessing.final_sets.get('rep'))

In [None]:
# Build evaluation sets for every processed token; with no argument the window is len(test_y) rows.
DataProcessing.set_eval_values()

In [None]:
# Assemble and compile the LSTM classifier; it reads train_X's shape, so run the train/test split first.
DataProcessing.build_model()

In [None]:
# Display the test inputs prepared by set_train_test_values.
DataProcessing.test_X

In [None]:
# NOTE(review): incomplete expression (trailing dot) — a SyntaxError if executed; complete or delete this cell.
DataProcessing.