In [13]:
from pandas import read_csv
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, Dropout, LSTM


In [14]:
def rename_columns(prefix, dataset):
    """Prefix every column heading of ``dataset`` with ``prefix + '_'``, in place.

    Args:
        prefix: String placed (followed by an underscore) in front of each heading.
        dataset: DataFrame whose column headings are rewritten; it is mutated.

    Returns:
        None. The frame is modified in place.
    """
    # Build all new headings from the original ones in a single pass. Renaming
    # one heading at a time would rename a column twice whenever a freshly
    # prefixed name equals another original heading (e.g. 'a' and 'eth_a').
    dataset.columns = [prefix + '_' + heading for heading in dataset.columns]

In [15]:
def get_shortest_feature(dataset):
    """Return the smallest number of strictly positive rows among the tracked features.

    For each tracked column the rows whose value is greater than zero are
    counted; the minimum of those counts is returned. Callers subtract it from
    the frame length to find the row at which fully populated data starts.
    """
    tracked_columns = (
        'txVolume(USD)', 'txCount', 'marketcap(USD)', 'price(USD)',
        'exchangeVolume(USD)', 'activeAddresses', 'medianTxValue(USD)',
    )
    return min(len(dataset[dataset[name] > 0]) for name in tracked_columns)

In [16]:
from matplotlib import pyplot

def get_float_headings_list(dataset):
    """Return the integer positions of the columns whose dtype equals ``float``.

    Args:
        dataset: DataFrame to inspect.

    Returns:
        List of zero-based column positions, in column order.
    """
    # Work from the dtypes alone: slicing rows by label (``.loc[:0]``) is not
    # needed and fails on frames whose index is not integer or lacks label 0.
    # Enumerating positions also stays correct when headings are duplicated.
    return [position for position, dtype in enumerate(dataset.dtypes) if dtype == float]

def plot_timeseries_graphs(dataset, name=''):
    """Plot every float column of ``dataset`` as its own stacked subplot.

    Args:
        dataset: DataFrame whose float columns are plotted.
        name: Optional label prepended to each subplot title.

    Returns:
        Whatever ``pyplot.show()`` returns.
    """
    float_positions = get_float_headings_list(dataset)
    data = dataset.values
    panel_count = len(float_positions)

    pyplot.figure()
    # One row per float column; subplot numbering is 1-based.
    for panel, position in enumerate(float_positions, start=1):
        pyplot.subplot(panel_count, 1, panel)
        pyplot.plot(data[:, position])
        pyplot.title(name + ' ' + dataset.columns[position], y=0.5, loc='right')

    return pyplot.show()
    


In [17]:
def slice_base_asset_data(base_dataset, sliced_dataset, date_column_name='date'):
    """Keep the rows of ``base_dataset`` whose ``'eth_date'`` occurs in ``sliced_dataset['date']``.

    NOTE(review): ``date_column_name`` is accepted but not used; both column
    names are hard-coded below. Callers in this notebook pass 'eth_date' for it.

    Returns:
        The matching rows of ``base_dataset`` with their original index labels.
    """
    in_window = base_dataset['eth_date'].isin(sliced_dataset['date'].values)
    return base_dataset.loc[in_window]

## Describe the above data

In [18]:
def merge_frames(dataset, base_dataset, dataset_column_name='date', base_dataset_column_name='eth_date'):
    """Left-join ``base_dataset`` onto ``dataset`` by date.

    Each frame's date column is renamed to ``'date'`` on a renamed copy (the
    inputs are not modified) and the two are merged on that column, keeping
    every row of ``dataset``.
    """
    left = dataset.rename(columns={dataset_column_name: 'date'})
    right = base_dataset.rename(columns={base_dataset_column_name: 'date'})
    return pd.merge(left, right, on='date', how='left')

### Describe Problem with having empty cells in time series data

As a solution, I decided to replace each empty (0 or NaN) cell by interpolation, i.e. for an isolated gap the average of the previous and subsequent cells. This is not perfect, but it will hopefully not affect the results significantly given the very low number of 0-value cells. 

In [19]:
def replace_bad_values(dataset):
    """Return a copy of ``dataset`` with zeros/NaNs filled by interpolation.

    Zeros are first treated as missing. If the first row then contains any
    missing value it is dropped and the index is renumbered from 0. All
    remaining gaps are filled with ``DataFrame.interpolate``.

    Args:
        dataset: DataFrame to clean; it is not modified.

    Returns:
        A new, cleaned DataFrame with the same columns as ``dataset``.
    """
    cleaned = dataset.replace(0, np.nan)
    # Check the first row by position so a frame whose index does not start at
    # label 0 (or an empty frame) does not raise.
    if len(cleaned) > 0 and cleaned.iloc[0].isnull().any():
        # drop=True: without it the old index is inserted as an extra 'index'
        # column, which shifts every positional column reference downstream.
        cleaned = cleaned.iloc[1:].reset_index(drop=True)
    return cleaned.interpolate()



### Add in Additional Columns

In [20]:
def generate_label(price_percent_change, eth_price_percent_change):
    """Label one period: 1 if the asset's percent change beat ETH's, else 0.

    Ties, and comparisons involving NaN, yield 0.
    """
    return 1 if price_percent_change > eth_price_percent_change else 0

In [21]:
def generate_labels(dataset):
    """Build the per-row label list: 1 where the asset out-performed ETH, else 0.

    Compares the ``"price_percent_inrcease(USD)"`` column against
    ``"eth_price_percent_inrcease(USD)"`` row by row. It applies the same rule
    as ``generate_label`` (strictly greater gives 1; ties and NaN give 0), but
    as a single vectorised comparison instead of a Python loop over
    ``iterrows``.

    Args:
        dataset: DataFrame containing both percent-change columns.

    Returns:
        List of ints (0 or 1), one per row, in row order.
    """
    token_change = dataset["price_percent_inrcease(USD)"]
    eth_change = dataset["eth_price_percent_inrcease(USD)"]
    return (token_change > eth_change).astype(int).tolist()

In [22]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
from pandas import concat

def series_to_supervised(data, n_in=10, n_out=6, dropnan=False):
    """Frame a time series as a supervised-learning table of lagged/lead columns.

    Args:
        data: Sequence of observations; a 1-D list, a list of rows, a 2-D
            array or a DataFrame.
        n_in: Number of lag steps (t-n_in ... t-1) used as input columns.
        n_out: Number of forecast steps (t ... t+n_out-1) appended as columns.
        dropnan: When True, rows containing NaN (created by shifting) are dropped.

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``.
    """
    df = DataFrame(data)
    # Count variables on the constructed frame so that lists of rows and 1-D
    # arrays get one name per actual column.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for lead in range(n_out):
        cols.append(df.shift(-lead))
        if lead == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, lead) for j in range(n_vars))
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [53]:
class DataProcessing():
    """Prepares ERC20-token vs. ETH daily data and trains an LSTM classifier on it.

    The stage dictionaries are keyed by token name and are filled by successive
    calls: ``raw_data`` (constructor) -> ``merged_data`` (``merge_data``) ->
    ``clean_data`` (``remove_nans``) -> ``full_data`` (``populate_infered_data``).
    ``final_sets`` is filled separately by ``process_raw_dataset`` and is what
    ``set_train_test_values`` / ``set_eval_values`` are meant to consume.
    """

    def __init__(self, erc20_names = ['ae','bat','fun','gno','gnt','loom','omg','rep','salt','snt','zrx']):
        """Create the empty stage containers and load the raw CSV files.

        Args:
            erc20_names: Token names; each ``<name>.csv`` is read from the
                working directory, together with ``eth.csv``.
        """
        # Copy so the instance aliases neither the shared default list nor the caller's list.
        self.erc20_names = list(erc20_names) # List of all erc20 tokens to be evaluated
        self.raw_data = {} # raw DataFrames from csv
        self.merged_data = {} # erc20 DataFrames with missing data removed & join with eth dataset on date
        self.clean_data = {} # interpolated merged_data set to remove nan values (some first rows removed due to NaN values)
        self.full_data = {} # clean_data plus price-change and label columns
        self.final_sets = {} # scaled, supervised-framed sets produced by process_raw_dataset
        self.eval_sets = {} # per-token {'test_X', 'test_y'} built by set_eval_values

        self.train_X = None
        self.test_X = None
        self.train_y = None
        self.test_y = None
        self.history = None
        self.model = None # set by build_model

        self.IRRELEVANT_COLUMNS = ['date','eth_generatedCoins','eth_averageDifficulty','eth_blockCount','eth_percent_inrcease(USD)','current_performance_vs_eth','performance_vs_eth']

        self.get_raw_datasets(self.erc20_names)

    def get_raw_datasets(self, erc20_names):
        """Read ``eth.csv`` (columns prefixed with ``eth_``) and one CSV per token into ``raw_data``."""
        eth_dataset=read_csv('eth.csv', index_col=False)
        rename_columns('eth', eth_dataset)
        self.raw_data['eth'] = eth_dataset
        for erc20 in erc20_names:
            csv = erc20 + '.csv'
            dataset=read_csv(csv, index_col=False)
            self.raw_data[erc20] = dataset

    def merge_data(self):
        """Slice each token to its fully populated rows and left-join the ETH data on date.

        Results are stored in ``merged_data``; the shapes of the sliced frames
        are printed as a sanity check.
        """
        for erc20 in self.erc20_names:
            dataset = self.raw_data[erc20].copy().fillna(0)
            eth_dataset = self.raw_data['eth'].copy()

            #get the number of rows that have all valid data & slice datasets to only include this
            valid_record_count = len(dataset) - get_shortest_feature(dataset)
            sliced_data = dataset.iloc[valid_record_count:]
            # Only used for the shape printout below; the merge uses the full ETH frame.
            sliced_eth_data = slice_base_asset_data(eth_dataset,sliced_data,'eth_date')

            #sanity check sliced datasets are the same shape
            print('sliced eth data shape {}'.format(sliced_eth_data.shape))
            print('sliced {} data shape {}'.format(erc20, sliced_data.shape))

            #merge erc20 data & eth data (join on date)
            merged_data = merge_frames(sliced_data,eth_dataset)
            self.merged_data[erc20] = merged_data.copy()

    def remove_nans(self):
        """Run ``replace_bad_values`` on every merged frame and store it in ``clean_data``."""
        for erc20 in self.erc20_names:
            merged_data = replace_bad_values(self.merged_data[erc20].copy())
            #Repeat Sanity Check for missing values
            print('{} NaN Cells: {}'.format(erc20, merged_data.isnull().values.sum()))
            self.clean_data[erc20] = merged_data

    def populate_infered_data(self):
        """Add price-change and label columns to every clean frame; store in ``full_data``.

        The first and last rows are dropped from the stored frame (``[1:-1]``):
        the first has no previous day to difference against and the last has no
        next-day label.
        """
        for erc20 in self.erc20_names:

            dataset = self.clean_data[erc20].copy()

            #add nan filled columns (fixes their position in the column order)
            dataset.insert(5, 'price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
            dataset.insert(5, 'price_inrcease(USD)', np.nan, allow_duplicates=False)
            dataset.insert(15, 'eth_price_percent_inrcease(USD)', np.nan, allow_duplicates=False)

            #generate difference from previous day's price
            dataset['price_inrcease(USD)'] = dataset['price(USD)'].diff()
            dataset['eth_price_inrcease(USD)'] = dataset['eth_price(USD)'].diff()

            #generate percentage difference from previous day's price
            dataset['price_percent_inrcease(USD)'] = dataset['price(USD)'].pct_change()
            dataset['eth_price_percent_inrcease(USD)'] = dataset['eth_price(USD)'].pct_change()

            #Fill labels column, uses the next days price data to infer the target for the current day
            dataset['current_performance_vs_eth'] = generate_labels(dataset)
            dataset['next_day_performance_vs_eth'] = dataset['current_performance_vs_eth'].shift(-1)

            #NOTE: WOULD BE COOL TO ADD A HEAT CHART FOR WHAT DAYS ALL ASSETS OUTPERFORMED ETH
            print('{} outperformed ETH on {} / {} days'.format(erc20, dataset['current_performance_vs_eth'].sum(), dataset['current_performance_vs_eth'].count()))
            print(dataset['current_performance_vs_eth'].sum() / dataset['current_performance_vs_eth'].count() * 100,  '% of the time')

            self.full_data[erc20] = dataset[1:-1]

    def process_raw_dataset(self, csv_file, name):
        """Run the whole preparation for one CSV and store the framed set in ``final_sets[name]``.

        Reads ``eth.csv`` and ``csv_file``, plots them, merges, cleans, adds the
        price-change/label columns, drops the irrelevant columns, scales all
        values to [0, 1] and frames them with ``series_to_supervised(…, 1, 1)``.

        Args:
            csv_file: Path of the token CSV to process.
            name: Key under which the result is stored; also used in plot titles.

        Returns:
            A status string naming ``csv_file``.
        """
        #get Eth dataset
        eth_dataset=read_csv('eth.csv', index_col=False)
        rename_columns('eth', eth_dataset)
        dataset=read_csv(csv_file, index_col=False)
        dataset.fillna(0, inplace=True)

        #plot graphs
        plot_timeseries_graphs(dataset, name)
        plot_timeseries_graphs(eth_dataset)

        valid_record_count = len(dataset) - get_shortest_feature(dataset)
        sliced_data = dataset.iloc[valid_record_count:]
        sliced_eth_data = slice_base_asset_data(eth_dataset,sliced_data,'eth_date')

        #sanity check sliced datasets are the same shape
        print('sliced eth data shape {}'.format(sliced_eth_data.shape))
        print('sliced {} data shape {}'.format(name, sliced_data.shape))
        merged_data = merge_frames(sliced_data,eth_dataset)

        #Plot Graphs
        plot_timeseries_graphs(merged_data)

        #remove 0's & NaN
        merged_data = replace_bad_values(merged_data)

        #Repeat Sanity Check for missing values
        print('NaN Cells: {}'.format(merged_data.isnull().values.sum()))

        #Repeat Sanity Check for 0 values
        print('Cells With a 0: {}'.format(merged_data.isin([0]).sum().sum()))
        merged_data.insert(5, 'price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(5, 'price_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(15, 'eth_price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(15, 'eth_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data['performance_vs_eth'] = np.nan

        #generate difference from previous day's price
        merged_data['price_inrcease(USD)'] = merged_data['price(USD)'].diff()
        merged_data['eth_price_inrcease(USD)'] = merged_data['eth_price(USD)'].diff()

        #generate percentage difference from previous day's price
        merged_data['price_percent_inrcease(USD)'] = merged_data['price(USD)'].pct_change()
        merged_data['eth_price_percent_inrcease(USD)'] = merged_data['eth_price(USD)'].pct_change()

        #Fill labels column, uses the next days price data to infer the target for the current day
        merged_data['current_performance_vs_eth'] = generate_labels(merged_data)
        merged_data['next_day_performance_vs_eth'] = merged_data['current_performance_vs_eth'].shift(-1)

        print('{} outperformed ETH on {} / {} days'.format(name, merged_data['current_performance_vs_eth'].sum(), merged_data['current_performance_vs_eth'].count()))
        print(merged_data['current_performance_vs_eth'].sum() / merged_data['current_performance_vs_eth'].count() * 100,  '% of the time')

        #remove irrelevant columns (single source of truth: the list set in __init__)
        relevant_data = merged_data.drop(self.IRRELEVANT_COLUMNS, axis =1)

        #remove the rows labelled 0 and 1 (due to missing data)
        relevant_data = relevant_data.drop(0)
        relevant_data = relevant_data.drop(1)

        # the last row has no next-day label
        relevant_data = relevant_data[:-1]

        # load dataset
        new_dataset = relevant_data
        values = new_dataset.values
        # integer encode direction
        # NOTE(review): column 4 is label-encoded here; confirm this is intended
        # for this dataset and not a leftover from a template.
        encoder = LabelEncoder()
        values[:,4] = encoder.fit_transform(values[:,4])
        # ensure all data is float
        values = values.astype('float32')

        # normalize features
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(values)
        # frame as supervised learning
        reframed = series_to_supervised(scaled, 1, 1)
        # the first framed row has NaN lag values
        reframed = reframed.drop(0)

        self.final_sets[name] = reframed

        return "upload for {} complete".format(csv_file)

    def build_model(self):
        """Build and compile the LSTM + dense binary classifier into ``self.model``.

        Requires ``train_X`` to be set (its last two dimensions give the input shape).

        Returns:
            The string ``"build complete"``.
        """
        self.model = Sequential()
        self.model.add(LSTM(40,input_shape=(self.train_X.shape[1], self.train_X.shape[2])))
        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))
        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))

        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))

        self.model.add(Dense(25))

        self.model.add(Dense(1, activation = 'sigmoid'))

        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        return "build complete"

    #use self.final_sets.get(rep) as the input variable
    def set_train_test_values(self, dataset, n_train_days=250):
        """Split ``dataset`` chronologically into train/test arrays stored on the instance.

        The last column is the target; all other columns are inputs. Inputs are
        reshaped to 3D ``[samples, 1, features]``.

        Args:
            dataset: Object exposing ``.values`` as a 2-D array (e.g. a DataFrame).
            n_train_days: Number of leading rows used for training.
        """
        values = dataset.values
        train = values[:n_train_days, :]
        test = values[n_train_days:, :]

        # split into input and outputs
        self.train_X, self.train_y = train[:, :-1], train[:, -1]
        print(len(self.train_X), len(self.train_y))
        self.test_X, self.test_y = test[:, :-1], test[:, -1]
        print(len(self.test_X), len(self.test_y))

        # reshape input to be 3D [samples, timesteps, features]
        self.train_X = self.train_X.reshape((self.train_X.shape[0], 1, self.train_X.shape[1]))
        self.test_X = self.test_X.reshape((self.test_X.shape[0], 1, self.test_X.shape[1]))
        print(self.train_X.shape, self.train_y.shape, self.test_X.shape, self.test_y.shape)

    def set_eval_values(self, test_days=None):
        """Store the trailing ``test_days`` rows of every final set in ``eval_sets``.

        Args:
            test_days: Number of trailing rows to keep per token. When None,
                the length of ``self.test_y`` is used.
        """
        for erc20 in self.final_sets:

            values = self.final_sets.get(erc20).values
            days = len(values)
            if test_days is None:
                test_days = len(self.test_y)

            n_train_days = days - test_days
            train = values[:n_train_days, :]
            test = values[n_train_days:, :]

            # split into input and outputs
            train_X, train_y = train[:, :-1], train[:, -1]
            print(len(train_X), len(train_y))
            test_X, test_y = test[:, :-1], test[:, -1]
            print(len(test_X), len(test_y))

            # reshape input to be 3D [samples, timesteps, features]
            self.eval_sets[erc20] = {'test_X' : test_X.reshape((test_X.shape[0], 1, test_X.shape[1])), 'test_y' : test_y}

    def fit_model(self):
        """Fit ``self.model`` on the stored train/test arrays and plot loss and accuracy curves."""
        self.history = self.model.fit(self.train_X, self.train_y, epochs=400, batch_size=50, validation_data=(self.test_X, self.test_y), verbose=2, shuffle=False)
        metrics = self.history.history
        # plot history
        pyplot.plot(metrics['loss'], label='train')
        pyplot.plot(metrics['val_loss'], label='test')
        pyplot.legend()
        pyplot.show()

        # Older Keras reports the metric as 'acc'; newer releases use 'accuracy'.
        acc_key = 'acc' if 'acc' in metrics else 'accuracy'
        pyplot.plot(metrics[acc_key], label='train')
        pyplot.plot(metrics['val_' + acc_key], label='test')
        pyplot.legend()
        pyplot.show()



In [54]:
# Instantiate the pipeline (pass erc20_names=[...] to evaluate a subset of tokens).
# The instance is bound to the same name as the class, as the following cells
# expect. Resolving the class first keeps this cell re-runnable after that
# rebinding has already happened.
_data_processing_cls = DataProcessing if isinstance(DataProcessing, type) else type(DataProcessing)
DataProcessing = _data_processing_cls()

In [55]:
# Slice each token to its fully populated rows and left-join the ETH columns on date;
# prints the sliced shapes as a sanity check.
DataProcessing.merge_data()

sliced eth data shape (312, 16)
sliced ae data shape (312, 8)
sliced eth data shape (411, 16)
sliced bat data shape (411, 8)
sliced eth data shape (369, 16)
sliced fun data shape (369, 8)
sliced eth data shape (441, 16)
sliced gno data shape (441, 8)
sliced eth data shape (595, 16)
sliced gnt data shape (595, 8)
sliced eth data shape (119, 16)
sliced loom data shape (119, 8)
sliced eth data shape (367, 16)
sliced omg data shape (367, 8)
sliced eth data shape (346, 16)
sliced rep data shape (346, 8)
sliced eth data shape (292, 16)
sliced salt data shape (292, 8)
sliced eth data shape (384, 16)
sliced snt data shape (384, 8)
sliced eth data shape (336, 16)
sliced zrx data shape (336, 8)


In [56]:
# Clean every merged frame with replace_bad_values; prints the remaining NaN count per token.
DataProcessing.remove_nans()

ae NaN Cells: 0
bat NaN Cells: 0
fun NaN Cells: 0
gno NaN Cells: 0
gnt NaN Cells: 0
loom NaN Cells: 0
omg NaN Cells: 0
rep NaN Cells: 0
salt NaN Cells: 0
snt NaN Cells: 0
zrx NaN Cells: 0


In [57]:
# Add the price-change columns and the outperformed-ETH labels; results land in full_data.
DataProcessing.populate_infered_data()

ae outperformed ETH on 145 / 312 days
46.47435897435898 % of the time
bat outperformed ETH on 190 / 411 days
46.228710462287104 % of the time
fun outperformed ETH on 160 / 368 days
43.47826086956522 % of the time
gno outperformed ETH on 183 / 441 days
41.49659863945578 % of the time
gnt outperformed ETH on 257 / 595 days
43.19327731092437 % of the time
loom outperformed ETH on 53 / 119 days
44.537815126050425 % of the time
omg outperformed ETH on 161 / 367 days
43.869209809264305 % of the time
rep outperformed ETH on 155 / 346 days
44.797687861271676 % of the time
salt outperformed ETH on 120 / 292 days
41.0958904109589 % of the time
snt outperformed ETH on 158 / 384 days
41.14583333333333 % of the time
zrx outperformed ETH on 152 / 336 days
45.23809523809524 % of the time


In [67]:
# Inspect the column dtypes of the fully processed 'ae' frame.
DataProcessing.full_data['ae'].dtypes

date                                object
txVolume(USD)                      float64
txCount                            float64
marketcap(USD)                     float64
price(USD)                         float64
price_inrcease(USD)                float64
price_percent_inrcease(USD)        float64
exchangeVolume(USD)                float64
activeAddresses                    float64
medianTxValue(USD)                 float64
eth_txVolume(USD)                  float64
eth_adjustedTxVolume(USD)          float64
eth_txCount                        float64
eth_marketcap(USD)                 float64
eth_price(USD)                     float64
eth_price_percent_inrcease(USD)    float64
eth_exchangeVolume(USD)            float64
eth_generatedCoins                 float64
eth_fees                           float64
eth_activeAddresses                float64
eth_medianTxValue(USD)             float64
eth_medianFee                      float64
eth_averageDifficulty              float64
eth_payment

In [72]:
# NOTE(review): FEATURES_TO_IGNORE is defined here but not applied in the visible cells.
FEATURES_TO_IGNORE = ['date','current_performance_vs_eth','eth_generatedCoins']

# Keep the dates aside and cast every remaining 'ae' column to float32.
dates = DataProcessing.full_data['ae']['date'].copy()
values = DataProcessing.full_data['ae'].drop('date', axis=1).astype('float32')

# normalize features to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)


In [87]:
# Row count of the supervised framing with the default window (n_in=10, n_out=6).
# dropnan defaults to False, so rows containing NaN from shifting are included.
len(series_to_supervised(scaled))


310

In [None]:
# Collect every token's next-day label into one table keyed by date.
labels = pd.DataFrame()
labels['date'] = DataProcessing.full_data['gnt']['date']
labels['sum'] = np.nan

for key in DataProcessing.erc20_names:
    # Rename explicitly so every token gets a '<label>_<token>' column. Relying on
    # merge suffixes leaves the first merged token without its suffix, because
    # there is no name clash yet on the first merge.
    token_labels = DataProcessing.full_data[key][['date', 'next_day_performance_vs_eth']].rename(
        columns={'next_day_performance_vs_eth': 'next_day_performance_vs_eth_' + key})
    labels = pd.merge(labels, token_labels, how='outer', on='date')

labels[:-1].head()

In [None]:
import ipynb.fs.defs.vis as vis

In [None]:
# Render the labels table with the helper from the project's `vis` notebook
# (presumably a seaborn heatmap; implementation not shown here).
vis.sns_labels_heatmap(labels)

In [None]:
# Render the labels table with the helper from the project's `vis` notebook
# (presumably a plotly heatmap; implementation not shown here).
vis.plotly_labels_heatmap(labels)

In [None]:
# Split the framed 'rep' set into train/test arrays.
# NOTE(review): final_sets is only filled by process_raw_dataset, which no visible
# cell calls; if it was not run, .get('rep') returns None and this call fails.
DataProcessing.set_train_test_values(DataProcessing.final_sets.get('rep'))

In [None]:
# NOTE(review): the original cell held only the fragment `xt_day_performance_vs_eth_omg`,
# which raises NameError. It looks like a truncated lookup of the omg label column; confirm.
labels['next_day_performance_vs_eth_omg'].head()

In [None]:
# Build the per-token evaluation sets from final_sets; with no argument the number of
# evaluation rows is len(test_y), so set_train_test_values must have run first.
DataProcessing.set_eval_values()

In [None]:
# Build and compile the LSTM classifier; the input shape is taken from train_X.
DataProcessing.build_model()

In [None]:
# Display the test inputs produced by set_train_test_values (3D: samples, 1, features).
DataProcessing.test_X

In [None]:
# Does the first merged 'fun' row (index label 0) contain any missing value?
print(
    "yes" if DataProcessing.merged_data['fun'].loc[0].isnull().sum() > 0 else "no"
)

In [None]:
def replace_bad_valuess(dataset):
    """Scratch copy of ``replace_bad_values``: fill zeros/NaNs by interpolation.

    Zeros are treated as missing. If the first row then contains any missing
    value it is dropped and the index is renumbered from 0. Remaining gaps are
    filled with ``DataFrame.interpolate``. The input frame is not modified.
    """
    cleaned = dataset.replace(0, np.nan)
    # Check the first row by position so an index not starting at 0 cannot raise.
    if len(cleaned) > 0 and cleaned.iloc[0].isnull().any():
        # drop=True: otherwise the old index is inserted as an extra 'index' column.
        cleaned = cleaned.iloc[1:].reset_index(drop=True)
    return cleaned.interpolate()

# Try the scratch cleaner on the merged 'fun' frame and display the result.
replace_bad_valuess(DataProcessing.merged_data['fun'])