In [4]:
import pandas as pd
from pandas import read_csv, DataFrame, concat

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv1D, Dropout, LSTM

from sklearn.preprocessing import MinMaxScaler, LabelEncoder


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
#Function to add a prefix to all column headings
def rename_columns(prefix, dataset):
    for heading in (dataset.columns):
        dataset.rename(columns = {heading : (prefix + '_' + heading)},inplace=True)

In [6]:
#Function to get the row full data starts
def get_shortest_feature(dataset):
    lenghts = []
    for col in ['txVolume(USD)', 'txCount', 'marketcap(USD)', 'price(USD)',
           'exchangeVolume(USD)', 'activeAddresses', 'medianTxValue(USD)']:
        lenghts.append(len(dataset[dataset[col] >0]))
    return min(lenghts)

In [7]:
from matplotlib import pyplot

def get_float_headings_list(dataset):
    float_headings = []
    float_columns = dataset.loc[:0, dataset.dtypes == float].columns
    for index, heading in enumerate(float_columns):
        float_headings.append(dataset.columns.get_loc(float_columns[index]))
    return float_headings

def plot_timeseries_graphs(dataset, name=''):
    values = dataset.values
    # specify columns to plot
    groups = get_float_headings_list(dataset)
    i = 1
    # plot each column
    pyplot.figure()
    for group in groups:
        pyplot.subplot(len(groups), 1, i)
        pyplot.plot(values[:, group])
        pyplot.title(name + ' ' + dataset.columns[group], y=0.5, loc='right')
        i += 1
        
    return pyplot.show()
    


In [8]:
#function to slice a base dataset based to match with the dates of a dataset to be analyzed
def slice_base_asset_data(base_dataset, sliced_dataset, date_column_name='date'):
    valid_dates = sliced_dataset['date'].values
    sliced_base_data = base_dataset.loc[base_dataset['eth_date'].isin(valid_dates)]
    return sliced_base_data

In [9]:
#Function to merge two datasets by date
def merge_frames(dataset, base_dataset, dataset_column_name='date', base_dataset_column_name='eth_date'):
    x =pd.merge(dataset.rename(columns={dataset_column_name:'date'}), 
                base_dataset.rename(columns={base_dataset_column_name:'date'}), 
                on='date', how='left')
    return x

In [10]:
#Function to replace 0/NaN values, First row is deleted if Nans exist, remaining missing values interploated
def replace_bad_values(dataset):
    dataset = dataset.copy().replace(0,np.nan)
    if dataset.loc[0].isnull().sum() > 0:
        dataset.drop(0, inplace=True)
        dataset.reset_index(inplace=True)
    return dataset.interpolate()



In [11]:
#Function to establish if ETH or REP performed better for the time period. This is the label
def generate_label(price_percent_change, eth_price_percent_change):
    if price_percent_change > eth_price_percent_change:
        return 1
    else:
        return 0

In [12]:
#Function to generate labels column
def generate_labels(dataset):
    performance_vs_eth = []
    for index, row in dataset.iterrows():
        x = generate_label(row["price_percent_inrcease(USD)"], row["eth_price_percent_inrcease(USD)"])
        performance_vs_eth.append(x)
    return performance_vs_eth

In [13]:
# convert series to supervised learning
def series_to_supervised(data, n_in=10, n_out=6, dropnan=False):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [96]:
class DataProcessing():
    
    def __init__(self, erc20_names = ['ae','bat','fun','gno','gnt','loom','omg','rep','salt','snt','zrx']):
        self.erc20_names = erc20_names # List of all erc20 tokens to be evaluated
        self.raw_data = {} # raw DataFrames from csv
        self.merged_data = {} # erc20 DataFrames with missing data removed & join with eth dataset on date
        self.clean_data = {} # interpolated merged_data set to remove nan values (some first rows removed due to NaN values)
        self.normalized_data = {} # normalized data, including corresponding dates & col_names
        self.full_data = {}
        self.final_sets = {}   
        self.eval_sets = {}
        
        self.train_X = None
        self.test_X = None
        self.train_y = None
        self.test_y = None
        self.history = None
        
        self.FEATURES_TO_IGNORE = ['date']


        self.get_raw_datasets(self.erc20_names)
        
    def get_raw_datasets(self, erc20_names):
        eth_dataset=read_csv('eth.csv', index_col=False)
        rename_columns('eth', eth_dataset)
        self.raw_data['eth'] = eth_dataset
        for erc20 in erc20_names:
            csv = erc20 + '.csv'            
            dataset=read_csv(csv, index_col=False)
            self.raw_data[erc20] = dataset
            
    def merge_data(self):
        for erc20 in self.erc20_names:
            dataset = self.raw_data[erc20].copy().fillna(0)
            eth_dataset = self.raw_data['eth'].copy()
            
            #get the number of rows that have all valid data & slice datasets to only include this
            valid_record_count = len(dataset) - get_shortest_feature(dataset) 
            sliced_data = dataset.iloc[valid_record_count:]
            sliced_eth_data = slice_base_asset_data(eth_dataset,sliced_data,'eth_date')
            
            #sanity check sliced datasets are the same shape
            print('sliced eth data shape {}'.format(sliced_eth_data.shape))
            print('sliced {} data shape {}'.format(erc20, sliced_data.shape))
            
            #merge erc20 data & eth data (join on date)
            merged_data = merge_frames(sliced_data,eth_dataset)
            self.merged_data[erc20] = merged_data.copy()
            
    def remove_nans(self):
        for erc20 in self.erc20_names:
            merged_data = replace_bad_values(self.merged_data[erc20].copy())            
            #Repeat Sanity Check for missing values
            print('{} NaN Cells: {}'.format(erc20, merged_data.isnull().values.sum()))
            self.clean_data[erc20] = merged_data
            
    def populate_infered_data(self):
        for erc20 in self.erc20_names:
            
            dataset = self.clean_data[erc20].copy()
            
            #add nan filled columns
            dataset.insert(5, 'price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
            dataset.insert(5, 'price_inrcease(USD)', np.nan, allow_duplicates=False)
            dataset.insert(15, 'eth_price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
#             dataset.insert(15, 'eth_percent_inrcease(USD)', np.nan, allow_duplicates=False)
#             dataset['performance_vs_eth'] = np.nan
            
            #generate difference from previous day's price
            dataset['price_inrcease(USD)'] = dataset['price(USD)'].diff()
            dataset['eth_price_inrcease(USD)'] = dataset['eth_price(USD)'].diff()

            #generate percentage difference from previous day's price
            dataset['price_percent_inrcease(USD)'] = dataset['price(USD)'].pct_change()
            dataset['eth_price_percent_inrcease(USD)'] = dataset['eth_price(USD)'].pct_change()

            #Fill labels column, uses the next days price data to infer the target for the current day
            dataset['current_performance_vs_eth'] = generate_labels(dataset)
            dataset['next_day_performance_vs_eth'] = dataset['current_performance_vs_eth'].shift(-1)
            
            #NOTE: WOULD BE COOL TO ADD A HEAT CHART FOR WHAT DAYS ALL ASSETS OUTPERFORMED ETH
            print('{} outperformed ETH on {} / {} days'.format(erc20, dataset['current_performance_vs_eth'].sum(), dataset['current_performance_vs_eth'].count()))
            print(dataset['current_performance_vs_eth'].sum() / dataset['current_performance_vs_eth'].count() * 100,  '% of the time')

            self.full_data[erc20] = dataset[1:-1]
    
    #normalize features extract dates for future reference
    def normalize_features(self, FEATURES_TO_IGNORE = None, ):
        
        if FEATURES_TO_IGNORE == None:
            FEATURES_TO_IGNORE = self.FEATURES_TO_IGNORE
            
        target_col = ['next_day_performance_vs_eth'] 
        for erc20 in self.erc20_names:
            
            values = DataProcessing.full_data[erc20].copy().drop(FEATURES_TO_IGNORE,axis=1)
            values = values.drop(target_col, axis=1)
            
            col_names = values.columns.values
            col_name_lookup = DataFrame(data = values.columns.values)
            col_name_lookup.insert(0, 'Var_num', range(1, 1+len(col_name_lookup)))
            col_name_lookup['Var_num'] = "var"+str(col_name_lookup['Var_num'])

            dates = DataProcessing.full_data[erc20].copy()['date'] 
            
            targets = DataProcessing.full_data[erc20].copy()['next_day_performance_vs_eth'].tolist()        
            
            values = values.astype('float32')

            # normalize features
            scaler = MinMaxScaler(feature_range=(0, 1))
            scaled = scaler.fit_transform(values)
            
            self.normalized_data[erc20] = {'features' : scaled, 'targets' : targets, 'dates' :  dates, 'col_names' : col_name_lookup}
    
    #fuction to convert features to be lstm compatible (basically adding cols from previous dates)
    def series_to_supervised(self):
        for erc20 in self.erc20_names:
            reframed = series_to_supervised(self.normalized_data[erc20]['features'].copy(), 1, 1)
            
            reframed = reframed.copy().drop(0)
        
            self.final_sets[erc20] = {'features' : reframed, 
                                      'targets' : self.normalized_data[erc20]['targets'].copy()[1:],
                                      'dates' : self.normalized_data[erc20]['dates'].copy()[1:],
                                      'col_names' : self.normalized_data[erc20]['col_names'].copy()
                                     }
            
    def process_raw_dataset(self, csv_file, name):
        #get Eth dataset
        eth_dataset=read_csv('eth.csv', index_col=False)
        rename_columns('eth', eth_dataset)
        dataset=read_csv(csv_file, index_col=False)
        dataset.fillna(0, inplace=True)

        #plot graphs
        plot_timeseries_graphs(dataset, name)
        plot_timeseries_graphs(eth_dataset)

        valid_record_count = len(dataset) - get_shortest_feature(dataset)
        sliced_data = dataset.iloc[valid_record_count:]
        sliced_eth_data = slice_base_asset_data(eth_dataset,sliced_data,'eth_date')

        #sanity check sliced datasets are the same shape
        print('sliced eth data shape {}'.format(sliced_eth_data.shape))
        print('sliced {} data shape {}'.format(name, sliced_data.shape))
        merged_data = merge_frames(sliced_data,eth_dataset)

        #Plot Graphs
        plot_timeseries_graphs(merged_data)

        #remove 0's & NaN
        merged_data = replace_bad_values(merged_data)

        #Repeat Sanity Check for missing values
        print('NaN Cells: {}'.format(merged_data.isnull().values.sum()))

        #Repeat Sanity Check for 0 values
        print('Cells With a 0: {}'.format(merged_data.isin([0]).sum().sum()))
        merged_data.insert(5, 'price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(5, 'price_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(15, 'eth_price_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data.insert(15, 'eth_percent_inrcease(USD)', np.nan, allow_duplicates=False)
        merged_data['performance_vs_eth'] = np.nan

        #generate difference from previous day's price
        merged_data['price_inrcease(USD)'] = merged_data['price(USD)'].diff()
        merged_data['eth_price_inrcease(USD)'] = merged_data['eth_price(USD)'].diff()

        #generate percentage difference from previous day's price
        merged_data['price_percent_inrcease(USD)'] = merged_data['price(USD)'].pct_change()
        merged_data['eth_price_percent_inrcease(USD)'] = merged_data['eth_price(USD)'].pct_change()

        #Fill labels column, uses the next days price data to infer the target for the current day
        merged_data['current_performance_vs_eth'] = generate_labels(merged_data)
        merged_data['next_day_performance_vs_eth'] = merged_data['current_performance_vs_eth'].shift(-1)

        print('{} outperformed ETH on {} / {} days'.format(name, merged_data['current_performance_vs_eth'].sum(), merged_data['current_performance_vs_eth'].count()))
        print(merged_data['current_performance_vs_eth'].sum() / merged_data['current_performance_vs_eth'].count() * 100,  '% of the time')

        #remove irrelevant columns
        relevant_data = merged_data.drop(['date','eth_generatedCoins','eth_averageDifficulty','eth_blockCount','eth_percent_inrcease(USD)','current_performance_vs_eth','performance_vs_eth'], axis =1)

        #remove the first row(due to missing data)
        relevant_data = relevant_data.drop(0)
        relevant_data = relevant_data.drop(1)

        relevant_data = relevant_data[:-1]

        # load dataset
        new_dataset = relevant_data
        values = new_dataset.values
        # integer encode direction
        encoder = LabelEncoder()
        values[:,4] = encoder.fit_transform(values[:,4])
        # ensure all data is float
        values = values.astype('float32')

        # normalize features
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(values)
        # frame as supervised learning
        reframed = series_to_supervised(scaled, 1, 1)
        # drop columns we don't want to predict
        # reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
        # print(len(reframed))
        # print(len(relevant_data))
        reframed = reframed.drop(0)
        


        return "upload for {} complete".format(csv_file)
        
    def build_model(self):
        self.model = Sequential()
        self.model.add(LSTM(40,input_shape=(self.train_X.shape[1], self.train_X.shape[2])))
        # model.add(Flatten())
        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))
        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))

        self.model.add(Dense(25))
        self.model.add(Dropout(0.8))

        self.model.add(Dense(25))

        
        self.model.add(Dense(1, activation = 'sigmoid'))

        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            
        return "build complete"
    
    #use self.final_sets.get(rep) as the input variable
    def set_train_test_values(self, dataset, n_train_days=250):
            values = dataset.values
            train = values[:n_train_days, :]
            test = values[n_train_days:, :]

            # split into input and outputs
            self.train_X, self.train_y = train[:, :-1], train[:, -1]
            print(len(self.train_X), len(self.train_y))
            self.test_X, self.test_y = test[:, :-1], test[:, -1]
            print(len(self.test_X), len(self.test_y))

            # reshape input to be 3D [samples, timesteps, features]
            self.train_X = self.train_X.reshape((self.train_X.shape[0], 1, self.train_X.shape[1]))
            self.test_X = self.test_X.reshape((self.test_X.shape[0], 1, self.test_X.shape[1]))
            print(self.train_X.shape, self.train_y.shape, self.test_X.shape, self.test_y.shape)
            
    def set_eval_values(self, test_days=None):
        for erc20 in self.final_sets:
            
            values = self.final_sets.get(erc20).values
            days = len(values)
            if test_days == None:
                test_days = len(self.test_y)
        
            n_train_days = days - test_days 
            train = values[:n_train_days, :]
            test = values[n_train_days:, :]

            # split into input and outputs
            train_X, train_y = train[:, :-1], train[:, -1]
            print(len(train_X), len(train_y))
            test_X, test_y = test[:, :-1], test[:, -1]
            print(len(test_X), len(test_y))

            # reshape input to be 3D [samples, timesteps, features]
            self.eval_sets[erc20] = {'test_X' : test_X.reshape((test_X.shape[0], 1, test_X.shape[1])), 'test_y' : test_y} 
        
    def fit_model(self):        
        self.history = self.model.fit(self.train_X, self.train_y, epochs=400, batch_size=50, validation_data=(self.test_X, self.test_y), verbose=2, shuffle=False)
        # plot history
        pyplot.plot(self.history.history['loss'], label='train')
        pyplot.plot(self.history.history['val_loss'], label='test')
        pyplot.legend()
        pyplot.show()

        pyplot.plot(self.history.history['acc'], label='train')
        pyplot.plot(self.history.history['val_acc'], label='test')
        pyplot.legend()
        pyplot.show()



In [97]:
# DataProcessing = DataProcessing(erc20_names=['ae','bat','fun','gno','gnt','rep'])
DataProcessing = DataProcessing()

In [98]:
DataProcessing.merge_data()

sliced eth data shape (312, 16)
sliced ae data shape (312, 8)
sliced eth data shape (411, 16)
sliced bat data shape (411, 8)
sliced eth data shape (369, 16)
sliced fun data shape (369, 8)
sliced eth data shape (441, 16)
sliced gno data shape (441, 8)
sliced eth data shape (595, 16)
sliced gnt data shape (595, 8)
sliced eth data shape (119, 16)
sliced loom data shape (119, 8)
sliced eth data shape (367, 16)
sliced omg data shape (367, 8)
sliced eth data shape (346, 16)
sliced rep data shape (346, 8)
sliced eth data shape (292, 16)
sliced salt data shape (292, 8)
sliced eth data shape (384, 16)
sliced snt data shape (384, 8)
sliced eth data shape (336, 16)
sliced zrx data shape (336, 8)


In [99]:
DataProcessing.remove_nans()

ae NaN Cells: 0
bat NaN Cells: 0
fun NaN Cells: 0
gno NaN Cells: 0
gnt NaN Cells: 0
loom NaN Cells: 0
omg NaN Cells: 0
rep NaN Cells: 0
salt NaN Cells: 0
snt NaN Cells: 0
zrx NaN Cells: 0


In [100]:
DataProcessing.populate_infered_data()

ae outperformed ETH on 145 / 312 days
46.47435897435898 % of the time
bat outperformed ETH on 190 / 411 days
46.228710462287104 % of the time
fun outperformed ETH on 160 / 368 days
43.47826086956522 % of the time
gno outperformed ETH on 183 / 441 days
41.49659863945578 % of the time
gnt outperformed ETH on 257 / 595 days
43.19327731092437 % of the time
loom outperformed ETH on 53 / 119 days
44.537815126050425 % of the time
omg outperformed ETH on 161 / 367 days
43.869209809264305 % of the time
rep outperformed ETH on 155 / 346 days
44.797687861271676 % of the time
salt outperformed ETH on 120 / 292 days
41.0958904109589 % of the time
snt outperformed ETH on 158 / 384 days
41.14583333333333 % of the time
zrx outperformed ETH on 152 / 336 days
45.23809523809524 % of the time


In [101]:
DataProcessing.normalize_features()

In [102]:
DataProcessing.series_to_supervised()

In [103]:
len((DataProcessing.normalized_data['ae']['targets'][1:]))

309

In [104]:
DataProcessing.final_sets['ae']['col_names']

Unnamed: 0,Var_num,0
0,var0 1\n1 2\n2 3\n3 4\n4 ...,txVolume(USD)
1,var0 1\n1 2\n2 3\n3 4\n4 ...,txCount
2,var0 1\n1 2\n2 3\n3 4\n4 ...,marketcap(USD)
3,var0 1\n1 2\n2 3\n3 4\n4 ...,price(USD)
4,var0 1\n1 2\n2 3\n3 4\n4 ...,price_inrcease(USD)
5,var0 1\n1 2\n2 3\n3 4\n4 ...,price_percent_inrcease(USD)
6,var0 1\n1 2\n2 3\n3 4\n4 ...,exchangeVolume(USD)
7,var0 1\n1 2\n2 3\n3 4\n4 ...,activeAddresses
8,var0 1\n1 2\n2 3\n3 4\n4 ...,medianTxValue(USD)
9,var0 1\n1 2\n2 3\n3 4\n4 ...,eth_txVolume(USD)


In [106]:
(DataProcessing.final_sets['ae']['features']['var1(t-1)'])

1      0.029297
2      0.018656
3      0.136107
4      0.114007
5      0.030399
6      0.038330
7      0.013905
8      0.005690
9      0.008884
10     0.008807
11     0.002441
12     0.027244
13     0.002041
14     0.002849
15     0.007542
16     0.007110
17     0.011580
18     0.000000
19     0.003784
20     0.011382
21     0.017691
22     0.004259
23     0.011891
24     0.008067
25     0.005727
26     0.000715
27     0.004150
28     0.002003
29     0.002394
30     0.009856
         ...   
280    0.028859
281    0.014583
282    0.047989
283    0.047669
284    0.038319
285    0.072590
286    0.060222
287    0.042668
288    0.050371
289    0.017747
290    0.033783
291    0.031653
292    0.058213
293    0.024585
294    0.052381
295    0.017908
296    0.014298
297    0.054469
298    0.033527
299    0.046687
300    0.030997
301    0.063153
302    0.030256
303    0.063758
304    0.099737
305    0.042046
306    0.031914
307    0.025776
308    0.045185
309    0.034905
Name: var1(t-1), dtype: 

In [None]:
FEATURES_TO_IGNORE = ['date','current_performance_vs_eth','eth_generatedCoins']

values = DataProcessing.full_data['ae'].copy().drop(['date','next_day_performance_vs_eth'],axis=1)
dates = DataProcessing.full_data['ae'].copy()['date']        
values = values.astype('float32')

# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

targets = DataProcessing.full_data['ae'].copy()['next_day_performance_vs_eth']        

In [None]:
len(series_to_supervised(scaled))
scaled

In [None]:
labels = pd.DataFrame()
labels['date'] = DataProcessing.full_data['gnt']['date']
labels['sum'] = np.nan

for key in DataProcessing.erc20_names:
#     labels[key + '_y'] = DataProcessing.full_data[key]['next_day_performance_vs_eth']
    labels = pd.merge(labels, DataProcessing.full_data[key][['date','next_day_performance_vs_eth']],how='outer', left_on='date', right_on='date',suffixes=("", "_" +key))
#     labels[key + 'test'] = np.where(labels['date'] == DataProcessing.full_data[key]['date'], DataProcessing.full_data[key]['next_day_performance_vs_eth'], np.nan)
#     labels[key + 'test'].map(DataProcessing.full_data[key]['next_day_performance_vs_eth'])

labels[:-1].head()

In [None]:
import ipynb.fs.defs.vis as vis

In [None]:
vis.sns_labels_heatmap(labels)

In [23]:
vis.plotly_labels_heatmap(labels)

NameError: name 'vis' is not defined

In [24]:
DataProcessing.set_train_test_values(DataProcessing.final_sets.get('rep'))

TypeError: 'builtin_function_or_method' object is not subscriptable

In [25]:
xt_day_performance_vs_eth_omg

NameError: name 'xt_day_performance_vs_eth_omg' is not defined

In [26]:
DataProcessing.set_eval_values()

TypeError: object of type 'builtin_function_or_method' has no len()

In [27]:
DataProcessing.build_model()

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
DataProcessing.test_X

In [28]:
if DataProcessing.merged_data['fun'].loc[0].isnull().sum() > 0:
    print ("yes")
else:
    print ("no")

no


In [29]:
def replace_bad_valuess(dataset):
    dataset = dataset.copy().replace(0,np.nan)
    if dataset.loc[0].isnull().sum() > 0:
        dataset.drop(0, inplace=True)
        dataset.reset_index(inplace=True)
    return dataset.interpolate()

replace_bad_valuess(DataProcessing.merged_data['fun'])

Unnamed: 0,index,date,txVolume(USD),txCount,marketcap(USD),price(USD),exchangeVolume(USD),activeAddresses,medianTxValue(USD),eth_txVolume(USD),...,eth_exchangeVolume(USD),eth_generatedCoins,eth_fees,eth_activeAddresses,eth_medianTxValue(USD),eth_medianFee,eth_averageDifficulty,eth_paymentCount,eth_blockSize,eth_blockCount
0,1,2017-07-15,5.105779e+05,214.0,49268800.0,0.012832,292390.0,131.0,360.402471,5.025524e+09,...,8.104520e+08,24148.59375,288.040627,107888.0,39.765022,0.000462,1.207945e+15,136491.0,40264684,4603
1,2,2017-07-16,8.043347e+05,329.0,39991000.0,0.010416,424293.0,202.0,176.219971,4.569388e+09,...,1.516890e+09,24009.68750,276.731321,114300.0,27.809779,0.000441,1.243782e+15,148763.0,42202748,4575
2,3,2017-07-17,6.338906e+05,216.0,42268300.0,0.011009,580590.0,160.0,164.584385,3.791200e+09,...,1.669500e+09,23518.28125,281.509048,116017.0,26.459340,0.000441,1.204494e+15,148824.0,43761400,4459
3,4,2017-07-18,7.307962e+05,218.0,49529500.0,0.012900,557570.0,159.0,230.177306,5.374200e+09,...,2.709260e+09,23954.06250,423.677943,124912.0,30.773615,0.000473,1.213897e+15,159970.0,51364024,4551
4,5,2017-07-19,6.219828e+05,214.0,50937500.0,0.013267,594138.0,165.0,313.106785,5.868629e+09,...,2.328790e+09,24483.59375,324.838750,128267.0,30.427694,0.000441,1.233999e+15,161719.0,47678315,4637
5,6,2017-07-20,4.149286e+06,194.0,43985600.0,0.011456,518285.0,142.0,195.515398,4.397296e+09,...,2.225000e+09,24492.81250,256.133190,123585.0,25.662710,0.000431,1.271418e+15,158267.0,43401483,4632
6,7,2017-07-21,4.260789e+05,167.0,47693300.0,0.012422,734587.0,129.0,235.872726,4.864176e+09,...,1.006130e+09,23842.18750,309.641557,116588.0,26.381267,0.000459,1.304971e+15,146635.0,39347024,4543
7,8,2017-07-22,4.124545e+05,195.0,42791700.0,0.011145,604149.0,142.0,100.305000,1.083531e+09,...,6.182600e+08,24307.34375,224.825603,100051.0,23.779710,0.000441,1.278260e+15,122995.0,37152575,4649
8,9,2017-07-23,6.057165e+05,216.0,44790200.0,0.011666,714335.0,175.0,129.013598,1.676927e+09,...,6.496150e+08,24483.28125,288.493998,134918.0,23.649885,0.000448,1.315028e+15,159634.0,43256183,4660
9,10,2017-07-24,7.857785e+05,285.0,52134900.0,0.013579,1074080.0,204.0,196.141105,4.889162e+09,...,4.316910e+08,24058.90625,309.440275,118577.0,23.763935,0.000449,1.302508e+15,144558.0,48384541,4582


In [30]:
targets.tolist()

NameError: name 'targets' is not defined

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

train_x = scaled[:200, :]
test_x = scaled[200:, :]

train_y = targets.tolist()[:200]
test_y = targets.tolist()[200:]

clf.fit(train_x, train_y)

In [None]:
clf.predict(test_x)

In [None]:
clf.score(test_x,test_y)

In [None]:
clf.score(train_x,train_y)

In [None]:
print (clf.decision_path([train_x[1],train_x[2]], check_input=True))

In [None]:
clf.predict_proba(train_x)

In [None]:
clf.feature_importances_