In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from pandas.tools.plotting import lag_plot, scatter_matrix, autocorrelation_plot
from pandas import Series, DataFrame, concat
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from scipy.stats import boxcox

import logging
# Route log records of level DEBUG and above to a log file, prefixing each
# message with its timestamp (see the `format` string).
logging.basicConfig(format='%(asctime)s %(message)s', filename="BIA656-crypto_dataset.log", level=logging.DEBUG)
# Module-level logger; cells in this notebook log through it or through `logging` directly.
logger = logging.getLogger(__name__)

In [6]:
# Install the package that searches for the ARIMA order (uncomment and run once if it is missing)
# !pip install pyramid-arima
from pyramid.arima import auto_arima

In [2]:
# Import the csv file with historical neo data and other variables
logger.info('Define Source')
# `DataFrame.from_csv` is deprecated (and removed from current pandas).
# `read_csv` with the first column as index and date parsing enabled is the
# documented drop-in replacement for its defaults.
data = pd.read_csv('~\\neo.csv', index_col=0, parse_dates=True)
# The whole frame is written to the DEBUG log for traceability.
logger.debug('df: %s', data)

In [None]:
# Autocorrelation diagnostics for the Close series (currently disabled).
# The statements below are wrapped in a string literal, so running this cell
# draws nothing; remove the surrounding quotes to produce the plots.

'''
logging.info('acf plotting')

autocorrelation_plot(data['Close'])
sm.graphics.tsa.plot_acf(data['Close'], lags=50)
plot_pacf(data['Close'], lags=50)
'''

In [8]:
# Preview the first rows of the loaded frame to check that the columns parsed as expected.
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Market Cap,Created_At,Sum of retweet_count,Sum of favorite_count,Avg Retweet,Avg favorite_count,Google Trends,Num_upvotes,Avg_Up_Ratio,Post_Count,Comments_Count,Dependent Variable
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-11-30,32.54,34.98,30.09,32.96,78895100,2115360000,3,208,518,69.333333,172.666667,0,1681,0.938182,15,418,1
2017-11-29,37.99,39.44,31.23,32.28,137123000,2469340000,2,318,700,159.0,350.0,83,4048,0.918462,16,1614,0
2017-11-28,40.13,40.13,37.47,38.08,80129000,2608660000,4,281,506,70.25,126.5,83,4548,0.963182,25,1465,0
2017-11-27,38.32,41.91,38.32,40.13,83345900,2491020000,2,216,0,108.0,0.0,83,1596,0.9475,9,294,1
2017-11-26,38.04,39.2,37.49,38.44,61597700,2472680000,1,130,325,130.0,325.0,86,1181,0.938182,13,299,1


In [None]:
# Lag scatter grid (currently disabled): Close plotted against each of its
# first 25 shifted copies on a 5x5 grid of subplots.
# The statements below are wrapped in a string literal, so running this cell
# draws nothing; remove the surrounding quotes to produce the figure.

'''
values = DataFrame(data['Close'].values)
lags = 25
columns = [values]
for i in range(1,(lags + 1)):
	columns.append(values.shift(i))
dframe = concat(columns, axis=1)
columns = ['t+1']
for i in range(1,(lags + 1)):
	columns.append('t-' + str(i))
dframe.columns = columns
plt.figure(1, figsize=(20,20))

for i in range(1,(lags + 1)):
	ax = plt.subplot(5, 5, i)
	ax.set_title('t+1 vs t-' + str(i))
	plt.scatter(x=dframe['t+1'].values, y=dframe['t-'+str(i)].values)
plt.show()
'''

In [9]:
# Setting date as index
logging.info('Setting up dataframe')
# Move the current index back into a regular column so 'Date' can be re-parsed.
data = data.reset_index()
# Overwrite 'Date' with its datetime-typed version (column position is kept).
data = data.assign(Date=pd.to_datetime(data['Date']))
# Index the frame by date while keeping the 'Date' column itself: passing an
# Index object (not a column label) to set_index does not drop the column.
data = data.set_index(pd.DatetimeIndex(data['Date']))

In [None]:
# Quickly determining if target variable is stationary
logging.info('Checking if target variable is stationary')
stat_check = sm.tsa.adfuller(data['Close'])
# Element 0 of the result is compared with the '5%' entry of the dict at
# element 4 (per the statsmodels docs: the ADF test statistic and its
# critical values).
adf_statistic = stat_check[0]
critical_5pct = stat_check[4]['5%']
# A statistic above the 5% critical value is reported as nonstationary.
looks_nonstationary = adf_statistic > critical_5pct
print('Time Series is  nonstationary' if looks_nonstationary else 'Time Series is stationary')

In [10]:
# Setting up dataframe
logging.info('Dropping unnecessary variables')
# Columns that are removed before the lag features are built.
unused_columns = [
    'Date',
    'Open',
    'High',
    'Low',
    'Market Cap',
    'Dependent Variable',
]
# In-place drop along the column axis; re-running this cell raises KeyError
# because the labels are already gone.
data.drop(labels=unused_columns, axis=1, inplace=True)

In [11]:
# Function to determine lags
logging.info('Determining lags')
def stepwise_fit(data):
    """Pick an ARIMA order for `data` with `auto_arima` and return its first component.

    Despite the function name, the search runs with `stepwise=False` and
    `random=True`: a seeded random search over at most 25 candidate fits,
    not an exhaustive or stepwise one.

    Parameters
    ----------
    data : array-like
        The series handed straight to `auto_arima`.

    Returns
    -------
    The first element of the fitted model's `order` attribute (the
    autoregressive order p of ARIMA's (p, d, q)), used by callers as the
    number of lag periods.
    """
    search_options = dict(
        start_p=1, start_q=1, max_p=30, max_q=2, m=12,
        start_P=0, seasonal=False, d=1, D=1,
        # NOTE(review): with seasonal=False the seasonal settings (m, start_P, D)
        # presumably have no effect -- confirm against the library docs.
        error_action='ignore',     # orders that fail to fit are skipped silently
        suppress_warnings=True,    # convergence warnings are not shown
        stepwise=False,
        random=True, random_state=42, n_fits=25,
    )
    fitted_model = auto_arima(data, **search_options)
    return fitted_model.order[0]

In [12]:
# Building dictionary of lags (takes ~10 min)
logger.info('Building dictionary of lags')

# Maps each column name to the lag count chosen for THAT column.
# Fix: the search previously ran on data.Close in every iteration, so the
# per-column series was sliced but never used and all variables silently
# received Close's lag order.
# NOTE(review): lag counts may now differ between columns; any later step that
# trims a fixed number of rows should use max(df_lags.values()) instead.
df_lags = {}
for col_pos, col_name in enumerate(data.columns):
    # Positional access keeps this working even if column labels repeat.
    column_series = data.iloc[:, col_pos]
    df_lags[str(col_name)] = stepwise_fit(column_series)

In [13]:
# Appending lag values to dataframe for each variable
logger.info('Appending lag values')

# One frame per source column; they are joined side by side once at the end
# instead of re-concatenating inside the loop.
lagged_frames = []
for col_pos, variable in enumerate(map(str, data.columns)):
    lag_periods = df_lags[variable]

    # astype() returns a new float Series, so the zero replacement below can
    # never write back into `data` (the original assigned through a slice).
    series = data.iloc[:, col_pos].astype(float)

    # Zeros are replaced with half the smallest positive value before the
    # Box-Cox transform is applied.
    zero_mask = series == 0
    if zero_mask.any():
        positive_values = series[series > 0]
        if positive_values.empty:
            raise ValueError('Column %r has no positive values for the Box-Cox transform' % variable)
        series[zero_mask] = positive_values.min() * .5

    # boxcox returns (transformed values, fitted lambda); keep the values and
    # first-difference them. The result carries a fresh 0..n-1 index, as before.
    transformed = pd.Series(boxcox(series.values)[0]).diff()

    # Column 't-k' holds the value found k rows further down the frame.
    # NOTE(review): the data.head() preview shows newest-first rows, which would
    # make "k rows down" mean "k days earlier" -- confirm for the whole file.
    current_name = variable + ': t'
    lag_names = [variable + ': t-' + str(lag) for lag in range(1, lag_periods + 1)]
    columns_by_name = {
        name: transformed.shift(-lag)
        for lag, name in enumerate(lag_names, start=1)
    }
    columns_by_name[current_name] = transformed

    # Same layout as before: lag columns first (t-1, t-2, ...), then 't'.
    lagged_frames.append(pd.DataFrame(columns_by_name, columns=lag_names + [current_name]))

df_final = pd.concat(lagged_frames, axis=1) if lagged_frames else pd.DataFrame()

In [20]:
# Removing observations at end of dataframe with na values
logger.info('Finishing dataset')

# Move the target one row down so each row pairs the other columns with the
# Close value from the row above.
# NOTE: re-running this cell shifts the column again; rebuild df_final first.
df_final['Close: t'] = df_final['Close: t'].shift(1)

# Leading rows: 'Close: t' is NaN in the first two rows (one from the earlier
# diff(), one from the shift above).
# Trailing rows: a column shifted up by k rows is NaN in its last k rows, so
# the longest lag decides how many rows to cut. This replaces the hard-coded
# 15, which is only right when the longest lag is exactly 15.
max_lag = max(df_lags.values()) if df_lags else 0
# An explicit end position avoids the `[:-0]` pitfall (empty result) when max_lag is 0.
df_final = df_final.iloc[2:len(df_final) - max_lag]

In [21]:
# Saving data
logging.info('Saving')
# Destination of the finished feature table; the path is passed to pandas unchanged.
output_csv_path = '~\\df_final.csv'
df_final.to_csv(output_csv_path)