In [1]:
# Install the package that searches for the ARIMA order (run once if it is missing):
# !pip install pyramid-arima
from pyramid.arima import auto_arima

  from pandas.core import datetools


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import urllib3
import itertools
import time
import datetime
from datetime import timedelta
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from pandas.tools.plotting import lag_plot, scatter_matrix, autocorrelation_plot
from pandas import Series, DataFrame, concat
import statsmodels.api as sm
import statsmodels.tsa.arima_process as tsp
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_process import arma_generate_sample
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import boxcox

import logging

# Route every record at DEBUG level or above to the project log file,
# each line prefixed with a timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    filename="BIA656-crypto_dataset.log",
    format='%(asctime)s %(message)s',
)

# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

In [4]:
# Import the csv file with historical neo data and other variables.
logger.info('Define Source')
# DataFrame.from_csv is deprecated (and removed in pandas 1.0). read_csv with
# index_col=0 and parse_dates=True reproduces its defaults: the first column
# becomes the index and is parsed as dates.
data = pd.read_csv(
    'C:/Users/Stevens/Desktop/BIA 656/Project/neo.csv',
    index_col=0,
    parse_dates=True,
)
logger.debug('df: %s', data)

In [None]:
# Different types of autocorrelation plots
logger.info('acf plotting')

autocorrelation_plot(data['Close'])
sm.graphics.tsa.plot_acf(data['Close'], lags=50)
plot_pacf(data['Close'], lags=50)

# Lag-scatter grid: the first column is the Close series itself and column k
# is the same series shifted down by k rows.
values = DataFrame(data['Close'].values)
lags = 25
dframe = concat([values] + [values.shift(i) for i in range(1, lags + 1)], axis=1)
dframe.columns = ['t+1'] + ['t-' + str(i) for i in range(1, lags + 1)]

# Draw on a brand-new figure. Asking for figure number 1 explicitly would
# re-activate an already open figure with that number instead of creating one.
# matplotlib is imported as `plt` in this notebook; `pyplot` is not defined.
grid_cols = 5
grid_rows = -(-lags // grid_cols)  # ceiling division so every lag gets a panel
fig = plt.figure(figsize=(20, 20))

for i in range(1, lags + 1):
    ax = fig.add_subplot(grid_rows, grid_cols, i)
    ax.set_title('t+1 vs t-' + str(i))
    ax.scatter(x=dframe['t+1'].values, y=dframe['t-' + str(i)].values)
plt.show()

In [5]:
# Move the current index back into a regular column, make sure 'Date' holds
# datetimes, then index the frame by those dates (the 'Date' column is kept).
logging.info('Setting up dataframe')
data = data.reset_index().assign(Date=lambda frame: pd.to_datetime(frame['Date']))
data.index = pd.DatetimeIndex(data['Date'])

In [None]:
# Quick augmented Dickey-Fuller check on the target variable: the series is
# reported as nonstationary when the test statistic lies above the 5%
# critical value, and as stationary otherwise.
logging.info('Checking if target variable is stationary')
stat_check = sm.tsa.adfuller(data['Close'])
adf_statistic = stat_check[0]
critical_5pct = stat_check[4]['5%']
print('Time Series is  nonstationary' if adf_statistic > critical_5pct else 'Time Series is stationary')

In [6]:
# Remove, in place, the columns that are not carried into the lag-building
# steps below.
logging.info('Dropping unnecessary variables')
unused_columns = ['Date', 'Open', 'High', 'Low', 'Market Cap', 'Dependent Variable']
data.drop(unused_columns, axis=1, inplace=True)

In [7]:
# Function to determine lags
logging.info('Determining lags')
def stepwise_fit(data):
    """Return the first element of the ARIMA order selected for ``data``.

    Runs ``auto_arima`` as a seeded random search (``stepwise=False``,
    ``random=True``, 25 fits) with differencing fixed at ``d=1`` and the
    seasonal component switched off, then returns ``order[0]`` of the chosen
    model, i.e. the autoregressive order ``p`` in the usual ``(p, d, q)``
    convention.

    Parameters
    ----------
    data : array-like
        The univariate series handed to ``auto_arima``.

    Returns
    -------
    The value stored at ``order[0]`` of the selected model.
    """
    search_options = dict(
        start_p=1, start_q=1, max_p=30, max_q=2, m=12,
        start_P=0, seasonal=False, d=1, D=1,
        error_action='ignore',   # stay quiet when a candidate order cannot be fitted
        suppress_warnings=True,  # hide convergence warnings
        stepwise=False,
        random=True, random_state=42, n_fits=25,  # random (non-exhaustive) search
    )
    selected_model = auto_arima(data, **search_options)
    return selected_model.order[0]

In [8]:
# Building dictionary of lags: column name -> lag order chosen for that column.
logger.info('Building dictionary of lags')

# Fit each column on its own series. The earlier version selected the column
# but then always passed data.Close, so every variable received the lag order
# of Close. Columns are addressed by position so duplicate labels cannot
# return a frame instead of a series.
df_lags = {
    str(column): stepwise_fit(data.iloc[:, position])
    for position, column in enumerate(data.columns)
}

In [9]:
# Appending lag values to dataframe for each variable.
# For every column of `data`: replace zeros, Box-Cox transform, first-difference,
# then lay out the differenced series next to `lag_periods` shifted copies of it.
logger.info('Appending lag values')

lagged_frames = []
for position, column in enumerate(data.columns):
    variable = str(column)
    lag_periods = df_lags[variable]

    # astype returns a new float Series, so the zero replacement below never
    # writes back into `data`.
    series = data.iloc[:, position].astype(float)

    # boxcox rejects non-positive input: swap exact zeros for half of the
    # smallest positive observation.
    zero_mask = series == 0
    if zero_mask.any():
        positives = series[series > 0]
        if positives.empty:
            raise ValueError('no positive values in column ' + variable)
        series[zero_mask] = positives.min() * .5

    # boxcox returns (transformed values, fitted lambda); only the values are
    # used. Going through a plain array drops the original index, so the
    # result (and therefore df_final) is indexed 0..n-1.
    current = pd.Series(boxcox(series.values)[0]).diff()

    # Column '<variable>: t-k' holds the differenced series shifted by -k rows,
    # followed by the unshifted '<variable>: t' column.
    lag_numbers = range(1, lag_periods + 1)
    frame = pd.concat([current.shift(-lag) for lag in lag_numbers] + [current], axis=1)
    frame.columns = [variable + ': t-' + str(lag) for lag in lag_numbers] + [variable + ': t']
    lagged_frames.append(frame)

# Join all per-variable blocks side by side in one pass.
df_final = pd.concat(lagged_frames, axis=1) if lagged_frames else pd.DataFrame()

In [12]:
# Removing observations with na values: the first row is empty because of the
# differencing, and the last rows are empty because of the negative shifts.
logger.info('Finishing dataset')
# The number of trailing rows to drop equals the largest lag that was appended,
# so derive it from df_lags instead of assuming it is 10.
max_lag = max(df_lags.values()) if df_lags else 0
# An explicit end position avoids the `[1:-0]` pitfall when max_lag is 0.
last_row = max(len(df_final) - max_lag, 1)
df_final = df_final.iloc[1:last_row]

In [13]:
# Write the finished lag matrix to disk as CSV.
logging.info('Saving')
df_final.to_csv(path_or_buf='C:/Users/Stevens/Desktop/BIA 656/Project/df_final.csv')