## Feature Selection

In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler

  from pandas.core import datetools


In [2]:
# Load the input table from CSV, using its `time` column as the row index
df = pd.read_csv(
    'cap1_df.csv',
    index_col='time',
)

In [3]:
# Standardize every column with scikit-learn's StandardScaler
# NOTE(review): the scaler is fitted on all rows of `df`, i.e. before the
# train/test split made further down — confirm this is intended.
sc_x = StandardScaler()
scaled_values = sc_x.fit_transform(df)

# fit_transform returns a bare array, so re-attach the original row index and column names
df_scaled = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

In [4]:
# Perform differencing to make the series stationary

# Create difference function, with default value of lag 24
def difference(dataset, interval=24):
    """Return the lag-``interval`` differences of a sequence.

    Entry ``k`` of the result is the element at position ``k + interval``
    minus the element at position ``k``.

    Parameters
    ----------
    dataset : sequence or pandas.Series
        Values to difference. Elements are always read by position; for a
        pandas object the index labels are ignored.
    interval : int, default 24
        Number of positions between the two elements that are subtracted.

    Returns
    -------
    list
        ``len(dataset) - interval`` differences, or an empty list when the
        input has no more than ``interval`` elements.
    """
    # Plain `series[i]` is a label lookup on a pandas Series, which breaks (or
    # silently picks the wrong row) when the index holds integers that are not
    # 0..n-1. `.iloc` guarantees positional access; lists and arrays are
    # already positional and are used as they are.
    values = dataset.iloc if hasattr(dataset, 'iloc') else dataset
    return [values[pos] - values[pos - interval]
            for pos in range(interval, len(dataset))]

# Differencing the dataset
inter_d = 24

# DataFrame.diff subtracts from each row the row `inter_d` positions earlier,
# column by column, and leaves the first `inter_d` rows as NaN. This gives the
# same frame as differencing each column separately and padding the front with
# NaN, without rebuilding the frame with pd.concat once per column.
df_diff = df_scaled.diff(periods=inter_d)


In [5]:
# Run the augmented Dickey-Fuller test on every differenced series to check for stationarity

# Rows containing NaN (at least the leading rows that differencing leaves empty) are removed first
diff = df_diff.dropna()

# ANSI escape codes that switch bold text on and off around each column name
BOLD, RESET = '\033[1m', '\033[0m'

for col_name in diff.columns:
    adf_result = adfuller(diff[col_name].values)
    # The first two entries of the result are the test statistic and its p-value
    adf_stat, p_value = adf_result[0], adf_result[1]
    print(BOLD + col_name + RESET)
    print('ADF Statistic:', adf_stat)
    print('p-value:', p_value)

[1mXEM[0m
ADF Statistic: -6.83476451521
p-value: 1.85461676998e-09
[1mIOT[0m
ADF Statistic: -5.98499877062
p-value: 1.8009544436e-07
[1mamd[0m
ADF Statistic: -5.02026979199
p-value: 2.02319235484e-05
[1mnvda[0m
ADF Statistic: -5.4958910015
p-value: 2.12394435815e-06
[1mmc_iota[0m
ADF Statistic: -5.68433259244
p-value: 8.34629593537e-07
[1mmc_nem[0m
ADF Statistic: -5.91253699238
p-value: 2.61826788857e-07
[1mgg_crypto[0m
ADF Statistic: -5.64429928845
p-value: 1.01967180794e-06
[1mgg_nem_p[0m
ADF Statistic: -6.93924355512
p-value: 1.0353170023e-09
[1mgg_iota_p[0m
ADF Statistic: -6.95862567119
p-value: 9.28808847821e-10
[1mgg_bitcoin_p[0m
ADF Statistic: -6.09303364912
p-value: 1.0255751682e-07
[1mgg_nem_w[0m
ADF Statistic: -7.02806492145
p-value: 6.28887308455e-10
[1mgg_iota_w[0m
ADF Statistic: -6.46507941208
p-value: 1.41182797081e-08
[1mgg_gpu[0m
ADF Statistic: -5.40863849423
p-value: 3.24853102633e-06
[1mrd_CrypCurr_g[0m
ADF Statistic: -6.16375948434
p-value

In [6]:
# Split the differenced data into predictors (X) and train/test targets (Ytrain, Ytest)

# Predictor columns. Market-cap data is deliberately not used
# (presumably the mc_* columns — verify against the input file).
predictor_cols = [
    'IOT', 'amd', 'nvda', 'gg_crypto',
    'gg_nem_p', 'gg_iota_p', 'gg_bitcoin_p',
    'gg_nem_w', 'gg_iota_w', 'gg_gpu',
    'rd_CrypCurr_g', 'rd_nem_g', 'rd_Iota_g',
]
X = diff[predictor_cols]

# Target is the XEM column: rows before position `split_at` train, the rest test
split_at = 1310
Ytrain = diff[:split_at]['XEM'].to_frame()
Ytest = diff[split_at:]['XEM'].to_frame()

In [7]:
# Performing ElasticNet Regression

# `normalize` is no longer passed: False was its default value, and the argument
# was deprecated in scikit-learn 1.0 and removed in 1.2, where passing it raises
# a TypeError. Omitting it keeps the same fit on old and new versions.
en = ElasticNet(alpha=0.1)
en.fit(X[:1310], Ytrain)  # the first 1310 rows of X line up with Ytrain

# Print every predictor next to its fitted coefficient
coef = list(en.coef_)
# `count` ends at the number of predictors, as before; it is kept as a
# module-level name so any later cell that reads `count` still finds it.
count = 0
for count, (name, weight) in enumerate(zip(X.columns, coef), start=1):
    print(name, ':', weight)

IOT : 0.224049552638
amd : 0.0
nvda : 0.0
gg_crypto : -0.0341984437505
gg_nem_p : 0.0
gg_iota_p : -0.0
gg_bitcoin_p : -0.0
gg_nem_w : 0.0455045188925
gg_iota_w : -0.0
gg_gpu : 0.0
rd_CrypCurr_g : -0.0
rd_nem_g : 0.0
rd_Iota_g : 0.0


In [8]:
# Features kept for the model: the predictors whose ElasticNet coefficient is non-zero.
# The list is maintained by hand from the printed coefficients.
sig = [
    'IOT',
    'gg_crypto',
    'gg_nem_w',
]
# Alternative kept for reference — use all 13 predictors instead:
#sig = ['IOT', 'amd', 'nvda', 'gg_crypto','gg_nem_p', 'gg_iota_p', 'gg_bitcoin_p', 'gg_nem_w', 'gg_iota_w','gg_gpu', 'rd_CrypCurr_g', 'rd_nem_g', 'rd_Iota_g']

In [9]:
# Constructing dataframe

# Take the selected predictor columns straight from X. Building an empty frame
# with `columns=[sig]` passed a nested list, which pandas interprets as the
# levels of a MultiIndex rather than as plain column names; selecting with
# X[sig] keeps ordinary columns in the order given by `sig`.
X_sig = X[sig]

# Save dataframe
# Rows before position 1310 are the training portion, the rest the test portion,
# matching how Ytrain / Ytest were cut.
X_sig[:1310].to_csv('X_train.csv')
X_sig[1310:].to_csv('X_test.csv')
Ytrain.to_csv('Y_train.csv')
Ytest.to_csv('Y_test.csv')

### Conclusions: 
We performed ElasticNet regularization on all 13 predictors. Then, we selected the 3 predictors with non-zero coefficients (`IOT`, `gg_crypto`, `gg_nem_w`) for our model. 