In [33]:
# Data Manipulation
import numpy as np
import pandas as pd

# Plotting graphs
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

df = pd.read_csv("Data/eth_clean_2.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,GDP,inflation,MoneySupply
0,2015-08-07,2.83162,3.53661,2.52112,2.77212,2.77212,164329.0,18224700000000.0,1.041462,11937.3
1,2015-08-08,2.79376,2.79881,0.714725,0.753325,0.753325,674188.0,18224700000000.0,1.041462,11925.9
2,2015-08-09,0.706136,0.87981,0.629191,0.701897,0.701897,532170.0,18224700000000.0,1.041462,11929.2
3,2015-08-10,0.713989,0.729854,0.636546,0.708448,0.708448,405283.0,18224700000000.0,1.041462,11943.1
4,2015-08-11,0.708087,1.13141,0.663235,1.06786,1.06786,1463100.0,18224700000000.0,1.041462,11955.2


In [34]:
df['S_10'] = df['Close'].rolling(window=10).mean()
df['Corr'] = df['Close'].rolling(window=10).corr(df['S_10'])
df['Open-Close'] = df['Open'] - df['Close'].shift(1)
df['Open-Open'] = df['Open'] - df['Open'].shift(1)
df = df.dropna()
y = np.where(df['Close'].shift(-1) > df['Close'],1,-1)
df = df.drop(['Date'],axis=1)
df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,GDP,inflation,MoneySupply,S_10,Corr,Open-Close,Open-Open
18,1.228610,1.241820,1.128650,1.140190,1.140190,1.307180e+06,1.822470e+13,1.041462,12130.5,1.307904,0.789666,-0.002660,-0.116980
19,1.132790,1.202480,1.061830,1.159980,1.159980,1.056750e+06,1.822470e+13,1.041462,12140.9,1.267299,0.875566,-0.007400,-0.095820
20,1.169810,1.188830,1.137290,1.147700,1.147700,6.866620e+05,1.822470e+13,1.041462,12140.9,1.261708,0.911504,0.009830,0.037020
21,1.147660,1.207790,1.120500,1.191380,1.191380,7.218720e+05,1.822470e+13,1.041462,12140.9,1.272141,0.909849,-0.000040,-0.022150
22,1.193530,1.207210,1.149490,1.182550,1.182550,3.753770e+05,1.822470e+13,1.041462,12140.9,1.264510,0.910740,0.002150,0.045870
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2066,2117.728516,2133.187500,1945.442139,1971.077271,1971.077271,3.611627e+10,2.143320e+13,2.000000,18548.7,2002.347180,0.653365,-0.650390,8.235352
2067,1969.133179,2091.516357,1959.079468,2088.573730,2088.573730,2.531296e+10,2.143320e+13,2.000000,18548.7,2029.236059,0.587182,-1.944092,-148.595337
2068,2088.772217,2102.873779,2055.163330,2072.108887,2072.108887,1.981247e+10,2.143320e+13,2.000000,18548.7,2051.843579,0.418043,0.198487,119.639038
2069,2071.111572,2196.996338,2062.787598,2135.942139,2135.942139,2.498624e+10,2.143320e+13,2.000000,18548.7,2073.601587,0.263198,-0.997315,-17.660645


In [35]:
from sklearn.model_selection import train_test_split

X = df
split = int(0.5*len(df))
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30,
                                                    random_state=101)

model = LogisticRegression()
model = model.fit (X_train,y_train)

In [36]:
pd.DataFrame(zip(X.columns, np.transpose(model.coef_)))

Unnamed: 0,0,1
0,Open,[2.665682702714765e-19]
1,High,[2.7578435870802125e-19]
2,Low,[2.3032301789460732e-19]
3,Close,[2.406857025656984e-19]
4,Adj Close,[2.406857025656984e-19]
5,Volume,[1.2359261186578687e-11]
6,GDP,[-2.575993706648313e-15]
7,inflation,[1.3910700414028183e-22]
8,MoneySupply,[2.45255846177277e-18]
9,S_10,[2.3713471284973746e-19]


In [37]:
probability = model.predict_proba(X_test)
probability

array([[0.50845442, 0.49154558],
       [0.51253185, 0.48746815],
       [0.46791565, 0.53208435],
       ...,
       [0.47661373, 0.52338627],
       [0.51173044, 0.48826956],
       [0.51175849, 0.48824151]])

In [38]:
probability = model.predict_proba(X_test)
print(probability)

predicted = model.predict(X_test)

[[0.50845442 0.49154558]
 [0.51253185 0.48746815]
 [0.46791565 0.53208435]
 ...
 [0.47661373 0.52338627]
 [0.51173044 0.48826956]
 [0.51175849 0.48824151]]


In [39]:
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))
print(model.score(X_test,y_test))
cross_val = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print(cross_val)
print(cross_val.mean())

[[190 107]
 [192 127]]
              precision    recall  f1-score   support

          -1       0.50      0.64      0.56       297
           1       0.54      0.40      0.46       319

    accuracy                           0.51       616
   macro avg       0.52      0.52      0.51       616
weighted avg       0.52      0.51      0.51       616

0.5146103896103896
[0.50970874 0.49029126 0.50970874 0.51219512 0.51219512 0.52195122
 0.56097561 0.50731707 0.50731707 0.50731707]
0.5138977030547005


In [40]:
y_test
predicted

array([-1, -1,  1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,
       -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1, -1, -1,
       -1,  1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1, -1,  1,  1, -1,  1,
       -1,  1, -1, -1, -1, -1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1, -1,
        1,  1,  1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1,  1, -1,
        1, -1, -1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1,  1, -1, -1, -1,
       -1,  1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,  1, -1,
       -1, -1,  1,  1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1,  1, -1, -1,
       -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1,  1, -1,  1, -1, -1,
        1,  1, -1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1, -1,
       -1,  1,  1,  1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,
        1,  1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1, -1,  1,  1,  1, -1,
        1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,  1, -1, -1, -1,  1, -1,
        1, -1, -1, -1, -1