In [12]:
# %load crypto_pred_pract.py
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

# load dataset with singling out the rows with desired crypto
df = pd.read_csv(r'C:\Users\Administrator\Desktop\programming\datasets\crypto\crypto-markets.csv')
df = df.loc[df['name'] == 'Bitcoin']

# basic exploration for data organization and possible noise
df.columns

Index(['slug', 'symbol', 'name', 'date', 'ranknow', 'open', 'high', 'low',
       'close', 'volume', 'market', 'close_ratio', 'spread'],
      dtype='object')

In [3]:
df.head(10)

Unnamed: 0,slug,symbol,name,date,ranknow,open,high,low,close,volume,market,close_ratio,spread
0,bitcoin,BTC,Bitcoin,2013-04-28,1,135.3,135.98,132.1,134.21,0.0,1500520000.0,0.5438,3.88
1,bitcoin,BTC,Bitcoin,2013-04-29,1,134.44,147.49,134.0,144.54,0.0,1491160000.0,0.7813,13.49
2,bitcoin,BTC,Bitcoin,2013-04-30,1,144.0,146.93,134.05,139.0,0.0,1597780000.0,0.3843,12.88
3,bitcoin,BTC,Bitcoin,2013-05-01,1,139.0,139.89,107.72,116.99,0.0,1542820000.0,0.2882,32.17
4,bitcoin,BTC,Bitcoin,2013-05-02,1,116.38,125.6,92.28,105.21,0.0,1292190000.0,0.3881,33.32
5,bitcoin,BTC,Bitcoin,2013-05-03,1,106.25,108.13,79.1,97.75,0.0,1180070000.0,0.6424,29.03
6,bitcoin,BTC,Bitcoin,2013-05-04,1,98.1,115.0,92.5,112.5,0.0,1089890000.0,0.8889,22.5
7,bitcoin,BTC,Bitcoin,2013-05-05,1,112.9,118.8,107.14,115.91,0.0,1254760000.0,0.7521,11.66
8,bitcoin,BTC,Bitcoin,2013-05-06,1,115.98,124.66,106.64,112.3,0.0,1289470000.0,0.3141,18.02
9,bitcoin,BTC,Bitcoin,2013-05-07,1,112.25,113.44,97.7,111.5,0.0,1248470000.0,0.8767,15.74


In [4]:
df.isnull().any()

slug           False
symbol         False
name           False
date           False
ranknow        False
open           False
high           False
low            False
close          False
volume         False
market         False
close_ratio    False
spread         False
dtype: bool

In [5]:
# dropping columns that will have little or counterproductive effect on the process
df = df.drop(['symbol', 'slug', 'ranknow', 'spread', 'close_ratio'], 1)
df.columns

Index(['name', 'date', 'open', 'high', 'low', 'close', 'volume', 'market'], dtype='object')

In [6]:
df.describe()

Unnamed: 0,open,high,low,close,volume,market
count,1866.0,1866.0,1866.0,1866.0,1866.0,1866.0
mean,1885.744502,1951.660102,1812.28373,1889.513762,1171548000.0,30782140000.0
std,3321.359095,3458.838432,3153.471445,3323.418678,3023214000.0,56212360000.0
min,68.5,74.56,65.53,68.43,0.0,779255000.0
25%,284.8575,291.1275,280.985,285.31,17266880.0,4051458000.0
50%,565.95,576.295,535.0,565.975,49177350.0,7290880000.0
75%,1055.21,1094.93,1034.7425,1059.6175,244803200.0,16964720000.0
max,19475.8,20089.0,18974.1,19497.4,23840900000.0,326141000000.0


In [19]:
# setting up date column as the index for time series
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,slug,symbol,name,ranknow,open,high,low,close,volume,market,close_ratio,spread
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-04-28,bitcoin,BTC,Bitcoin,1,135.3,135.98,132.1,134.21,0.0,1500520000.0,0.5438,3.88
2013-04-29,bitcoin,BTC,Bitcoin,1,134.44,147.49,134.0,144.54,0.0,1491160000.0,0.7813,13.49
2013-04-30,bitcoin,BTC,Bitcoin,1,144.0,146.93,134.05,139.0,0.0,1597780000.0,0.3843,12.88
2013-05-01,bitcoin,BTC,Bitcoin,1,139.0,139.89,107.72,116.99,0.0,1542820000.0,0.2882,32.17
2013-05-02,bitcoin,BTC,Bitcoin,1,116.38,125.6,92.28,105.21,0.0,1292190000.0,0.3881,33.32


In [20]:
# selecting the beginning date of relevant data, adding some features, dropping other
df = df['2017-01-01':]
df['daily_avg'] = (df['open'] + df['high'] + df['low'] + df['close']) / 4
df['hilo'] = (df['high'] - df['close']) / df['close'] * 100

# adding rolling average feature, taken from previous 6 entries, filling in NaN
df_rolling_avg = df['close']
df_rolling_avg = df_rolling_avg.rolling(window=6).mean()
df_rolling_avg = df_rolling_avg.rename('rolling_avg', inplace=True)
df = pd.concat([df, df_rolling_avg],1)
df = df.fillna(method='backfill')

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNetCV
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from timeit import default_timer as timer

In [22]:
# shifting data to make space for a 3 months worth forecast, splitting for training
df['forecast'] = df['daily_avg'].shift(-90)
X = df.dropna().drop(['forecast'], axis=1)
y = df.dropna()['forecast']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
forecast =  df.tail(90).drop(['forecast'], 1)

# rescaling data
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(X)

ValueError: could not convert string to float: 'bitcoin'

In [23]:
def reg_results_kfold(X, y, regressors):

    result_dict = {}
    cv = KFold(n_splits=5, random_state=42, shuffle=False)

    for name,regressor in regressors:
        
        rmse = []
        mae = []
        r2 = []
               
        for train_index, test_index in cv.split(X):
            X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]
            score = regressor.fit(X_train, y_train)
            prediction = score.predict(X_test)
            
            rmse.append(np.sqrt(mean_squared_error(y_test, prediction)))
            mae.append(mean_absolute_error(y_test, prediction))
            r2.append(score.score(X_train, y_train))
        
        if name not in result_dict:
            result_dict[name] = []
            
        result_dict[name].append(np.mean(rmse))
        result_dict[name].append(np.mean(mae))
        result_dict[name].append(np.mean(r2))

    return result_dict

In [None]:
regressors = [['LinearRegression : ', LinearRegression()],
              ['BayesianRidge : ', BayesianRidge()],
              ['ElasticNetCV : ', ElasticNetCV()],
              ['DecisionTreeRegressor :', DecisionTreeRegressor()],
              ['RandomForestRegressor :', RandomForestRegressor()],
              ['ExtraTreesRegressor :', ExtraTreesRegressor()],
              ['GradientBoostingRegressor :', GradientBoostingRegressor()],
              ['XGBRegressor :', XGBRegressor()]]

In [None]:
reg_columns = ['RMSE', 'MAE', 'R2']

regressor_results_kfold = reg_results_kfold(X, y, regressors)
regressor_results_kfold = pd.DataFrame.from_dict(regressor_results_kfold, orient='index')
regressor_results_kfold.columns = reg_columns
regressor_results_kfold

In [None]:
# picking and using the best performing model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# predicting with the best performing model
predict = model.predict(forecast)

# plotting the results into a graph
plt.figure(figsize=(15,8))
(df[:-90]['daily_avg']).plot(label='Historical Price')
(df[-91:]['daily_avg']).plot(label='Predicted Price')

plt.xlabel('Date')
plt.ylabel('Price')
plt.title('Prediction on Daily Average Price of Bitcoin')
plt.legend()
plt.show()