In [1]:
import pandas as pd
import quandl,datetime
import math
import numpy as np
from sklearn import preprocessing, svm #scale, regresions, cross shuffle stats sepeareate data
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
from matplotlib import style

In [2]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [3]:
# Apply the ggplot look to every figure drawn below
plt.style.use('ggplot')

In [4]:
# Load the daily BTC-USD quotes; the first CSV column becomes the index and is parsed as dates
data = pd.read_csv(
    'BTC-USD.csv',
    index_col=0,
    parse_dates=True,
)
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-02-26,424.628998,432.152008,421.619995,432.152008,432.152008,61486000.0
2016-02-27,432.838989,434.230988,428.102997,432.519012,432.519012,41893600.0
2016-02-28,432.571014,435.683014,423.820007,433.503998,433.503998,53033400.0
2016-02-29,433.437988,441.506989,431.692993,437.696991,437.696991,60694700.0
2016-03-01,437.916992,439.653015,432.319,435.122986,435.122986,74895800.0


In [5]:
# Display features in data set (the first CSV column was loaded as the index, so it is not listed here)
data.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [6]:
# Keep only the closing price, then turn the date index into a regular column
# so the fresh 0..n-1 integer index can stand in for time in the later analysis
df = data.reindex(columns=['Close']).reset_index()

In [7]:
# Preview the first rows of the reduced frame (date column plus Close)
df.head()

Unnamed: 0,Date,Close
0,2016-02-26,432.152008
1,2016-02-27,432.519012
2,2016-02-28,433.503998
3,2016-02-29,437.696991
4,2016-03-01,435.122986


In [8]:
# Check data types in columns; the non-null counts also reveal columns with missing values
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1828 entries, 0 to 1827
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1828 non-null   datetime64[ns]
 1   Close   1824 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 28.7 KB


In [9]:
# True if at least one cell anywhere in the frame is missing
df.isna().to_numpy().any()

True

In [10]:
# Drop every row that contains a missing value. The index is not reset here, so the
# remaining rows keep their original integer labels (later cells use df.index as the time axis).
df=df.dropna()

In [11]:
# Confirm that no missing values remain after dropping the incomplete rows
df.isna().to_numpy().any()

False

Explore the Data
Simply plotting the closing price against time already shows the price movement: the price generally increases over the period, and as a first approximation we can assume that this trend is linear.

In [None]:
# Date-aware tick locators and formatters
import matplotlib.dates as mdates

# One major tick per calendar year, labelled with the four-digit year
years = mdates.YearLocator()
yearsFmt = mdates.DateFormatter('%Y')

# Draw the closing price against the date on a dedicated axes
fig, ax = plt.subplots()
ax.plot(df['Date'], df['Close'])

# Attach the yearly locator and formatter to the x axis
ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)

# Title and axis labels
ax.set_title('Close bitcoin Price History [2016 - 2021]', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Closing bitcoin Price in $', fontsize=14)

# Rotate and right-align the date labels so they do not overlap
fig.autofmt_xdate()

# Render the figure
plt.show()

Linear Regression
Our data contains only one **independent variable ($X$)**, which represents the *date*, and the **dependent variable ($Y$)** we are trying to predict is the *Bitcoin closing price*. To fit a line to the data points, which then represents an estimated relationship between $X$ and $Y$, we can use a Simple Linear Regression.

The best fit line can be described with
$$
Y = \beta_0 + \beta_1 X
$$

where

$Y$ is the predicted value of the dependent variable
$\beta_0$ is the y-intercept
$\beta_1$ is the slope
$X$ is the value of the independent variable
The goal is to find the coefficients $\beta_0$ and $\beta_1$ for which the Sum of Squared Errors (the sum of the squared differences between each point in the data set and its corresponding value predicted by the model) is minimal.


Training a Linear Regression Model

Train Test Split

In [None]:
# Import package for splitting data set
from sklearn.model_selection import train_test_split

In [None]:
# Split data into train and test set: 80% / 20%.
# The rows are shuffled before splitting; a fixed random_state makes that shuffle
# reproducible, so the fitted coefficients and all metrics below are identical
# on every Restart & Run All.
train, test = train_test_split(df, test_size=0.20, random_state=42)


In [None]:
# Import package for linear model
from sklearn.linear_model import LinearRegression

In [None]:
# Feature matrix: the integer row labels as a single-column 2D array, the shape .fit() expects
X_train = train.index.to_numpy().reshape(-1, 1)
# Target vector: the closing price of the same training rows
y_train = train['Close']


In [None]:
# Create the LinearRegression estimator and fit it on the training set in one step;
# fit() returns the estimator, which is left as the last expression so the cell displays it
model = LinearRegression().fit(X_train, y_train)
model

Create and Train the Model

Model Evaluation

In [None]:
# The coefficient: squeeze coef_ down to its single entry and convert it to a Python
# scalar with .item() (np.asscalar no longer exists in current NumPy releases)
print('Slope: ', np.squeeze(model.coef_).item())
# The intercept
print('Intercept: ', model.intercept_)

Interpreting the coefficients:

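The slope coefficient tells us that with a 1-unit (one-day) increase in the integer date the closing price increases by the slope printed above (0.0276 $ in the original run; re-check this figure against the current output, because it depends on the train/test split).
The intercept coefficient is the closing price the model predicts at integer date zero, i.e. at the first row of the data set; it is not the actual price observed on that day.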

In [None]:

# Train set graph: actual prices as points with the fitted regression line on top
fig = plt.figure(1, figsize=(16,10))
ax = fig.gca()
ax.set_title('Linear Regression | Price vs Time')
ax.set_xlabel('Integer Date')
ax.set_ylabel('Bitcoin Price')
ax.scatter(X_train, y_train, edgecolor='w', label='Actual Price')
ax.plot(X_train, model.predict(X_train), color='r', label='Predicted Price')
ax.legend()
plt.show()

Prediction from our Model

In [None]:
# Held-out features and targets, shaped the same way as the training arrays
X_test = test.index.to_numpy().reshape(-1, 1)
y_test = test['Close']

In [None]:
# Generate array with predicted values for the held-out test rows
y_pred = model.predict(X_test)

Regression Evaluation
Let's have a look at how the predicted values compare with the actual values on a random sample from our data set.

In [None]:
# Predict a price for every row of df (training and test rows alike)
# and store it in a new Prediction column next to the actual Close
all_dates = df.index.to_numpy().reshape(-1, 1)
df['Prediction'] = model.predict(all_dates)

In [None]:

# Get number of rows in data set for random sample (shape is a (rows, columns) tuple)
df.shape

In [None]:
# Draw up to 25 distinct row labels for the side-by-side comparison.
# The labels are drawn from df.index itself, without replacement and from a seeded
# generator, so every draw matches an existing row and the sample is reproducible.
# (The previous hard-coded upper bound of 2550 lay beyond the last index label,
# so part of the draws selected nothing, and repeated draws shrank the sample further.)
sample_size = min(25, len(df))
randints = np.random.default_rng(42).choice(df.index.to_numpy(), size=sample_size, replace=False)

# Select row numbers == random numbers
df_sample = df[df.index.isin(randints)]

In [None]:

# Preview the first rows of the random sample
df_sample.head()

In [None]:
# Create subplots to plot graph and control axes
fig, ax = plt.subplots()
# Grouped bars: actual Close next to the model's Prediction for each sampled date
df_sample.plot(x='Date', y=['Close', 'Prediction'], kind='bar', ax=ax)

# Set figure title
plt.title('Comparison Predicted vs Actual Price in Sample data selection', fontsize=16)

# Set x label
plt.xlabel('Date', fontsize=14)

# Set y label (the series is the Bitcoin close, consistent with the other figures)
plt.ylabel('Bitcoin Price in $', fontsize=14)

# Show plot
plt.show()

In [None]:
# Plot fitted line against the held-out test points
plt.figure(1, figsize=(16,10))
plt.title('Linear Regression | Price vs Time')
plt.plot(X_test, model.predict(X_test), color='r', label='Predicted Price')
plt.scatter(X_test, y_test, edgecolor='w', label='Actual Price')

plt.xlabel('Integer Date')
plt.ylabel('Bitcoin Price in $')

# The label= arguments above only become visible once the legend is drawn
plt.legend()
plt.show()

In [None]:
# Each point pairs an actual test price (x axis) with the model's prediction (y axis)
ax = plt.gca()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Predicted vs Actual Price')
plt.show()

Residual Histogram
The residuals are nearly normally distributed around zero, with a slight skewness to the right.

In [None]:
import seaborn as sns

In [None]:
# Import norm package to plot normal distribution
from scipy.stats import norm

# Residuals of the test set, computed once and reused below
residuals = y_test - y_pred

# Fit a normal distribution to the residuals (returns its mean and standard deviation)
mu, std = norm.fit(residuals)

# Density-scaled histogram with a KDE curve. sns.histplot replaces the deprecated
# sns.distplot; stat='density' keeps the bars on the same scale as the pdf drawn below.
ax = sns.histplot(residuals, kde=True, stat='density', label='Residual Histogram & Distribution')

# Calculate the pdf over the observed range of residuals
x = np.linspace(residuals.min(), residuals.max(), 100)
p = norm.pdf(x, mu, std)

# And plot on the same axes that seaborn put the histogram
ax.plot(x, p, 'r', lw=2, label='Normal Distribution')

plt.legend()
plt.show()

In [None]:
# Preview df again; it now also carries the Prediction column added earlier
df.head()

Error Evaluation Metrics
Mean Absolute Error (MAE) is the mean of the absolute value of the errors:$$
\frac{1}{N} \sum_{i = 1}^{N} |y_i - \hat{y}_i|
$$

Mean Squared Error (MSE) is the mean of the squared errors:$$
\frac{1}{N} \sum_{i = 1}^{N} (y_i - \hat{y}_i)^2
$$

Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:$$
\sqrt{\frac{1}{N} \sum_{i = 1}^{N} (y_i - \hat{y}_i)^2}
$$

All of these are cost functions we want to minimize.

In [None]:
# Import metrics package from sklearn for statistical analysis
from sklearn import metrics

In [None]:
# Statistical summary of the closing price over the whole cleaned data set
# (df holds both the training and the test rows), used to put the error sizes in context
df['Close'].describe()

In [None]:
# Compute MAE and MSE on the test set once; RMSE is the square root of the MSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

The MAE is 3% (of minimum) and 6% (of maximum) of the Closing Price.
The other two errors are larger, because the errors are squared and have therefore a greater influence on the result.
Accuracy Evaluation Metrics
To see how accurate our model is, we can calculate the coefficient of determination, $R^2$, which is the proportion of the total variation in the dependent variable that is explained by our model. On the training data its value lies between 0 and 1, with 1 meaning that 100% of the variation is accounted for by the model; on held-out data it can also become negative when the model predicts worse than the mean.

Coefficient of determination$$
R^2 = 1 - \frac{RSS}{TSS}
$$

with

Residual Sum of Squares (RSS)$$
RSS = \sum_{i = 1}^{N} \epsilon_i^2 = \sum_{i = 1}^{N} (y_i - \hat{y}_i)^2
$$

Total Sum of Squares (TSS)$$
TSS = \sum_{i = 1}^{N} (y_i - \bar{y})^2
$$

In [None]:
# Coefficient of determination of the predictions on the test set
print('R2: ', metrics.r2_score(y_test, y_pred))

In [None]:
# Explained variance score of the test-set predictions, shown as the cell output
from sklearn.metrics import explained_variance_score
explained_variance_score(y_test, y_pred)

The value of $R^2$ shows that our model explains nearly 41% of the variance in the actual bitcoin prices of the test set.