In [76]:
import datetime

import pandas as pd 
import talib as ta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from alpaca.data import CryptoBarsRequest, CryptoHistoricalDataClient
from alpaca.data.timeframe import TimeFrame, TimeFrameUnit

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

### The features we will use

In line with the Machine Learning for Trading book, we will use some traditional technical features as well as some lagged returns to go into our model. The list are: 
* RSI -> relative strength index 
* MACD -> Moving Average convergence divergence
* BBANDS -> Bollinger Bands
* Lagged Returns


In [3]:
data_request = CryptoBarsRequest(
    symbol_or_symbols='ETH/USD', 
    start = datetime.datetime(2021, 1, 1), 
    end = datetime.datetime(2023, 7, 1), 
    timeframe=TimeFrame(1, TimeFrameUnit.Hour)
)

crypto_client = CryptoHistoricalDataClient() 

eth_historical_bars = crypto_client.get_crypto_bars(request_params=data_request)

# Extracting the dataframe, processing, and saving down to csv file
eth_historical_bars_df = eth_historical_bars.df
data = eth_historical_bars_df.reset_index().drop('symbol', axis = 1).set_index('timestamp')

data.to_csv("ETH_historical_data.csv", index=True)

### Generating features

In [39]:
# Creating our RSI feature
data['rsi'] = ta.RSI(data['close'])

def compute_bollinger_bands(close, time_period: int = 20) -> pd.DataFrame: 

    high, mid, low = ta.BBANDS(close, timeperiod = time_period)
    return pd.DataFrame({'bb_high': high, 'bb_low': low}, index = close.index)

def compute_atr(data, time_period: int = 14): 

    df = ta.ATR(data.high, data.low, data.close, timeperiod = time_period)
    return df

In [40]:
# Creating the bollinger bands feature
data = data.join(compute_bollinger_bands(data.close))

In [41]:
data['bb_high'] = data['bb_high'].sub(data.close).div(data.bb_high).apply(np.log1p)
data['bb_low'] = data.close.sub(data.bb_low).div(data.close).apply(np.log1p)

In [50]:
data['atr'] = compute_atr(data)
data['macd'] = ta.MACD(data['close'])[0]

#### Adding lagged returns

In [52]:
lags = [6, 12, 24, 36]

returns = data.close.pct_change()

In [56]:
# We are going to get the geometric mean of the returns
for lag in lags: 
    data[f'Return_{lag}_periods'] = data.close.pct_change(lag).add(1).pow(1/lag).sub(1)

### Generating our target variable

Our target is whether or not the price of ETH is going to go up by 1%, down 1%, or flat over the next 12 hours. This may be useful for us if we would like to go short or long in some way in the future. Let's see if we can get these predicted. 

In [96]:
FORWARD_PERIOD = 12
forward_returns = data['close'].pct_change(FORWARD_PERIOD).shift(-FORWARD_PERIOD)

In [97]:
target_label = forward_returns.apply(lambda x: -1 if x < -0.01 else 1 if x > 0.01 else 0)
target_label.name = 'target'

In [98]:
data['target'] = target_label

In [99]:

data['target'].value_counts(normalize=True)

 0    0.368351
 1    0.330442
-1    0.301207
Name: target, dtype: float64

In [100]:
all_data = data.dropna()

In [101]:
df = all_data.drop(['open', 'high', 'low', 'close', 'volume', 'trade_count', 'vwap'], axis = 1)

target = df.pop('target')

In [102]:
# First we have to split this into trainnig and testing periods
features_train, features_test, target_train, target_test = train_test_split(df, target, test_size=0.2, shuffle=False)

In [103]:
scaler = StandardScaler()
scaler.fit(features_train)

features_train = scaler.transform(features_train)
features_test = scaler.transform(features_test)

In [116]:
# Then we can get the model 
log_reg_model = LogisticRegression(class_weight='balanced')
log_reg_model.fit(features_train, target_train)

In [117]:
log_reg_model.predict_proba(features_train)

array([[0.28173928, 0.34232689, 0.37593383],
       [0.27572579, 0.32566842, 0.3986058 ],
       [0.31218661, 0.30256858, 0.38524481],
       ...,
       [0.28411025, 0.46731952, 0.24857022],
       [0.28351014, 0.46893836, 0.2475515 ],
       [0.28877531, 0.46615008, 0.2450746 ]])

In [118]:
log_reg_model.score(features_train, target_train)

0.40006895759108146

In [119]:
log_reg_model.score(features_test, target_test)

0.5306826017007584

We messed around and made this model. Now, we are going to save down the predictions and use it in a backtrader strategy and see how things go. 

In [125]:
# Going to fit on the entire dataset now and see how this goes
model = LogisticRegression(class_weight='balanced')
model.fit(X = df, y = target)

model_prediction_probs = model.predict_proba(df)
prepared_data = data.join(pd.DataFrame(model_prediction_probs, columns = ['down', 'neutral', 'up'], index = df.index))
prepared_data.to_csv("model_prepared_data.csv", index=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [141]:
import pickle 

with open('log_reg_model_v1.pkl', 'wb') as f: 
    pickle.dump(model, f)

Here, we have set up a very simple logistic regression model which attempts to predict the direction of the next 12 hours of returns based on a magnitude. Next, we are going to take the predicted probabilities from this model and layer on trading logic using backtrader to see if this can be useful in any way. Our first backtesting of a machine learning model, let's get pumped. 

