# AC109 Project Modeling Results: Predicting the returns on Cryptocurrencies

by Ali Dastjerdi, Angelina Massa, Sachin Mathur & Nate Stein

### Supporting Libraries

We outsourced some of the supporting code to other modules we wrote located in the main directory with the intent of having this notebook focus on the presentation of results. The supporting modules are:
- `crypto_utils.py` contains the code we used to scrape and clean data from coinmarket.cap. It also contains the code used to wrangle/preprocess that data (saved in CSV files) into our design matrix. By separating the creation of the design matrix in its own `.py` file, we were also able to create unit tests to ensure the resulting figures match what we expected based on hand-calculated figures, which became increasingly important as we engineered more involved features.
- `crypto_models.py` contains the code we used to iterate over multiple classification and regression models and summarize the results in tabular form.

In [1]:
import crypto_utils as cryp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
import time as time

from crypto_utils import fmt_date, print_update

In [2]:
# Custom output options.

np.set_printoptions(precision=4, suppress=True)
pd.set_option('display.precision', 4)
sns.set_style('white')
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 14
plt.rcParams['legend.fontsize'] = 12
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0.05
%matplotlib inline

In [3]:
RAND_STATE = 88

## Construct Design Matrix

We want the construction of the design matrix to be agile enough to allow us to easily change whether we include certain features, which cryptocurrency's price return we want to forecast, etc.

In [18]:
# We will not be using any non-crypto assets at the moment.

def get_data(x_cryptos, y_crypto, test_size, kwargs):
    design = cryp.DesignMatrix(x_cryptos=x_cryptos, y_crypto=y_crypto, **kwargs)
    X, Y = design.get_data(lag_indicator=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=RAND_STATE)
    return X_train, X_test, y_train, y_test

In [5]:
crypto_scope = ['ltc', 'xrp', 'xlm', 'eth', 'btc']

# Modeling

In [6]:
import scipy.stats as stats
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

In [7]:
N_CROSSVAL = 3
TEST_SIZE = 0.2

### Baseline Model

In [19]:
def evaluate_baseline_model(x_cryptos, y_crypto, kwargs):
    """Return MAE on test set."""
    X_train, X_test, y_train, y_test = get_data(x_cryptos, y_crypto, TEST_SIZE,
                                                kwargs)
    lr = LinearRegression().fit(X_train, y_train)
    mae = metrics.mean_absolute_error(y_test, lr.predict(X_test))
    return mae

#### Determine optimal rolling window for measuring changes in price and volume

Ultimately we want to determine which `n_rolling_volume`, `n_rolling_price` and `n_std_window` to use going forward, as it will influence our more advanced features.

In [29]:
# DataFrame to store results of trying different windows.

df_results = pd.DataFrame(columns=['y', 'mae', 'n_rolling_price', 
                                   'n_rolling_volume', 'n_std_window'])

params = {'n_rolling_price':None, 'n_rolling_volume':None,
          'x_assets':[], 'n_std_window':None}

n_rolling_prices = range(1, 5)
n_rolling_volumes = range(1, 5)
n_std_windows = range(5, 50, 5)

combo_total = len(n_rolling_prices) * len(n_rolling_volumes) * len(n_std_windows)
combo_count = 0

t0 = time.time()
for n_price in n_rolling_prices:
    for n_vol in n_rolling_volumes:
        for n_std in n_std_windows:
            combo_count += 1
            print_update('Trying param combination {}/{}...'.format(
                combo_count, combo_total))
            params['n_rolling_price'] = n_price
            params['n_rolling_volume'] = n_vol
            params['n_std_window'] = n_std
            new_row = {'n_rolling_price': n_price,
                       'n_rolling_volume': n_vol,
                       'n_std_window': n_std}
            for y_cryp in crypto_scope:
                x_cryps = [c for c in crypto_scope if c != y_cryp]
                new_row['y'] = y_cryp
                new_row['mae'] = evaluate_baseline_model(x_cryps, y_cryp, 
                                                         params)
                df_results = df_results.append(new_row, ignore_index=True)
print_update('Finished all parameter combinations in {:.2f} seconds.'.format(
    time.time() - t0))

Finished all parameter combinations in 1071.28 seconds.

In [32]:
avg_results = df_results.groupby(['n_rolling_price', 'n_rolling_volume', 'n_std_window']).mean()

After evaluating the results, we can determine that the optimal parameters are:
- `n_rolling_price`: 1
- `n_rolling_volume`: 1
- `n_std_window`: 10

### Introduce Regularization

In [10]:
def evaluate_lasso(x_cryptos, y_crypto, output=True):
    X_train, X_test, y_train, y_test = get_data(x_cryptos, y_crypto, TEST_SIZE)
    lasso = LassoCV(n_alphas=100, cv=N_CROSSVAL, random_state=RAND_STATE)
    lasso.fit(X_train, y_train)
    mae = metrics.mean_absolute_error(y_test, lasso.predict(X_test))
    if output:
        print('\t{0}: {1:.2%} (alpha={2:.2f})'.format(y_crypto, mae, 
                                                      lasso.alpha_))
    return lasso

In [11]:
print('MAE for Lasso Predicting Price Change for:')
for y_cryp in crypto_scope:
    x_cryps = [c for c in crypto_scope if c != y_cryp]
    evaluate_lasso(x_cryps, y_cryp)

MAE for Lasso Predicting Price Change for:
	ltc: 4.08% (alpha=0.00)
	xrp: 5.40% (alpha=0.03)
	xlm: 5.49% (alpha=0.03)
	eth: 4.59% (alpha=0.01)
	btc: 2.98% (alpha=0.00)


In [12]:
def get_features_df(lasso, X_train):
    df = pd.DataFrame(columns=['coeff', 'weight'])
    df['coeff'] = X_train.columns.tolist()
    df['weight'] = lasso.coef_
    df.sort_values('weight', ascending=False, inplace=True)
    df.set_index('coeff', inplace=True, drop=True)
    return df

In [13]:
# See what weights are assigned to features.

lasso = evaluate_lasso(['ltc', 'xrp', 'xlm', 'eth'], 'btc', output=False)
feature_weights = get_features_df(lasso, X_train)
display(feature_weights)

NameError: name 'lasso' is not defined

In [None]:
mae_lasso = metrics.mean_absolute_error(y_test, lasso.predict(X_test))
print('Lasso model MAE: {:.2%}'.format(mae_lasso))

### XGBRegressor

In [None]:
def build_xgb_model(X_train, y_train):
    """Iterate over a hyperparameter space and return best model on a 
    validation set reserved from input training data.
    """
    # Define hyperparam space.
    expon_distr = stats.expon(0, 50)
    cv_params = {
        'n_estimators': stats.randint(4, 100),
        'max_depth': stats.randint(2, 100),
        'learning_rate': stats.uniform(0.05, 0.95),
        'gamma': stats.uniform(0, 10),
        'reg_alpha': expon_distr,
        'min_child_weight': expon_distr
    }

    # Iterate over hyperparam space.
    xgb = XGBRegressor(nthreads=-1)  # nthreads=-1 => use max cores
    
    print_update('Tuning XGBRegressor hyperparams...')
    t0 = time.time()
    gs = RandomizedSearchCV(xgb, cv_params, n_iter=400, n_jobs=1, cv=3, 
                            random_state=88)
    gs.fit(X_train, y_train)
    print_update('Finished tuning XGBRegressor in {:.0f} secs.'.format(
        time.time() - t0))
    
    return gs.best_estimator_

In [None]:
xgb = build_xgb_model(X_train, y_train)

In [None]:
mae_xgb = metrics.mean_absolute_error(y_test, xgb.predict(X_test))
print('XGBRegressor MAE: {:.2%}'.format(mae_xgb))

In [None]:
1e-1