# Early-stage modeling

By: Jared Berry

In [1]:
# Import necessary libraries for data preparation/EDA
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
from scipy import stats

%matplotlib inline

sns.set_style('darkgrid')

## Set-up & data preparation

In [2]:
# Ensure working directory is set appropriately - change as needed
os.chdir('C:\\Users\\jared\\Documents\\data_science\\school\\georgetown\\capstone\\Passive-Stock-Fund-Optimization')

In [3]:
# Load datasets into memory
yahoo = pd.read_csv('stock_price_until_2019_04_28.csv')
drvd = pd.read_csv('moving-avg-momentum.csv')
simfin = pd.read_csv('simfin\daily_simfin.csv')

In [4]:
# Check dimensions
print("Yahoo! Finance data has {} observations and {} features.".format(yahoo.shape[0], yahoo.shape[1]))
print("Derived data has {} observations and {} features.".format(drvd.shape[0], drvd.shape[1]))
print("Daily SimFin data has {} observations and {} features.".format(simfin.shape[0], simfin.shape[1]))

Yahoo! Finance data has 1014554 observations and 22 features.
Derived data has 1014554 observations and 20 features.
Daily SimFin data has 1509516 observations and 53 features.


In [5]:
# Check keys
print("Yahoo! Finance:")
print(yahoo[['Symbol', 'date_of_transaction']].head())
print("Derived (Yahoo! Finance)")
print(drvd[['Symbol', 'Date']].head())
print("SimFin:")
print(simfin[['ticker', 'date']].head())

Yahoo! Finance:
  Symbol date_of_transaction
0    ABT          2011-01-03
1    ABT          2011-01-04
2    ABT          2011-01-05
3    ABT          2011-01-06
4    ABT          2011-01-07
Derived (Yahoo! Finance)
  Symbol        Date
0      A  2011-01-03
1      A  2011-01-04
2      A  2011-01-05
3      A  2011-01-06
4      A  2011-01-07
SimFin:
  ticker        date
0    MMM  2011-03-31
1    MMM  2011-04-01
2    MMM  2011-04-02
3    MMM  2011-04-03
4    MMM  2011-04-04


#### Merge preparation

In [6]:
# Some quick fixes on keys
yahoo['ticker'] = yahoo['Symbol']
yahoo.drop('Symbol', axis=1, inplace=True)

drvd['ticker'] = drvd['Symbol']
drvd['date_of_transaction'] = drvd['Date']
drvd.drop(['Symbol', 'Date', 'High', 'Low', 
           'Open', 'Close', 'Volume', 'AdjClose'], 
          axis=1, inplace=True)

simfin['date_of_transaction'] = simfin['date']
simfin.drop('date', axis=1, inplace=True)

In [7]:
# Construct the 'train' dataset by merging stock prices and fundamentals; ensure proper sorting and filter on early sample
train = pd.merge(yahoo, drvd, on=['ticker', 'date_of_transaction'])
train = pd.merge(train, simfin, how='left', on=['ticker', 'date_of_transaction'])

train = train.sort_values(['ticker','date_of_transaction'])
train.head()

Unnamed: 0,sno,date_of_transaction,High,Low,Open,Close,Volume,AdjClose,Year,Month,...,total_assets,total_current_assets,total_current_liabilities,total_equity,total_liabilities,total_liabilities_equity,total_noncurrent_assets,total_noncurrent_liabilities,common_outstanding_basic,common_outstanding_diluted
22334,22334,2011-01-03,30.143061,29.620888,29.728184,29.957081,4994000.0,27.591616,2011,1,...,,,,,,,,,,
22335,22335,2011-01-04,30.114449,29.456366,30.035765,29.678112,5017200.0,27.334681,2011,1,...,,,,,,,,,,
22336,22336,2011-01-05,29.849785,29.32761,29.513592,29.613733,4519000.0,27.275387,2011,1,...,,,,,,,,,,
22337,22337,2011-01-06,29.928469,29.477825,29.592276,29.670958,4699000.0,27.328091,2011,1,...,,,,,,,,,,
22338,22338,2011-01-07,29.899857,29.356224,29.699572,29.771101,3810900.0,27.420322,2011,1,...,,,,,,,,,,


#### Feature engineering 

In [8]:
# Construct some aggregate financial ratios from the SimFin data
train['eps'] = train['net_income_y'] / train['common_outstanding_basic']
train['pe_ratio'] = train['AdjClose'] / train['eps']
train['debt_ratio'] = train['total_liabilities'] / train['total_equity']
train['debt_to_equity'] = train['total_liabilities'] / train['total_equity']
train['roa'] = train['net_income_y'] / train['total_assets']

In [9]:
# Construct some additional ticker-level returns features
train['open_l1'] = train.groupby('ticker')['Open'].shift(1)
train['open_l5'] = train.groupby('ticker')['Open'].shift(5)
train['open_l10'] = train.groupby('ticker')['Open'].shift(10)

train['return_prev1_open_raw'] = 100*(train['Open'] - train['open_l1'])/train['open_l1']
train['return_prev5_open_raw'] = 100*(train['Open'] - train['open_l5'])/train['open_l5']
train['return_prev10_open_raw'] = 100*(train['Open'] - train['open_l10'])/train['open_l10']

train['close_l1'] = train.groupby('ticker')['AdjClose'].shift(1)
train['close_l5'] = train.groupby('ticker')['AdjClose'].shift(5)
train['close_l10'] = train.groupby('ticker')['AdjClose'].shift(10)

train['return_prev1_close_raw'] = 100*(train['AdjClose'] - train['close_l1'])/train['close_l1']
train['return_prev5_close_raw'] = 100*(train['AdjClose'] - train['close_l5'])/train['close_l5']
train['return_prev10_close_raw'] = 100*(train['AdjClose'] - train['close_l10'])/train['close_l10']

In [10]:
# Remove the quarter of pre-SimFin data
train = train[train['date_of_transaction'] >= '2011-03-31'].reset_index().drop('index', axis=1)

#### Target generation

In [49]:
# Specify ranges
n = 21 # n-day ahead return
q = 21 # q-day window

In [50]:
# At the ticker level, lead the AdjClose column by n-trading days
target_gen = train[['ticker', 'date_of_transaction', 'AdjClose']]
AdjClose_ahead = target_gen.groupby('ticker')['AdjClose'].shift(-n)
AdjClose_ahead.name = 'AdjClose_ahead'

The raw, month-ahead return is calculated as:
$$target_{t,i} = \frac{AdjClose_{t+n,i} - AdjClose_{t,i}}{AdjClose_{t,i}}$$

In [51]:
target_raw = np.array(100*((AdjClose_ahead - train['AdjClose'])/train['AdjClose']))

In [52]:
# Computing all of the returns for the next 21 days (month) relative to today
aheads = []
for i in range(0,n+1):
    AdjClose_ahead_i = target_gen.groupby('ticker')['AdjClose'].shift(-i)
    aheads.append(np.array(100*((AdjClose_ahead_i - train['AdjClose'])/train['AdjClose'])))

The average, raw returns for all periods within the next month (relative to today) is calculated as:
$$target_{t,i} = (\frac{1}{n})\sum_{k=1}^n \frac{AdjClose_{t+k,i} - AdjClose_{t,i}}{AdjClose_{t,i}}$$

In [44]:
target_cma = np.array(pd.DataFrame(aheads).mean(axis=0, skipna=False).tolist())

The w-day moving average of n-day ahead raw returns, relative to today is calculated as:
$$target_{t,i,q} = \frac{AdjClose_{t+n,i} - AdjClose_{t,i}}{AdjClose_{t,i}}, MA(q)$$

In [53]:
# Simpler moving-average ahead target?
target_gen['returns_ahead'] = 100*((AdjClose_ahead - train['AdjClose'])/train['AdjClose'])
target_ma = np.array(target_gen.groupby('ticker')['returns_ahead'].rolling(q).mean()) # NEED TO CHECK THE LAG STRUCTURE HERE

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
# Construct market-residualized variants (NEED S&P 500 OR CONSTRUCTION OF AN INDEX FROM CURRENT FEATURES)

## Exporatory Data Analysis

#### Feature selection

In [18]:
# Set a feature selection list (THINK ABOUT INFORMING THIS SELECTION WITH SHRINKAGE METHODS, I.E. RIDGE REGRESSION)
features = ['High', 'Low', 'Open', 'Close', 'Volume', 'AdjClose', 'Year',
            'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Pct_Change_Daily',
            'Pct_Change_Monthly', 'Pct_Change_Yearly', 'RSI', 'Volatility',
            'Yearly_Return_Rank', 'Monthly_Return_Rank', 'Pct_Change_Class',
            'Rolling_Yearly_Mean_Positive_Days', 'Rolling_Monthly_Mean_Positive_Days', 
            'Rolling_Monthly_Mean_Price', 'Rolling_Yearly_Mean_Price',
            'open_l1', 'open_l5', 'open_l10', 'close_l1', 'close_l5', 'close_l10',
            'return_prev1_open_raw', 'return_prev5_open_raw', 'return_prev10_open_raw',
            'return_prev1_close_raw', 'return_prev5_close_raw', 'return_prev10_close_raw',
            'pe_ratio', 'debt_ratio', 'debt_to_equity', 'roa'
            ]

In [19]:
# Select on features to pass to modeling machinery
X = train[features]

In [20]:
# Create a list of the tickers
tickers = train['ticker'].unique().tolist()

## Modeling

#### Set-up

In [21]:
# Set relevant scikit-learn functions/modules

# Tests for stationarity 
from statsmodels.tsa.stattools import adfuller

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# LightGBM
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

# Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Evaluation
from sklearn import metrics

In [30]:
# From Wheat Classification notebook 
def fit_sklearn_classifier(X, y, holdout, ticker, ema_gamma, valid_splits, model, label, param_search={}, export=False,
                           panel=False, groups=None, **kwargs):
    """
    Flexible function for fitting any number of sci-kit learn
    classifiers, with optional grid search.
    """
    start = time.time()
    # Convert to NumPy arrays - store feature names
    feature_names = X.columns.tolist()
    features = np.array(X)
    test_features = np.array(holdout)
    labels = y.copy()
    
    # Compute EMA smoothing of target prior to constructing classes
    EMA = 0
    gamma_ = ema_gamma
    for ti in range(len(labels)):
        EMA = gamma_*labels[ti] + (1-gamma_)*EMA
        labels[ti] = EMA  
        
    labels_smoothed = np.where(labels >= 0, 1, 0)    
    orig_labels = np.where(y.copy() >= 0, 1, 0)
    print("\n{} labels changed by smoothing.".format(np.sum(labels_smoothed != orig_labels)))

    # Compute some baselines
    all_pos_acc = np.mean(np.ones(test_features.shape[0], dtype='int') == labels_smoothed)
    random_walk_classes = np.array(pd.Series(labels_smoothed).shift(1).tolist()[1:], dtype="int") # Inappropriate for panel
    rw_acc = np.mean(random_walk_classes == np.array(labels_smoothed[1:], dtype="int"))
    print("Baseline accuracy is: {}%".format(100*np.round(all_pos_acc, 2)))
    print("RW accuracy is {}%".format(100*np.round(rw_acc, 2)))
    
    # Set time-series, cross-validation indices
    if panel:
        splits = PanelSplit(n_folds=valid_splits, groups=groups)
        search_splits = PanelSplit(n_folds=valid_splits, groups=groups)
    else:
        splits = TimeSeriesSplit(n_splits=valid_splits).split(X)
        search_splits = TimeSeriesSplit(n_splits=valid_splits).split(X)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Dictionary of lists for recording validation and training scores
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}
    
    # Perform a grid-search on the provided parameters to determine best options
    if param_search:
        print("Performing grid search for hyperparameter tuning")
        gridsearch = GridSearchCV(estimator=model(), 
                                  cv=search_splits,
                                  param_grid=param_search)

        # Fit to extract best parameters later
        gridsearch_model = gridsearch.fit(features, labels_smoothed)

    opt_thresholds = []; split_counter = 1
    for train_indices, valid_indices in splits:
        print("Training model on validation split #{}".format(split_counter))
        # Training data for the fold
        train_features, train_labels = features[train_indices], labels_smoothed[train_indices]
        #Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels_smoothed[valid_indices]
        
        expected = valid_labels
        
        # Generate a model given the optimal parameters established in grid search
        if param_search:
            estimator = model(**gridsearch_model.best_params_)
        else:
            estimator = model(**kwargs)
            
        # Train the estimator
        estimator.fit(train_features, train_labels)

        # Fit the fitted model on the test set and store positive class probabilities
        probs = estimator.predict_proba(valid_features)
        pos_probs = [p[1] for p in probs]

        # Dynamic classification threshold selection
        thresholds = list(np.arange(0.30, 0.90, 0.05))
        preds = [[1 if y >= t else 0 for y in pos_probs] for t in thresholds]
        scores_by_threshold_ = [metrics.f1_score(valid_labels, p) for p in preds]
        opt_thresh_ = thresholds[scores_by_threshold_.index(max(scores_by_threshold_))]
        opt_thresholds.append(opt_thresh_)

        # Generate class predictions
        predicted = [1 if y >= opt_thresh_ else 0 for y in pos_probs]

        # Append scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))
        
        # Store variable importances 
        if model in [RandomForestClassifier, GradientBoostingClassifier]:
            feature_importance_values += estimator.feature_importances_ / valid_splits
            
        # Iterate counter
        split_counter += 1
            
    # Properly format feature importances
    named_importances = list(zip(feature_names, feature_importance_values))
    sorted_importances = sorted(named_importances, key=lambda x: x[1], reverse=True)
    
    # Fit on full sample 
    if param_search:
        estimator = model(**gridsearch_model.best_params_)
    else:
        estimator = model(**kwargs)
        
    estimator.fit(features, labels_smoothed)
    
    # Test on hold out set
    out_of_sample_probs = estimator.predict_proba(holdout)
    pos_probs = [p[1] for p in out_of_sample_probs]
    predicted = [1 if y >= opt_thresh_ else 0 for y in pos_probs]

    # Store values for later reporting/use in app
    evals = {'precision':np.mean(scores['precision']), 
             'recall':np.mean(scores['recall']), 
             'accuracy':np.mean(scores['accuracy']), 
             'f1':np.mean(scores['f1']),
             'probabilities':pos_probs,
             'predictions':predicted,
             'importances':sorted_importances
             }
    
    # Report
    print("Build, hyperparameter selection, and validation of {} took {:0.3f} seconds\n".format(label, time.time()-start))
    print("Hyperparameters are as follows:")
    if param_search:
        for key in gridsearch_model.best_params_.keys():
            print("{}: {}\n".format(key, gridsearch_model.best_params_[key]))
    print("Validation scores are as follows:")
    print(pd.DataFrame(scores).mean())
    
    # Output the evals dictionary
    if export:
        outpath = "{}_{}.pickle".format(label.tolower().replace(" ", "_"), ticker.tolower())
        with open(outpath, 'wb') as f:
            pickle.dump(evals, f)

In [None]:
def fit_sklearn_regressor(X, y, holdout, ticker, ema_gamma, valid_splits, model, label, param_search={}, export=False, 
                                **kwargs):
    
    start = time.time()
    # Convert to NumPy arrays - store feature names
    feature_names = X.columns.tolist()
    features = np.array(X)
    test_features = np.array(holdout)
    returns = y.copy()
    
    # Compute EMA smoothing of target prior to constructing classes
    EMA = 0
    gamma_ = ema_gamma
    for ti in range(len(returns)):
        EMA = gamma_*returns[ti] + (1-gamma_)*EMA
        returns[ti] = EMA  
        
    returns_smoothed = returns.copy() 

    # Set time-series, cross-validation indices
    splits  = TimeSeriesSplit(n_splits=valid_splits)
    search_splits = TimeSeriesSplit(n_splits=valid_splits).split(features)
    
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])
    
    # Dictionary of lists for recording validation and training scores
    scores = {'mae':[], 'mse':[], 'r^2':[]}
    
    # Perform a grid-search on the provided parameters to determine best options
    if param_search:
        gridsearch = GridSearchCV(estimator=model(), 
                                  cv=search_splits,
                                  param_grid=param_search)

        # Fit to extract best parameters later
        gridsearch_model = gridsearch.fit(features, returns_smoothed)

    opt_thresholds = []
    for train_indices, valid_indices in splits.split(features):
        # Training data for the fold
        train_features, train_returns = features[train_indices], returns_smoothed[train_indices]
        #Validation data for the fold
        valid_features, valid_returns = features[valid_indices], returns_smoothed[valid_indices]
        
        expected = valid_returns
        
        # Generate a model given the optimal parameters established in grid search
        if param_search:
            estimator = model(**gridsearch_model.best_params_)
        else:
            estimator = model(**kwargs)
            
        # Train the estimator
        estimator.fit(train_features, train_returns)   
        
        # Generate predictions
        predicted = estimator.predict(valid_features)

        # Append scores to the tracker
        scores['mae'].append(metrics.mean_absolute_error(expected, predicted))
        scores['mse'].append(metrics.mean_squared_error(expected, predicted))
        scores['r^2'].append(metrics.r2_score(expected, predicted))
        
        # Store variable importances 
        if model in [RandomForestClassifier, GradientBoostingClassifier]:
            feature_importance_values += estimator.feature_importances_ / splits.n_splits
            
    # Properly format feature importances
    named_importances = list(zip(feature_names, feature_importance_values))
    sorted_importances = sorted(named_importances, key=lambda x: x[1], reverse=True)
    
    # Fit on full sample 
    if param_search:
        estimator = model(**gridsearch_model.best_params_)
    else:
        estimator = model(**kwargs)
        
    estimator.fit(features, returns_smoothed)
    
    # Test on hold out set
    predicted = estimator.predict(holdout)    
    
    # Store values for later reporting/use in app
    evals = {'mae':np.mean(scores['mae']), 
             'mse':np.mean(scores['mse']), 
             'r^2':np.mean(scores['r^2']),
             'predictions':predicted,
             'importances':sorted_importances
             }
    
    # Report
    print("Build, hyperparameter selection, and validation of {} took {:0.3f} seconds\n".format(label, time.time()-start))
    print("Hyperparameters are as follows:")
    if param_search:
        for key in gridsearch_model.best_params_.keys():
            print("{}: {}\n".format(key, gridsearch_model.best_params_[key]))
    print("Validation scores are as follows:")
    print(pd.DataFrame(scores).mean())
    
    # Output the evals dictionary
    if export:
        outpath = "{}_{}.pickle".format(label.tolower().replace(" ", "_"), ticker.tolower())
        with open(outpath, 'wb') as f:
            pickle.dump(evals, f)

In [34]:
def fit_lgbm_classifier(X, y, holdout, ticker, ema_gamma=1, valid_splits=12, export=False):
    
    # Convert to NumPy arrays - store feature names
    feature_names = X.columns.tolist()
    features = np.array(X)
    test_features = np.array(holdout)
    labels = y.copy()
    
    # Compute EMA smoothing of target prior to constructing classes
    EMA = 0
    gamma_ = ema_gamma
    for ti in range(len(labels)):
        EMA = gamma_*labels[ti] + (1-gamma_)*EMA
        labels[ti] = EMA  
        
    labels_smoothed = np.where(labels >= 0, 1, 0)
    orig_labels = np.where(y.copy() >= 0, 1, 0)
    print("\n{} labels changed by smoothing.".format(np.sum(labels_smoothed != orig_labels)))

    # Compute some baselines
    all_pos_acc = np.mean(np.ones(test_features.shape[0], dtype='int') == labels_smoothed)
    random_walk_classes = np.array(pd.Series(labels_smoothed).shift(1).tolist()[1:], dtype="int")
    rw_acc = np.mean(random_walk_classes == np.array(labels_smoothed[1:], dtype="int"))
    print("Baseline accuracy is: {}%".format(100*np.round(all_pos_acc, 2)))
    print("RW accuracy is {}%".format(100*np.round(rw_acc, 2)))
    
    ## pd.DataFrame(list(zip(labels_smoothed, y.copy()))).plot()

    # Generate splits
    splits = TimeSeriesSplit(n_splits=valid_splits)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    valid_accs = []
    train_accs = []
    
    # Iterate through each fold
    for train_indices, valid_indices in splits.split(X): 

        # Training data for the fold
        train_features, train_labels = features[train_indices], labels_smoothed[train_indices]
        #Validation data for the fold
        valid_features, valid_labels = features[valid_indices], labels_smoothed[valid_indices]

        # Create the bst
        bst = LGBMClassifier(n_estimators=10000, objective = 'binary', 
                             class_weight = 'balanced', learning_rate = 0.01,
                             max_bin = 25, num_leaves = 25, max_depth = 1,
                             reg_alpha = 0.1, reg_lambda = 0.1, 
                             subsample = 0.8, random_state = 101,
                             boosting = 'gbdt'
                            )

        # Train the bst
        bst.fit(train_features, train_labels, eval_metric = ['auc', 'binary_error'],
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 0)

        # Record the best iteration
        best_iteration = bst.best_iteration_

        # Record the feature importances
        feature_importance_values += bst.feature_importances_ / splits.n_splits
        
        # Make predictions
        test_predictions += bst.predict_proba(test_features, num_iteration = best_iteration)[:, 1] / splits.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = bst.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Record the best score
        valid_score = bst.best_score_['valid']['auc']
        train_score = bst.best_score_['train']['auc']
        valid_acc = bst.best_score_['valid']['binary_error']
        train_acc = bst.best_score_['train']['binary_error']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        valid_accs.append(valid_acc)
        train_accs.append(train_acc)
        
    # Properly format feature importances
    named_importances = list(zip(feature_names, feature_importance_values))
    sorted_importances = sorted(named_importances, key=lambda x: x[1], reverse=True)
        
    # Set up an exportable dictionary with results from the model
    results = {
        'train_auc':train_scores,
        'train_acc':train_acc,
        'validation_auc':valid_scores,
        'validation_accuracy':valid_acc,
        'valid_preds':out_of_fold,
        'feature_importances':sorted_importances,
        'test_predictions':test_predictions
    }
    
    # Output the results dictionary
    if export:
        outpath = "lgbc_{}.pickle".format(ticker.tolower())
        with open(outpath, 'wb') as f:
            pickle.dump(results, f)
    
    print("Average AUC across {} splits: {}".format(valid_splits, np.mean(valid_scores)))
    print("Average accuracy across {} splits: {}%".format(valid_splits, 100*np.round((1-np.mean(valid_acc)),2)))
    for i in range(6):
        print(sorted_importances[i])

In [37]:
def fit_lgbm_regressor(X, y, holdout, ticker, ema_gamma=1, valid_splits=12, export=False):
    
    # Convert to NumPy arrays - store feature names
    feature_names = X.columns.tolist()
    features = np.array(X)
    test_features = np.array(holdout)
    returns = y.copy()
    
    # Compute EMA smoothing of target prior to constructing classes
    EMA = 0
    gamma_ = ema_gamma
    for ti in range(len(returns)):
        EMA = gamma_*returns[ti] + (1-gamma_)*EMA
        returns[ti] = EMA  
        
    returns_smoothed = returns.copy()

    # Generate splits
    splits = TimeSeriesSplit(n_splits=valid_splits)

    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])
    
    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []
    
    # Iterate through each fold
    for train_indices, valid_indices in splits.split(X): 

        # Training data for the fold
        train_features, train_returns = features[train_indices], returns_smoothed[train_indices]
        #Validation data for the fold
        valid_features, valid_returns = features[valid_indices], returns_smoothed[valid_indices]

        # Create the bst
        bst = LGBMRegressor(n_estimators=10000, objective = 'regression', 
                            learning_rate = 0.01,
                            max_bin = 25, num_leaves = 25, max_depth = 1,
                            reg_alpha = 0.1, reg_lambda = 0.1, 
                            subsample = 0.8, random_state = 101,
                            boosting = 'gbdt'
                            )

        # Train the bst
        bst.fit(train_features, train_returns, eval_metric = ['l1', 'l2'],
                eval_set = [(valid_features, valid_returns), (train_features, train_returns)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100, verbose = 0)

        # Record the best iteration
        best_iteration = bst.best_iteration_

        # Record the feature importances
        feature_importance_values += bst.feature_importances_ / splits.n_splits
        
        # Make predictions
        test_predictions += bst.predict(test_features, num_iteration = best_iteration) / splits.n_splits

        # Record the out of fold predictions
        out_of_fold[valid_indices] = bst.predict(valid_features, num_iteration = best_iteration)

        # Record the best score
        valid_score = bst.best_score_['valid']['l1']
        train_score = bst.best_score_['train']['l1']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
    # Properly format feature importances
    named_importances = list(zip(feature_names, feature_importance_values))
    sorted_importances = sorted(named_importances, key=lambda x: x[1], reverse=True)
        
    # Set up an exportable dictionary with results from the model
    results = {
        'train_auc':train_scores,
        'validation_auc':valid_scores,
        'valid_preds':out_of_fold,
        'feature_importances':sorted_importances,
        'test_predictions':test_predictions
    }
    
    # Output the results dictionary
    if export:
        outpath = "lgbr_{}.pickle".format(ticker.tolower())
        with open(outpath, 'wb') as f:
            pickle.dump(results, f)
    
    print("\nAverage MAE across {} splits: {}".format(valid_splits, np.mean(valid_scores)))

#### Panel-level

Given that there are bound to be a number of systemic considerations that impact the price of a stock at any given point in time, it is prudent to perform and evaluate predictions across the panel of S&P 500 stocks in our sample, which will capture potential linkages between different stocks, and allow us to explore the possibility of using features generated from clustering to group like stocks in the panel

In [24]:
# Create copies for panel-level regressions
X_p = X.copy(deep=True)
y_p = target_raw.copy()

# Indexes of hold-out test data (the 21 days of data preceding the present day)
test_idx = np.where(np.isnan(y_p))[0].tolist()

# In order to ensure grouping is done properly, remove this data from a ticker-identification set as well
ticker_locs = train['ticker'].drop(train.index[test_idx]).reset_index().drop('index', axis=1)

In [25]:
def PanelSplit(n_folds, groups, grouping_var='ticker'):
    """
    Function to generate time series splits of a panel, provided
    a number of folds, and an indexable dataframe to create groups.
    Returns a generator object for compliance with sci-kit learn API.
    """
    by_ticker_index = (groups.groupby(grouping_var)
                       .apply(lambda x: x.reset_index(drop=True))
                       .drop(grouping_var, axis=1)
                       .reset_index()
                       .rename({'level_1':'tsidx'}, axis=1)
                       )
    
    ticker_range = by_ticker_index['tsidx'].unique().tolist()
    ticker_range = sorted(ticker_range)
    
    splits = TimeSeriesSplit(n_splits=n_folds)
    
    for train_indices, valid_indices in splits.split(ticker_range):
        panel_train_indices = by_ticker_index[by_ticker_index['tsidx'].isin(train_indices)].index.tolist()
        panel_valid_indices = by_ticker_index[by_ticker_index['tsidx'].isin(valid_indices)].index.tolist()
        yield panel_train_indices, panel_valid_indices

In [None]:
def WindowSplit(window, groups, panel):    
    """
    Function to generate windowed time series splits of a panel, provided
    a number of folds, and an indexable dataframe to create groups.
    Returns a generator object for compliance with sci-kit learn API.
    """    
    wparams = window.split(':')
    wtrain = int(wparams[0])
    wvalid = int(wparams[1])
    witer = int(wparams[2])
    
    by_ticker_index = (groups.groupby('ticker')
                       .apply(lambda x: x.reset_index(drop=True))
                       .drop('ticker', axis=1)
                       .reset_index()
                       .rename({'level_1':'tsidx'}, axis=1)
                       )
    
    ticker_range = by_ticker_index['tsidx'].unique().tolist()
    ticker_range = sorted(ticker_range)
    
    stop = 0
    start = (max(ticker_range) % witer) + 1
    if panel:
        while stop < max(ticker_range):
            train_indices = np.arange(start, start + wtrain).tolist()
            valid_indices = np.arange(start + wtrain, start + wtrain + wvalid).tolist()
            stop = max(valid_indices)
            start += witer
            yield train_indices, valid_indices
    else:
        while stop < max(ticker_range):
            train_indices = np.arange(start, start + wtrain).tolist()
            valid_indices = np.arange(start + wtrain, start + wtrain + wvalid).tolist()
            panel_train_indices = by_ticker_index[by_ticker_index['tsidx'].isin(train_indices)].index.tolist()
            panel_valid_indices = by_ticker_index[by_ticker_index['tsidx'].isin(valid_indices)].index.tolist()
            stop = max(valid_indices)
            start += witer
            yield panel_train_indices, panel_valid_indices      

In [26]:
# Simple feature-scaling - for now, replace missings with 0 (i.e. the mean of a normalized feature) within days
X_p = X_p.groupby(['Year', 'Month', 'Day']).apply(lambda x: (x - np.mean(x))/np.std(x)).fillna(0)

In [27]:
# Remove hold-out test data
y_p = np.delete(y_p, test_idx)
X_p_holdout = X_p.loc[X_p.index[test_idx]]
X_p = X_p.drop(X_p.index[test_idx])

In [28]:
y_p_smoothed = np.zeros(y_p.shape[0])
for t in tickers:
    idx = ticker_locs.loc[ticker_locs['ticker'] == t].index.tolist()
    y_to_smooth = y_p[idx]
    
    # Compute EMA smoothing of target within ticker
    EMA = 0
    gamma_ = 1
    for ti in range(len(y_to_smooth)):
        EMA = gamma_*y_to_smooth[ti] + (1-gamma_)*EMA
        y_to_smooth[ti] = EMA
        
    y_p_smoothed[idx] = y_to_smooth

In [None]:
# Fit and evaluate - gamma MUST be 1 here
## fit_lgbm_classifier(X_p, y_p_smoothed, X_p_holdout, ticker="", ema_gamma=1, valid_splits=12, export=False)
fit_sklearn_classifier(X_p, y_p_smoothed, X_p_holdout, ticker="", ema_gamma=1, valid_splits=6, model=KNeighborsClassifier,
                       label='kNN Classifier', param_search = {}, export=False, n_jobs=-1,
                       panel = False, groups=ticker_locs
                      )

#### Ticker-level 

At the heart of this analysis is a time-series prediction problem. As such, it is prudent to explore running models for each individual stock. We can envision averaging the results of both modeling approaches to incorporate the contribution of both into a final prediction.

In [58]:
for i, t in enumerate(tickers[:5]):
    
    # Pull only feature/target data for the relevant stocker
    X_t = X.loc[train['ticker'] == t,:]
    y_t = target_ma[train['ticker'] == t]
    
    # Indexes of hold-out test data (the 21 days of data preceding the present day)
    test_idx = np.where(np.isnan(y_t))[0].tolist()
    
    # Simple feature-scaling - for now, replace missings with 0 (i.e. the mean of a normalized feature)
    X_t = X_t.apply(lambda x: (x - np.mean(x))/np.std(x)).fillna(0)
    
    # Remove hold-out test data
    y_t = np.delete(y_t, test_idx)
    X_t_holdout = X_t.loc[X_t.index[test_idx]]
    X_t = X_t.drop(X_t.index[test_idx])
    
    # Fit and evaluate
    fit_lgbm_classifier(X_t, y_t, X_t_holdout, ticker=t, ema_gamma=1, valid_splits=12, export=False)
    ## fit_lgbm_regressor(X_t, y_t, X_t_holdout, ticker=t, ema_gamma=1, valid_splits=12, export=False)
    ## fit_sklearn_classifier(X_t, y_t, X_t_holdout, ticker=t, ema_gamma=1, valid_splits=12, model=KNeighborsClassifier,
    ##                        label='kNN Classifier', param_search = {'n_neighbors':[2,4,6]}, export=False, n_jobs=-1
    ##                      )
    ## fit_sklearn_regressor(X_t, y_t, X_t_holdout, ticker=t, ema_gamma=0.1, valid_splits=12, model=KNeighborsRegressor,
    ##                        label='kNN Regressor', param_search = {}, export=False, n_jobs=-1
    ##                       )


0 labels changed by smoothing.
Baseline accuracy is: 0.0%
RW accuracy is 97.0%




Average AUC across 12 splits: 0.6806807948107082
Average accuracy across 12 splits: 68.0%
('return_prev10_close_raw', 12.750000000000002)
('RSI', 11.250000000000002)
('Rolling_Monthly_Mean_Price', 9.0)
('roa', 4.083333333333334)
('return_prev5_close_raw', 0.8333333333333334)
('Rolling_Yearly_Mean_Price', 0.08333333333333333)

0 labels changed by smoothing.
Baseline accuracy is: 0.0%
RW accuracy is 97.0%




Average AUC across 12 splits: 0.7488962954117812
Average accuracy across 12 splits: 51.0%
('return_prev10_close_raw', 17.416666666666668)
('Monthly_Return_Rank', 13.083333333333332)
('Rolling_Yearly_Mean_Price', 8.0)
('return_prev5_close_raw', 6.25)
('Dayofyear', 6.166666666666667)
('Rolling_Monthly_Mean_Positive_Days', 4.083333333333334)

0 labels changed by smoothing.
Baseline accuracy is: 0.0%
RW accuracy is 97.0%




Average AUC across 12 splits: 0.6486199711340722
Average accuracy across 12 splits: 66.0%
('return_prev10_close_raw', 5.166666666666667)
('RSI', 4.916666666666665)
('roa', 2.3333333333333335)
('Week', 0.6666666666666667)
('Pct_Change_Monthly', 0.5)
('pe_ratio', 0.4166666666666667)

0 labels changed by smoothing.
Baseline accuracy is: 0.0%
RW accuracy is 98.0%




Average AUC across 12 splits: 0.7290411976556231
Average accuracy across 12 splits: 84.0%
('return_prev10_close_raw', 8.250000000000002)
('Monthly_Return_Rank', 5.583333333333333)
('Week', 0.75)
('RSI', 0.49999999999999994)
('High', 0.0)
('Low', 0.0)

0 labels changed by smoothing.
Baseline accuracy is: 0.0%
RW accuracy is 97.0%




Average AUC across 12 splits: 0.6512643093388416
Average accuracy across 12 splits: 72.0%
('RSI', 9.25)
('return_prev10_close_raw', 7.000000000000001)
('Rolling_Monthly_Mean_Price', 3.1666666666666665)
('Pct_Change_Monthly', 1.5833333333333333)
('Week', 0.75)
('Rolling_Yearly_Mean_Price', 0.16666666666666666)


## Benchmarking

It's essential to benchmark the performance of our models against some work-horse time-series forecasting models. Regardless of differential performance (to a point), what these models lack is the ability for us to understand *what* makes a model perform the way it does, and doesn't directly serve our objective of providing users with *relative probabilities* of performance.

In [None]:
def benchmark_arima(y, gamma, p, d, q):
    pass

In [None]:
def benchmark_prophet(y, gamma):
    pass

In [None]:
i = 1; t = tickers[i]#for i, t in enumerate(tickers[:5]):
    
# Pull only feature/target data for the relevant stocker
X_t = X.loc[train['ticker'] == t,:]
y_t = target_raw[train['ticker'] == t]

# Indexes of hold-out test data (the 21 days of data preceding the present day)
test_idx = np.where(np.isnan(y_t))[0].tolist()

# Simple feature-scaling - for now, replace missings with 0 (i.e. the mean of a normalized feature)
X_t = X_t.apply(lambda x: (x - np.mean(x))/np.std(x)).fillna(0)

# Remove hold-out test data
y_t = np.delete(y_t, test_idx)
X_t_holdout = X_t.loc[X_t.index[test_idx]]
X_t = X_t.drop(X_t.index[test_idx])