# Stock Price Classification

Credits for inspiration for plot code:  
https://stackoverflow.com/questions/28200786/how-to-plot-scikit-learn-classification-report  
https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python  
https://stackoverflow.com/questions/29656550/how-to-plot-pr-curve-over-10-folds-of-cross-validation-in-scikit-learn

By: Jared Berry

In [None]:
# Quality of life
import os
import time
import warnings
from collections import defaultdict

# I/O and data structures
import pickle
import pandas as pd
import numpy as np

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

# Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

# Evaluation
from sklearn import metrics
import statsmodels.tsa.stattools as ts

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Notebook setup: IPython magics plus the seaborn plot theme.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the pycodestyle_magic IPython extension.
%load_ext pycodestyle_magic
# Use seaborn's 'darkgrid' style for all subsequent plots.
sns.set_style('darkgrid')

In [None]:
# Silence every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

## Set-up

#### Imports

In [None]:
# Import modeling helper functions
from modeling_funcs import *

In [None]:
# Load the pickled modeling dictionary from the working directory.
# NOTE: pickle executes arbitrary code on load - only open trusted files.
inpath = "model_dictionary.pickle"
with open(inpath, mode='rb') as handle:
    modeling = pickle.load(handle)

In [None]:
# Pull out the features dataframe stored under the 'features' key of the
# modeling dictionary; later cells filter and index it as `train`.
train = modeling['features']

In [None]:
# Remove tickers with fewer than five years' worth of data
# (252 rows per year is used as the trading-day count).
min_observations = 252 * 5

# Build a (ticker, count) frame without relying on the column names that
# `value_counts().reset_index()` happens to emit: those names changed between
# pandas 1.x ('index', 'ticker') and pandas 2.x ('ticker', 'count'), which
# made the previous rename-based approach produce duplicate 'count' columns.
ticker_counts = (train['ticker']
                 .value_counts()
                 .rename_axis('ticker')
                 .reset_index(name='count'))
keep_tickers = (ticker_counts
                .loc[ticker_counts['count'] >= min_observations, 'ticker']
                .tolist())

# Boolean row mask, reused later to filter the target in the same way
keep_idx = train['ticker'].isin(keep_tickers)
train = train[keep_idx]

#### Feature selection

In [None]:
# Columns of `train` passed to the models. The order is preserved downstream.
# TODO: consider informing this selection with shrinkage methods
# (e.g. lasso regression).
features = [
    'High', 'Low', 'Close', 'Volume', 'AdjClose',
    'Year', 'Month', 'Week', 'Day', 'Dayofyear',
    'Pct_Change_Monthly', 'Pct_Change_Yearly',
    'RSI', 'Volatility',
    'Yearly_Return_Rank', 'Monthly_Return_Rank',
    'Rolling_Yearly_Mean_Positive_Days',
    'Rolling_Monthly_Mean_Positive_Days',
    'Rolling_Monthly_Mean_Price', 'Rolling_Yearly_Mean_Price',
    'Momentum_Quality_Monthly', 'Momentum_Quality_Yearly',
    'SPY_Trailing_Month_Return',
    'open_l10', 'return_prev5_close_raw', 'return_prev10_close_raw',
    'pe_ratio', 'debt_ratio', 'roa',
    'beta',
]

In [None]:
# Feature matrix handed to the modeling machinery
X = train.loc[:, features]

# Distinct tickers, in order of first appearance, used to loop per stock
tickers = pd.unique(train['ticker']).tolist()

In [None]:
# Select the target and apply the same ticker filter that was applied
# to `train` above, so that rows stay aligned
target = modeling['target_21_rel_return'][keep_idx]

## Modeling

#### Panel-level

Many systemic factors affect the price of a stock at any given point in time, so it is prudent to fit and evaluate predictions across the full panel of S&P 500 stocks in our sample. A panel-level model can capture potential linkages between different stocks, and it allows us to explore the possibility of using features generated from clustering to group like stocks in the panel.

In [None]:
# Create a panel-level copy of the target
y_p = target.copy()

# Positional indexes of hold-out test data: rows whose target is missing
# (the 21 days of data preceding the present day)
test_idx = np.where(np.isnan(y_p))[0].tolist()

# Boolean mask over the rows of `train`, built from the positional indexes.
# Filtering by position (rather than translating positions to index labels
# and dropping those labels) stays correct even if the index is not unique.
is_holdout = np.zeros(len(train), dtype=bool)
is_holdout[test_idx] = True

# In order to ensure grouping is done properly, remove the hold-out rows from
# a ticker-identification set as well. `drop=True` discards the old index
# regardless of its name, leaving a clean 0..n-1 positional index.
ticker_locs = (train.loc[~is_holdout, ['ticker', 'date_of_transaction']]
               .reset_index(drop=True))

In [None]:
# Create a panel-level copy of the features, normalized within each calendar
# day (z-score across all tickers observed on that day).
#
# `transform` is used instead of `groupby.apply` because it always returns a
# frame aligned row-for-row with the input; the index/row order of `apply`
# results depends on the pandas version (`group_keys`), which would silently
# misalign the positional hold-out indexes used in later cells. Calling
# `np.mean`/`np.std` on a DataFrame is likewise version-dependent
# (`axis=None`), so the reductions are requested explicitly.
day_keys = ['Year', 'Month', 'Day']
value_cols = [col for col in X.columns if col not in day_keys]
by_day = X.groupby(day_keys)[value_cols]

day_mean = by_day.transform('mean')
# ddof=0 (population std) matches what np.std computed in the original code
day_std = by_day.transform('std', ddof=0)

# 0/0 (constant feature within a day, or a single-row day) yields NaN, which
# is replaced by 0, i.e. the mean of a normalized feature. The grouping
# columns themselves are not part of the output.
X_p = ((X[value_cols] - day_mean) / day_std).fillna(0)

In [None]:
# Remove hold-out test data.
# `test_idx` holds row *positions*, so select and remove by position instead
# of round-tripping through index labels (which over-selects or over-drops
# when labels repeat).
holdout_mask = np.zeros(len(X_p), dtype=bool)
holdout_mask[test_idx] = True

y_p = np.delete(y_p, test_idx)
X_p_holdout = X_p.iloc[test_idx]
X_p = X_p.loc[~holdout_mask]

In [None]:
# Exponential moving average (EMA) smoothing of the target, computed
# separately within each ticker. With gamma_ == 1 the recursion returns the
# input unchanged, so this cell is effectively a no-op at that setting.
gamma_ = 1
y_p_smoothed = np.zeros(y_p.shape[0])

for t in tickers:
    # Rows of this ticker; ticker_locs carries a 0..n-1 index, so its labels
    # double as positions into y_p
    rows = ticker_locs.index[ticker_locs['ticker'] == t].tolist()
    series = y_p[rows]

    # EMA_t = gamma * y_t + (1 - gamma) * EMA_{t-1}, with the recursion
    # started from 0
    running = 0
    for pos, observed in enumerate(series):
        running = gamma_*observed + (1-gamma_)*running
        series[pos] = running

    y_p_smoothed[rows] = series

In [None]:
# Panel-level LightGBM: fit on the smoothed target across all tickers and
# keep the result dictionary returned by the helper
model_dict = fit_lgbm_classifier(
    X_p,
    y_p_smoothed,
    X_p_holdout,
    ticker="",
    ema_gamma=1,
    n_splits=12,
    cv_method='ts',
    groups=ticker_locs,
    labeled=False,
    label="lgbm_final",
    param_search=None,
    holdout_method='distributed',
    threshold_search=True,
    export=True,
)

In [None]:
# Panel-level k-nearest-neighbours classifier. Note that this call uses the
# unsmoothed target `y_p` and overwrites `model_dict` from the previous cell.
model_dict = fit_sklearn_classifier(
    X_p,
    y_p,
    X_p_holdout,
    ticker="",
    ema_gamma=1,
    n_splits=12,
    cv_method='panel',
    model=KNeighborsClassifier,
    groups=ticker_locs,
    label='kNN Classifier',
    param_search=None,
    holdout_method='distributed',
    threshold_search=False,
    n_jobs=-1,
    export=True,
)

In [None]:
# Quick evaluation of the most recently fitted model, ignoring rows with
# split_number == 0
test = model_dict['preds_df']
test = test.loc[test['split_number'] != 0]

expected_labels = test['expected']
predicted_labels = test['predicted']
for report in (metrics.confusion_matrix,
               metrics.roc_auc_score,
               metrics.classification_report):
    print(report(expected_labels, predicted_labels))

#### Ticker-level 

At the heart of this analysis is a time-series prediction problem. As such, it is prudent to explore running models for each individual stock. We can envision averaging the results of both modeling approaches to incorporate the contribution of both into a final prediction.

In [None]:
# Set parameters (also used to name the export file in the next cell)
cv_method_ = 'tswindow'
label_ = 'lgbm_final'

results_dfs = []
# NOTE(review): only the first five tickers are fitted here, although the
# export file name says "all_tickers" - confirm whether the [:5] slice is
# left over from debugging.
for t in tickers[:5]:

    # Row mask for the relevant ticker, computed once and reused
    is_ticker = (train['ticker'] == t).to_numpy()

    # Pull only feature/target data for the relevant ticker
    X_t = X.loc[is_ticker, :].drop(['Year', 'Month', 'Day'], axis=1)
    y_t = np.array(target)[is_ticker]

    # Positional indexes of hold-out test data: rows with a missing target
    # (the 21 days of data preceding the present day)
    test_idx = np.where(np.isnan(y_t))[0].tolist()
    is_holdout = np.zeros(len(y_t), dtype=bool)
    is_holdout[test_idx] = True

    # Simple feature-scaling (population std) - for now, replace missings
    # with 0 (i.e. the mean of a normalized feature)
    X_t = X_t.apply(lambda col: (col - col.mean()) / col.std(ddof=0)).fillna(0)

    # Remove hold-out test data; positions are used directly rather than
    # being translated to index labels
    y_t = np.delete(y_t, test_idx)
    # First-difference the target, with 0 for the first observation.
    # NOTE(review): the RandomForest loop below does not difference its
    # target - confirm this transformation is intended here.
    y_t = pd.Series(y_t).diff().fillna(0).to_numpy()
    X_t_holdout = X_t.iloc[test_idx]
    X_t = X_t.loc[~is_holdout]

    # Fit and evaluate; cv_method comes from the parameter above so that the
    # exported file name cannot drift from the method actually used
    model_dict = fit_lgbm_classifier(X_t,
                                     y_t,
                                     X_t_holdout,
                                     ticker=t,
                                     ema_gamma=1,
                                     n_splits=12,
                                     cv_method=cv_method_,
                                     labeled=False,
                                     param_search=None,
                                     holdout_method='distributed',
                                     threshold_search=True,
                                     export=False)

    results_dfs.append(model_dict)

In [None]:
# Export ticker-level models as "<label>_all_tickers_<cv_method>.pickle".
# The middle fragment carries no trailing underscore: the format string
# already inserts the separators, so "all_tickers_" produced a double
# underscore in the file name.
model_outpath = "{}_{}_{}.pickle".format(slugify(label_), "all_tickers", cv_method_)
with open(model_outpath, 'wb') as f:
    pickle.dump(results_dfs, f)

In [None]:
# Set parameters (also used to name the export file in the next cell)
cv_method_ = 'ts'
label_ = 'RF Window'
model_ = RandomForestClassifier

results_dfs = []
for t in tickers:

    # Row mask for the relevant ticker, computed once and reused
    is_ticker = (train['ticker'] == t).to_numpy()

    # Pull only feature/target data for the relevant ticker
    X_t = X.loc[is_ticker, :].drop(['Year', 'Month', 'Day'], axis=1)
    y_t = np.array(target)[is_ticker]

    # Positional indexes of hold-out test data: rows with a missing target
    # (the 21 days of data preceding the present day)
    test_idx = np.where(np.isnan(y_t))[0].tolist()
    is_holdout = np.zeros(len(y_t), dtype=bool)
    is_holdout[test_idx] = True

    # Simple feature-scaling (population std) - for now, replace missings
    # with 0 (i.e. the mean of a normalized feature)
    X_t = X_t.apply(lambda col: (col - col.mean()) / col.std(ddof=0)).fillna(0)

    # Remove hold-out test data; positions are used directly rather than
    # being translated to index labels
    y_t = np.delete(y_t, test_idx)
    X_t_holdout = X_t.iloc[test_idx]
    X_t = X_t.loc[~is_holdout]

    # Fit and evaluate
    model_dict = fit_sklearn_classifier(X_t,
                                        y_t,
                                        X_t_holdout,
                                        ticker=t,
                                        ema_gamma=1,
                                        n_splits=36,
                                        cv_method=cv_method_,
                                        model=model_,
                                        label=label_,
                                        param_search=None,
                                        holdout_method='distributed',
                                        threshold_search=True,
                                        n_estimators=1000,
                                        export=False)

    results_dfs.append(model_dict)

In [None]:
# Persist the list of per-ticker result dictionaries; the file is named
# "<slugified label>_all_tickers_<cv method>.pickle"
model_outpath = "{}_{}_{}.pickle".format(
    slugify(label_),
    "all_tickers",
    cv_method_,
)
with open(model_outpath, mode='wb') as handle:
    pickle.dump(results_dfs, handle)

## Evaluation

#### Panel-level

In [None]:
# Set path to pickle file containing panel-level model results;
# the next cell opens this path and unpickles it into `results_df`
model_inpath = "lgbm_final_select_panel_ts.pickle"

In [None]:
# Load the panel-level results dictionary.
# NOTE: pickle executes arbitrary code on load - only open trusted files.
with open(model_inpath, mode='rb') as handle:
    results_df = pickle.load(handle)

In [None]:
# Per-row predictions of the panel-level model
ticker_performance = results_df['preds_df']

# Feature importances are optional: not every model stores them. When they
# are missing, bind an empty frame with the expected columns so that later
# cells do not silently reuse importances left over from a previous model.
try:
    raw_importances = results_df['feature_importances']
except KeyError:
    print("No variable importances for this model")
    feature_importances = pd.DataFrame(columns=['feature', 'importance'])
else:
    feature_importances = pd.DataFrame(raw_importances, columns=['feature', 'importance'])

#### Ticker-level

In [None]:
# Set path to pickle file containing ticker-level model results;
# the next cell opens this path and unpickles it into `results_dfs`
model_inpath = "lgbm_final_all_tickers_252_21_tswindow.pickle"

In [None]:
# Load the ticker-level results (iterated as one entry per model below).
# NOTE: pickle executes arbitrary code on load - only open trusted files.
with open(model_inpath, mode='rb') as handle:
    results_dfs = pickle.load(handle)

In [None]:
# Stand up results dataframes from the per-ticker result dictionaries
performance_dfs = []
feature_importance_dfs = []
holdout_predictions = defaultdict(list)

for r in results_dfs:
    preds = r['preds_df']
    performance_dfs.append(preds)

    # Feature importances are optional; models without them are skipped
    try:
        feature_importance_dfs.append(pd.DataFrame(r['feature_importances'], columns=['feature', 'importance']))
    except KeyError:
        print("No variable importances for this model")

    # Key the hold-out probabilities by the first ticker found in preds_df
    holdout_predictions[preds['ticker'].unique().tolist()[0]] = r['holdout_probs']

ticker_performance = pd.concat(performance_dfs, axis=0)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no model exposed feature importances
if feature_importance_dfs:
    feature_importances = pd.concat(feature_importance_dfs, axis=0)
else:
    feature_importances = pd.DataFrame(columns=['feature', 'importance'])

### Visualization

In [None]:
# Keep only rows that belong to a populated split; split_number == 0 marks
# training data that was never used for validation
populated = ticker_performance['split_number'].ne(0)
ticker_performance = ticker_performance.loc[populated]

In [None]:
# Mean importance per feature across all ticker-level models, sorted
# ascending by importance
average_importances = (feature_importances
                       .groupby('feature')
                       .mean()
                       .sort_values('importance'))

# Horizontal bar chart, saved to disk and then displayed
average_importances.plot(
    kind='barh',
    title="Feature Importances - Ticker",
    legend=False,
    figsize=(16,12),
)
plt.savefig(fname='varimp_tickers_252_63_final.jpg', pad_inches=0, bbox_inches='tight')
plt.show()

In [None]:
# ROC curve (with its AUC) over the pooled predictions in ticker_performance
fpr, tpr, threshold = metrics.roc_curve(
    ticker_performance['expected'],
    ticker_performance['predicted_prob'],
)
roc_auc = metrics.auc(fpr, tpr)

# Model curve first, then the dashed chance diagonal for reference
plt.plot(fpr, tpr, 'c', label = 'AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1],'k--')
plt.legend(loc = 'lower right')

plt.title('Receiver Operating Characteristic Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])

plt.savefig(fname='auc_ticker_lgbm_252_21_final.jpg', pad_inches=0, bbox_inches='tight')
plt.show()

In [None]:
# Precision-recall curve over the pooled predictions, annotated with the
# average precision score
y_true = ticker_performance['expected']
y_score = ticker_performance['predicted_prob']
precision, recall, _ = metrics.precision_recall_curve(y_true, y_score, pos_label=1)
average_precision = metrics.average_precision_score(y_true, y_score)

plt.plot(recall, precision, label='area = %0.2f' % average_precision, color="green")
plt.title('Precision Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")

plt.savefig(fname='prc_ticker_lgbm_252_126_final.jpg', pad_inches=0, bbox_inches='tight')
plt.show()

In [None]:
# Classification report rendered as a heat map: one row per class, one
# column per measure (precision, recall, f1-score)
fig, ax = plt.subplots(figsize=(12,8))

# precision_recall_fscore_support returns (precision, recall, fscore,
# support), each an array with one entry per class; support is not plotted
scores = metrics.precision_recall_fscore_support(ticker_performance['expected'], ticker_performance['predicted'])
score_matrix = [[measure[class_pos] for measure in scores[:3]]
                for class_pos in range(2)]
print(score_matrix)

image = ax.imshow(score_matrix, interpolation='nearest', cmap='RdBu_r', vmin=0, vmax=1)
fig.colorbar(image, ax=ax)
ax.set_title('LightGBM Classification Report - window CV', size=20)

ax.set_xticks(np.arange(3))
ax.set_xticklabels(['precision', 'recall', 'f1-score'], rotation=45)
# NOTE(review): row 0 holds the first (lowest) class label and is tagged
# 'Outperform' here - confirm this matches the target encoding.
ax.set_yticks(np.arange(2))
ax.set_yticklabels(['Outperform', 'Underperform'])

ax.set_ylabel('Classes')
ax.set_xlabel('Measures')
ax.yaxis.label.set_size(25)
ax.xaxis.label.set_size(25)

# Lay out after all labels exist so they are taken into account
fig.tight_layout()
plt.savefig(fname='lgbm_window_map_ticker_252_21_final.jpg', pad_inches=0, bbox_inches='tight')
plt.show()

In [None]:
# Text classification report for the pooled predictions
report_text = metrics.classification_report(
    ticker_performance['expected'],
    ticker_performance['predicted'],
)
print(report_text)