# Stock Price Classification
By: Jared Berry

In [1]:
# I/O and data structures
import pickle
import pandas as pd
import numpy as np

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor

# Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

# Evaluation
from sklearn import metrics

# Quality of life
import os
import time

## Set-up

#### Imports

In [2]:
# Import modeling helper functions
from modeling_funcs import *

In [3]:
# Load the pre-built modeling dictionary (targets and feature frames) from disk
# NOTE(review): pickle.load can execute arbitrary code - only unpickle files from a trusted source
inpath = "model_dictionary.pickle"
with open(inpath, 'rb') as f:
    modeling = pickle.load(f)

In [4]:
# Inspect which entries the loaded dictionary provides
modeling.keys()

dict_keys(['target_1_return', 'target_1_return_res', 'target_1_composite', 'target_1_average', 'target_1_rank', 'target_1_up', 'target_1_rel_return', 'target_1_rel_up', 'target_5_return', 'target_5_return_res', 'target_5_composite', 'target_5_average', 'target_5_rank', 'target_5_up', 'target_5_rel_return', 'target_5_rel_up', 'target_10_return', 'target_10_return_res', 'target_10_composite', 'target_10_average', 'target_10_rank', 'target_10_up', 'target_10_rel_return', 'target_10_rel_up', 'target_21_return', 'target_21_return_res', 'target_21_composite', 'target_21_average', 'target_21_rank', 'target_21_up', 'target_21_rel_return', 'target_21_rel_up', 'features', 'ticker_features'])

In [5]:
# Pull out the features dataframe (it also supplies the 'ticker' and 'date_of_transaction' columns used below)
train = modeling['features']

#### Feature selection

In [6]:
# Columns of the features dataframe that are passed to the models
# TODO: consider informing this selection with shrinkage methods (e.g. lasso regression)
features = ['High', 'Low', 'Open', 'Close', 'Volume', 'AdjClose', 'Year',
            'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Pct_Change_Daily',
            'Pct_Change_Monthly', 'Pct_Change_Yearly', 'RSI', 'Volatility',
            'Yearly_Return_Rank', 'Monthly_Return_Rank', 'Pct_Change_Class',
            'Rolling_Yearly_Mean_Positive_Days', 'Rolling_Monthly_Mean_Positive_Days', 
            'Rolling_Monthly_Mean_Price', 'Rolling_Yearly_Mean_Price',
            'Momentum_Quality_Monthly', 'Momentum_Quality_Yearly', 'SPY_Trailing_Month_Return',
            'open_l1', 'open_l5', 'open_l10', 'close_l1', 'close_l5', 'close_l10',
            'return_prev1_open_raw', 'return_prev5_open_raw', 'return_prev10_open_raw',
            'return_prev1_close_raw', 'return_prev5_close_raw', 'return_prev10_close_raw',
            'pe_ratio', 'debt_ratio', 'debt_to_equity', 'roa',
            'beta']

In [7]:
# Restrict the training frame to the selected feature columns for the modeling machinery,
# and list each distinct ticker once (in order of first appearance) for indexing
X = train.loc[:, features]
tickers = train['ticker'].drop_duplicates().tolist()

In [8]:
# Choose the modeling target (the 'target_21_rel_return' entry of the dictionary)
target = modeling['target_21_rel_return']

## Modeling

#### Panel-level

Many systemic factors affect the price of a stock at any given point in time, so it is prudent to fit and evaluate predictions across the full panel of S&P 500 stocks in our sample. A panel-level model can capture potential linkages between different stocks, and it allows us to explore using features generated from clustering to group similar stocks in the panel.

In [34]:
# Work on a panel-level copy of the target
y_p = target.copy()

# Hold-out test rows are the positions where the target is missing
# (the 21 days of data preceding the present day)
test_idx = np.flatnonzero(np.isnan(y_p)).tolist()

# Drop the same rows from the ticker/date lookup so that grouping lines up with the training data
ticker_locs = (
    train[['ticker', 'date_of_transaction']]
    .drop(train.index[test_idx])
    .reset_index(drop=True)
)

In [35]:
# Create a panel-level copy
X_p = X.copy(deep=True)

# Simple feature-scaling: standardize every column within each calendar day, then replace
# missings with 0 (i.e. the mean of a normalized feature).
# - The column-wise .mean()/.std(ddof=0) are spelled out explicitly: np.mean/np.std on a DataFrame
#   forward axis=None to pandas, which newer pandas releases reduce over BOTH axes (one scalar
#   for the whole day instead of one value per column).
# - group_keys=False keeps the original row index and order, so the positional hold-out lookup
#   via X_p.index[test_idx] stays aligned with the target.
# NOTE(review): Year/Month/Day are constant within a group, so where the grouping columns are
# passed to the lambda they come out as 0/0 -> NaN -> 0 - confirm that is intended.
X_p = (
    X_p.groupby(['Year', 'Month', 'Day'], group_keys=False)
       .apply(lambda day: (day - day.mean()) / day.std(ddof=0))
       .fillna(0)
)

In [36]:
# Split the hold-out test rows away from the training features and target
holdout_labels = X_p.index[test_idx]
X_p_holdout = X_p.loc[holdout_labels]
X_p = X_p.drop(holdout_labels)
y_p = np.delete(y_p, test_idx)

In [None]:
# Smooth the target within each ticker using an exponential moving average (EMA)
y_p_smoothed = np.zeros(y_p.shape[0])
for t in tickers:
    # Rows of this ticker in the hold-out-free lookup (its index was reset, so labels are positions)
    idx = ticker_locs.loc[ticker_locs['ticker'] == t].index.tolist()
    y_to_smooth = y_p[idx]
    
    # Compute EMA smoothing of target within ticker
    # With gamma_ = 1 every EMA equals the raw value, so the smoothing leaves the target unchanged
    # NOTE(review): the EMA is seeded with 0, which would pull early values toward 0 if gamma_ < 1
    EMA = 0
    gamma_ = 1
    for ti in range(len(y_to_smooth)):
        EMA = gamma_*y_to_smooth[ti] + (1-gamma_)*EMA
        y_to_smooth[ti] = EMA
        
    # Write the smoothed values back to this ticker's rows
    y_p_smoothed[idx] = y_to_smooth

In [38]:
# Use an unsmoothed copy of the target for modeling (replaces any EMA-smoothed values computed earlier)
y_p_smoothed = y_p.copy()

In [39]:
# Panel-level LightGBM fit and evaluation - ema_gamma MUST stay at 1 here
model_dict = fit_lgbm_classifier(
    X_p,
    y_p_smoothed,
    X_p_holdout,
    ticker="",
    ema_gamma=1,
    n_splits=12,
    cv_method='panel',
    groups=ticker_locs,
    labeled=False,
    param_search={},
    threshold_search=False,
)


0 targets changed by smoothing.


  preds = pd.DataFrame().from_items(zip(cols,vals))


Build, hyperparameter selection, and validation of LGBM Classifier took 447.362 seconds

Average AUC across 12 splits: 0.507832324860236
('roa', 578.7499999999999)
('debt_ratio', 525.0833333333334)
('pe_ratio', 392.25)
('beta', 356.1666666666667)
('Rolling_Yearly_Mean_Price', 257.0)
('Rolling_Yearly_Mean_Positive_Days', 175.58333333333334)


In [None]:
# Panel-level kNN fit and evaluation.
# ticker is left blank, as in the panel-level LightGBM call, instead of reusing a loop variable
# leaked from another cell (which made this cell depend on hidden notebook state).
# NOTE(review): unlike the LightGBM call, no cv_method/groups are passed here - confirm the
# helper's defaults are appropriate for panel data.
model_dict = fit_sklearn_classifier(X_p, 
                                    y_p, 
                                    X_p_holdout, 
                                    ticker="", 
                                    ema_gamma=1, 
                                    n_splits=2, 
                                    model=KNeighborsClassifier,
                                    label='kNN Classifier', 
                                    param_search = {},
                                    threshold_search = True)

In [None]:
# Score the predictions from split 1 only: confusion matrix, ROC AUC, then the full report
test = model_dict['preds_df'].loc[lambda preds: preds['split_number'] == 1]
for scorer in (metrics.confusion_matrix, metrics.roc_auc_score, metrics.classification_report):
    print(scorer(test['expected'], test['predicted']))

#### Ticker-level 

At the heart of this analysis is a time-series prediction problem. As such, it is prudent to explore running models for each individual stock. We can envision averaging the results of both modeling approaches to incorporate the contribution of both into a final prediction.

In [32]:
# Fit one model per ticker (currently only the first ticker) and collect the result dictionaries
results_dfs = []
for i, t in enumerate(tickers[:1]):

    # Keep only the feature/target rows that belong to the current stock
    is_ticker = train['ticker'] == t
    X_t = X.loc[is_ticker, :]
    y_t = np.array(target)[is_ticker]

    # Hold-out test rows are where the target is missing (the 21 days preceding the present day)
    test_idx = np.flatnonzero(np.isnan(y_t)).tolist()

    # Simple feature-scaling: standardize each column, then replace missings with 0
    # (i.e. the mean of a normalized feature)
    X_t = X_t.apply(lambda col: (col - col.mean()) / col.std(ddof=0)).fillna(0)

    # Split the hold-out rows away from the training features and target
    holdout_labels = X_t.index[test_idx]
    y_t = np.delete(y_t, test_idx)
    X_t_holdout = X_t.loc[holdout_labels]
    X_t = X_t.drop(holdout_labels)

    # Fit and evaluate a kNN classifier for this ticker.
    # (LightGBM alternative tried earlier: fit_lgbm_classifier(X_t, y_t, X_t_holdout, ticker=t,
    #  ema_gamma=1, n_splits=12, cv_method='tsrecur', labeled=False, param_search={},
    #  threshold_search=True))
    model_dict = fit_sklearn_classifier(
        X_t,
        y_t,
        X_t_holdout,
        ticker=t,
        ema_gamma=1,
        n_splits=12,
        cv_method='ts',
        model=KNeighborsClassifier,
        label='kNN Classifier',
        param_search={},
        threshold_search=True,
    )

    results_dfs.append(model_dict)


0 targets changed by smoothing.


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Build, hyperparameter selection, and validation of kNN Classifier took 0.948 seconds

Hyperparameters are as follows:
Validation scores are as follows:
precision    0.666199
recall       0.547325
accuracy     0.547325
f1           0.524389
dtype: float64


  preds = pd.DataFrame().from_items(zip(cols,vals))


In [33]:
# Score the first ticker's predictions from every split except split 0:
# confusion matrix, ROC AUC, then the full classification report
test = results_dfs[0]['preds_df'].loc[lambda preds: preds['split_number'] != 0]
for scorer in (metrics.confusion_matrix, metrics.roc_auc_score, metrics.classification_report):
    print(scorer(test['expected'], test['predicted']))

[[586 382]
 [498 478]]
0.5475629995935509
              precision    recall  f1-score   support

           0       0.54      0.61      0.57       968
           1       0.56      0.49      0.52       976

   micro avg       0.55      0.55      0.55      1944
   macro avg       0.55      0.55      0.55      1944
weighted avg       0.55      0.55      0.55      1944

