## Early-stage modeling

In [84]:
# Import necessary libraries for data preparation/EDA
import os
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

%matplotlib inline

sns.set_style('darkgrid')

### Set-up & data preparation

In [37]:
# Ensure working directory is set appropriately - change as needed
os.chdir('C:\\Users\\jared\\Documents\\data_science\\school\\georgetown\\capstone\\Passive-Stock-Fund-Optimization')

In [4]:
# Load datasets into memory
yahoo = pd.read_csv('stock_price_until_2019_04_28.csv')
simfin = pd.read_csv('simfin/daily_simfin.csv')

In [13]:
# Check dimensions
print("Yahoo! Finance data has {} observations and {} features.".format(yahoo.shape[0], yahoo.shape[1]))
print("Daily SimFin data has {} observations and {} features.".format(simfin.shape[0], simfin.shape[1]))

Yahoo! Finance data has 1014554 observations and 22 features.
Daily SimFin data has 1509516 observations and 53 features.


In [20]:
# Check keys
print("SimFin:")
print(simfin[['ticker', 'date']].head())
print("Yahoo! Finance:")
print(yahoo[['Symbol', 'date_of_transaction']].head())

SimFin:
  ticker        date
0    MMM  2011-03-31
1    MMM  2011-04-01
2    MMM  2011-04-02
3    MMM  2011-04-03
4    MMM  2011-04-04
Yahoo! Finance:
  Symbol date_of_transaction
0    ABT          2011-01-03
1    ABT          2011-01-04
2    ABT          2011-01-05
3    ABT          2011-01-06
4    ABT          2011-01-07


#### Merge preparation

In [22]:
# Some quick fixes on keys
yahoo['ticker'] = yahoo['Symbol']
yahoo.drop('Symbol', axis=1, inplace=True)

simfin['date_of_transaction'] = simfin['date']
simfin.drop('date', axis=1, inplace=True)

In [44]:
# Construct the 'train' dataset by merging stock prices and fundamentals; ensure proper sorting and filter on early sample
train = yahoo ## pd.merge(yahoo, simfin, on=['ticker', 'date_of_transaction'])
train = train.sort_values(['ticker','date_of_transaction'])
train = train[train['date_of_transaction'] >= '2011-03-31'].reset_index().drop('index', axis=1)
train.head()

Unnamed: 0,sno,date_of_transaction,High,Low,Open,Close,Volume,AdjClose,Year,Month,...,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,ticker
0,22395,2011-03-31,32.203148,31.802574,32.131618,32.031475,3068400.0,29.502216,2011,3,...,3,90,True,False,True,False,False,False,1301529600,A
1,22396,2011-04-01,32.532188,32.031475,32.103004,32.288982,3092700.0,29.739388,2011,4,...,4,91,False,True,False,True,False,False,1301616000,A
2,22397,2011-04-04,32.482117,31.731045,32.238914,32.067238,2983100.0,29.535152,2011,4,...,0,94,False,False,False,False,False,False,1301875200,A
3,22398,2011-04-05,32.546494,31.909872,32.010014,32.432045,4385200.0,29.871157,2011,4,...,1,95,False,False,False,False,False,False,1301961600,A
4,22399,2011-04-06,32.689556,31.988556,32.55365,32.396282,3287900.0,29.838217,2011,4,...,2,96,False,False,False,False,False,False,1302048000,A


#### Target generation

In [49]:
# At the ticker level, lead the AdjClose column by 21 trading days
target_gen = train[['ticker', 'date_of_transaction', 'AdjClose']]
AdjClose_ahead = target_gen.groupby('ticker')['AdjClose'].shift(-21)
AdjClose_ahead.name = 'AdjClose_ahead'

In [54]:
# Construct monthly-return target variables, both continuous and categorical
y_cont = np.array(100*((AdjClose_ahead - train['AdjClose'])/train['AdjClose']))
y_disc = np.where(y_cont > 0, 1, np.where(np.isnan(y_cont), np.nan, 0))

  This is separate from the ipykernel package so we can avoid doing imports until


In [96]:
# Construct a market-residualized variant (NEED S&P 500 OR CONSTRUCTION OF AN INDEX FROM CURRENT FEATURES)

In [97]:
# Construct various moving-average variants

### Exporatory Data Analysis

#### Feature selection

In [94]:
# Set a feature selection list (THINK ABOUT INFORMING THIS SELECTION WITH SHRINKAGE METHODS, I.E. RIDGE REGRESSION)
features = ['High', 
            'Low', 
            'Open', 
            'Close', 
            'Volume', 
            'AdjClose', 
            'Year',
            'Month',     # ohe? 
            'Week',      # ohe?
            'Day',       # ohe?
            'Dayofweek', # ohe?
            'Dayofyear'] # ohe?

In [None]:
# Select on features to pass to modeling machinery
X = train[features]

### Modeling

#### Set-up

In [103]:
# Set relevant scikit-learn functions/modules

# Tests for stationarity 
from statsmodels.tsa.stattools import adfuller

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# LightGBM
## import lightgbm as lgb

# Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Evaluation
from sklearn import metrics

In [138]:
# From Wheat Classification notebook 
def fit_and_evaluate(X, y, model, label, param_search):
    """
    Because of the Scikit-Learn API, we can create a function to
    do all of the fit and evaluate work on our behalf!
    """
    start  = time.time() # Start the clock! 
    scores = {'precision':[], 'recall':[], 'accuracy':[], 'f1':[]}

    # Set time-series, cross-validation indices
    tscv = TimeSeriesSplit(n_splits=12).split(X)
    tscv_grid = TimeSeriesSplit(n_splits=12).split(X)
    
    # Perform a grid-search on the provided parameters to determine best options
    gridsearch = GridSearchCV(estimator=model(), 
                              cv=tscv_grid,
                              param_grid=param_search,
                              n_jobs=-1)
    
    # Fit to extract best parameters later
    gridsearch_model = gridsearch.fit(X,y)

    for train, test in tscv:
        X_train, X_test = X_t.loc[X_t.index[train]], X_t.loc[X_t.index[test]]
        y_train, y_test = y[train], y[test]
        
        estimator = model(**gridsearch_model.best_params_)
        estimator.fit(X_train, y_train)
        
        expected  = y_test
        predicted = estimator.predict(X_test)

        # Append our scores to the tracker
        scores['precision'].append(metrics.precision_score(expected, predicted, average="weighted"))
        scores['recall'].append(metrics.recall_score(expected, predicted, average="weighted"))
        scores['accuracy'].append(metrics.accuracy_score(expected, predicted))
        scores['f1'].append(metrics.f1_score(expected, predicted, average="weighted"))

    # Report
    print("Build, hyperparameter selection, and validation of {} took {:0.3f} seconds\n".format(label, time.time()-start))
    print("Hyperparameters are as follows:")
    for key in gridsearch_model.best_params_.keys():
        print("{}: {}\n".format(key, gridsearch_model.best_params_[key]))
    print("Validation scores are as follows:")
    print(pd.DataFrame(scores).mean())

#### Panel-level

Given that there are bound to be a number of systemic considerations that impact the price of a stock at any given point in time, it is prudent to perform and evaluate predictions across the panel of S&P 500 stocks in our sample, which will capture potential linkages between different stocks, and allow us to explore the possibility of using features generated from clustering to group like stocks in the panel

#### Ticker-level 

At the heart of this analysis is a time-series prediction problem. As such, it is prudent to explore running models for each individual stock. We can envision averaging the results of both modeling approaches to incorporate the contribution of both into a final prediction.

In [65]:
# Create a list of the tickers to loop through
tickers = train['ticker'].tolist()

In [139]:
i = 1; t = tickers[i] ## for i, t in enumerate(tickers):

# Pull only feature/target data for the relevant stocker
X_t = X.loc[train['ticker'] == t,:]
y_t = y_disc[train['ticker'] == t]

# Indexes of hold-out test data (the 21 days of data preceding the present day)
test_idx = np.where(np.isnan(y_t))[0].tolist()

# Simple feature-scaling - for now, replace missings with 0 (i.e. the mean of a normalized feature)
X_t = X_t.apply(lambda x: (x - np.mean(x))/np.std(x)).fillna(0)

# Remove hold-out test data
y_t = np.delete(y_t, test_idx)
X_t_holdout = X_t.loc[X_t.index[test_idx]]
X_t = X_t.drop(X_t.index[test_idx])

# Fit and evaluate
fit_and_evaluate(X_t, y_t, 
                 KNeighborsClassifier, "kNN Classifier", 
                 {'n_neighbors':[2,4,6,8,10,12]})

Build, hyperparameter selection, and validation of kNN Classifier took 0.849 seconds

Hyperparameters are as follows:
n_neighbors: 4

Validation scores are as follows:
precision    0.584789
recall       0.478214
accuracy     0.478214
f1           0.467741
dtype: float64
