In [1]:
from IPython.core.display import Image
import sys

import numpy as np
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like # remove once updated pandas-datareader issue is fixed
# https://github.com/pydata/pandas-datareader/issues/534
import pandas_datareader.data as web
%matplotlib inline

In [2]:
def get_symbols(symbols, data_source, begin_date=None, end_date=None):
    out = pd.DataFrame()
    for symbol in symbols:
        df = web.DataReader(symbol, data_source, begin_date, end_date, api_key='2gwP5WpwYPiHshKy1H9v')\
        [['AdjOpen','AdjHigh','AdjLow','AdjClose','AdjVolume']].reset_index()
        df.columns = ['date','open','high','low','close','volume'] #my convention: always lowercase
        df['symbol'] = symbol # add a new column which contains the symbol so we can keep multiple symbols in the same dataframe
        df = df.set_index(['date', 'symbol'])
        out = pd.concat([out, df], axis=0) #stacks on top of previously collected data
    return out.sort_index()
        
prices = get_symbols(['AAPL','CSCO','AMZN','YHOO','MSFT'],data_source='quandl',begin_date='2012-01-01',end_date=None)
# note, we're only using real price data to get an accurate date/symbol index set.  

print(prices.shape)

(7639, 5)


In [3]:
num_obs = prices.close.count()

def add_memory(s,n_days=50,mem_strength=0.1):
    ''' adds autoregressive behavior to series of data'''
    add_ewm = lambda x: (1-mem_strength)*x + mem_strength*x.ewm(n_days).mean()
    out = s.groupby(level='symbol').apply(add_ewm)
    return out

# generate feature data
f01 = pd.Series(np.random.randn(num_obs),index=prices.index)
f01 = add_memory(f01,10,0.1)
f02 = pd.Series(np.random.randn(num_obs),index=prices.index)
f02 = add_memory(f02,10,0.1)
f03 = pd.Series(np.random.randn(num_obs),index=prices.index)
f03 = add_memory(f03,10,0.1)
f04 = pd.Series(np.random.randn(num_obs),index=prices.index)
f04 = f04 # no memory

features = pd.concat([f01,f02,f03,f04],axis=1)

## now, create response variable such that it is related to features
# f01 becomes increasingly important, f02 becomes decreasingly important,
# f03 oscillates in importance, f04 is stationary, 
# and finally a noise component is added

outcome =   f01 * np.linspace(0.5,1.5,num_obs) + \
            f02 * np.linspace(1.5,0.5,num_obs) + \
            f03 * pd.Series(np.sin(2*np.pi*np.linspace(0,1,num_obs)*2)+1,index=f03.index) + \
            f04 + \
            np.random.randn(num_obs) * 3 
outcome.name = 'outcome'

In [4]:
from sklearn.linear_model import LinearRegression

## fit models for each timestep on a walk-forward basis
recalc_dates = features.resample('Q',level='date').mean().index.values[:-1]
models = pd.Series(index=recalc_dates)
for date in recalc_dates:
    X_train = features.xs(slice(None,date),level='date',drop_level=False)
    y_train = outcome.xs(slice(None,date),level='date',drop_level=False)
    model = LinearRegression()
    model.fit(X_train,y_train)
    models.loc[date] = model

## predict values walk-forward (all predictions out of sample)
begin_dates = models.index
end_dates = models.index[1:].append(pd.to_datetime(['2099-12-31']))

predictions = pd.Series(index=features.index)

for i,model in enumerate(models): #loop thru each models object in collection
    X = features.xs(slice(begin_dates[i],end_dates[i]),level='date',drop_level=False)
    p = pd.Series(model.predict(X),index=X.index)
    predictions.loc[X.index] = p

  """


In [5]:
import sklearn.metrics as metrics

# make sure we have 1-for-1 mapping between pred and true
common_idx = outcome.dropna().index.intersection(predictions.dropna().index)
y_true = outcome[common_idx]
y_true.name = 'y_true'
y_pred = predictions[common_idx]
y_pred.name = 'y_pred'

standard_metrics = pd.Series()

standard_metrics.loc['explained variance'] = metrics.explained_variance_score(y_true, y_pred)
standard_metrics.loc['MAE'] = metrics.mean_absolute_error(y_true, y_pred)
standard_metrics.loc['MSE'] = metrics.mean_squared_error(y_true, y_pred)
standard_metrics.loc['MedAE'] = metrics.median_absolute_error(y_true, y_pred)
standard_metrics.loc['RSQ'] = metrics.r2_score(y_true, y_pred)

print(standard_metrics)

explained variance    0.251468
MAE                   2.490252
MSE                   9.782391
MedAE                 2.100599
RSQ                   0.251006
dtype: float64


  # Remove the CWD from sys.path while we load stuff.


In [6]:
print(pd.concat([y_pred,y_true],axis=1).tail())

                     y_pred    y_true
date       symbol                    
2018-03-26 MSFT   -0.760784 -1.580143
2018-03-27 AAPL   -0.807113  0.883849
           AMZN   -0.205872 -0.459917
           CSCO   -3.074272  1.196385
           MSFT   -0.616657  0.226104


In [8]:
def make_df(y_pred, y_true):
    y_pred.name = 'y_pred'
    y_true.name = 'y_true'
    
    df = pd.concat([y_pred,y_true],axis=1)

    df['sign_pred'] = df.y_pred.apply(np.sign)
    df['sign_true'] = df.y_true.apply(np.sign)
    df['is_correct'] = 0
    df.loc[df.sign_pred * df.sign_true > 0 ,'is_correct'] = 1 # only registers 1 when prediction was made AND it was correct
    df['is_incorrect'] = 0
    df.loc[df.sign_pred * df.sign_true < 0,'is_incorrect'] = 1 # only registers 1 when prediction was made AND it was wrong
    df['is_predicted'] = df.is_correct + df.is_incorrect
    df['result'] = df.sign_pred * df.y_true 
    return df

df = make_df(y_pred,y_true)
print(df.dropna().tail())

                     y_pred    y_true  sign_pred  sign_true  is_correct  \
date       symbol                                                         
2018-03-26 MSFT   -0.760784 -1.580143       -1.0       -1.0           1   
2018-03-27 AAPL   -0.807113  0.883849       -1.0        1.0           0   
           AMZN   -0.205872 -0.459917       -1.0       -1.0           1   
           CSCO   -3.074272  1.196385       -1.0        1.0           0   
           MSFT   -0.616657  0.226104       -1.0        1.0           0   

                   is_incorrect  is_predicted    result  
date       symbol                                        
2018-03-26 MSFT               0             1  1.580143  
2018-03-27 AAPL               1             1 -0.883849  
           AMZN               0             1  0.459917  
           CSCO               1             1 -1.196385  
           MSFT               1             1 -0.226104  


In [9]:
def calc_scorecard(df):
    scorecard = pd.Series()
    # building block metrics
    scorecard.loc['accuracy'] = df.is_correct.sum()*1. / (df.is_predicted.sum()*1.)*100
    scorecard.loc['edge'] = df.result.mean()
    scorecard.loc['noise'] = df.y_pred.diff().abs().mean()
    
    return scorecard    

calc_scorecard(df)

  


accuracy    66.925911
edge         1.443786
noise        2.356521
dtype: float64

In [10]:
def calc_scorecard(df):
    scorecard = pd.Series()
    # building block metrics
    scorecard.loc['accuracy'] = df.is_correct.sum()*1. / (df.is_predicted.sum()*1.)*100
    scorecard.loc['edge'] = df.result.mean()
    scorecard.loc['noise'] = df.y_pred.diff().abs().mean()

    # derived metrics
    scorecard.loc['y_true_chg'] = df.y_true.abs().mean()
    scorecard.loc['y_pred_chg'] = df.y_pred.abs().mean()
    scorecard.loc['prediction_calibration'] = scorecard.loc['y_pred_chg']/scorecard.loc['y_true_chg']
    scorecard.loc['capture_ratio'] = scorecard.loc['edge']/scorecard.loc['y_true_chg']*100

        # metrics for a subset of predictions
    scorecard.loc['edge_long'] = df[df.sign_pred == 1].result.mean()  - df.y_true.mean()
    scorecard.loc['edge_short'] = df[df.sign_pred == -1].result.mean()  - df.y_true.mean()

    scorecard.loc['edge_win'] = df[df.is_correct == 1].result.mean()  - df.y_true.mean()
    scorecard.loc['edge_lose'] = df[df.is_incorrect == 1].result.mean()  - df.y_true.mean()
    
    return scorecard    

calc_scorecard(df)

  


accuracy                  66.925911
edge                       1.443786
noise                      2.356521
y_true_chg                 2.879901
y_pred_chg                 1.662847
prediction_calibration     0.577397
capture_ratio             50.133187
edge_long                  1.458664
edge_short                 1.584487
edge_win                   3.308671
edge_lose                 -2.092591
dtype: float64

In [11]:
def scorecard_by_year(df):
    df['year'] = df.index.get_level_values('date').year
    return df.groupby('year').apply(calc_scorecard).T

print(scorecard_by_year(df))

year                         2012       2013       2014       2015       2016  \
accuracy                70.744681  66.825397  63.253968  70.000000  67.301587   
edge                     1.894709   1.452918   1.070811   1.752079   1.441837   
noise                    2.773124   2.696849   2.322163   2.038755   2.257845   
y_true_chg               3.120458   2.841021   2.750470   3.039346   2.867554   
y_pred_chg               1.932135   1.871917   1.608162   1.514261   1.582326   
prediction_calibration   0.619183   0.658889   0.584686   0.498220   0.551804   
capture_ratio           60.718941  51.140701  38.931934  57.646588  50.281082   
edge_long                1.805685   1.428200   1.077546   1.811012   1.481127   
edge_short               1.790653   1.503973   1.518405   1.935566   1.751144   
edge_win                 3.448366   3.225312   3.249500   3.545831   3.379922   
edge_lose               -2.191105  -2.079611  -2.056583  -2.022060  -2.001731   

year                       

  
  
  
  
  
  
  


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNetCV,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor

X_train,X_test,y_train,y_test = train_test_split(features,outcome,test_size=0.20,shuffle=False)

# linear regression
model1 = LinearRegression().fit(X_train,y_train)
model1_train = pd.Series(model1.predict(X_train),index=X_train.index)
model1_test = pd.Series(model1.predict(X_test),index=X_test.index)

model2 = RandomForestRegressor().fit(X_train,y_train)
model2_train = pd.Series(model2.predict(X_train),index=X_train.index)
model2_test = pd.Series(model2.predict(X_test),index=X_test.index)

# create dataframes for each 
model1_train_df = make_df(model1_train,y_train)
model1_test_df = make_df(model1_test,y_test)
model2_train_df = make_df(model2_train,y_train)
model2_test_df = make_df(model2_test,y_test)

s1 = calc_scorecard(model1_train_df)
s1.name = 'model1_train'
s2 = calc_scorecard(model1_test_df)
s2.name = 'model1_test'
s3 = calc_scorecard(model2_train_df)
s3.name = 'model2_train'
s4 = calc_scorecard(model2_test_df)
s4.name = 'model2_test'

print(pd.concat([s1,s2,s3,s4],axis=1))

                        model1_train  model1_test  model2_train  model2_test
accuracy                   67.844870    63.285340     92.292587    61.976440
edge                        1.553404     1.086505      2.836634     1.009460
noise                       2.225819     2.241585      3.114979     2.437531
y_true_chg                  2.927533     2.712464      2.927533     2.712464
y_pred_chg                  1.563711     1.593830      2.220332     1.693174
prediction_calibration      0.534140     0.587595      0.758431     0.624220
capture_ratio              53.061871    40.056022     96.895008    37.215620
edge_long                   1.617875     1.105491      2.934078     1.018890
edge_short                  1.657022     1.047895      2.909756     0.979784
edge_win                    3.387208     2.991158      3.207636     2.992391
edge_lose                  -2.051851    -2.224621     -0.504822    -2.249708


  
  
  
  
