In [17]:
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like # may be necessary in some versions of pandas
import pandas_datareader.data as web

In [48]:
def get_symbols(symbols, data_source, begin_date=None, end_date=None):
    out = pd.DataFrame()
    for symbol in symbols:
        df = web.DataReader(symbol, data_source, begin_date, end_date, api_key='2gwP5WpwYPiHshKy1H9v')\
        [['AdjOpen','AdjHigh','AdjLow','AdjClose','AdjVolume']].reset_index()
        df.columns = ['date','open','high','low','close','volume'] #my convention: always lowercase
        df['symbol'] = symbol # add a new column which contains the symbol so we can keep multiple symbols in the same dataframe
        df = df.set_index(['date', 'symbol'])
        out = pd.concat([out, df], axis=0) #stacks on top of previously collected data
    return out.sort_index()
        
prices = get_symbols(['AAPL','CSCO'], data_source='quandl', begin_date='2015-01-01',end_date='2017-01-01')

                         open        high         low       close      volume
date       symbol                                                            
2016-12-30 AAPL    115.209202  115.752409  114.004271  114.389454  30586265.0
2016-12-29 AAPL    115.011672  115.663027  114.962290  115.288214  15039519.0
2016-12-28 AAPL    116.068456  116.558923  114.764760  115.317843  20905892.0
2016-12-27 AAPL    115.080808  116.344998  115.051178  115.811668  18296855.0
2016-12-23 AAPL    114.162295  115.080808  114.162295  115.080808  14249484.0
                         open        high         low       close      volume
date       symbol                                                            
2016-12-30 AAPL    115.209202  115.752409  114.004271  114.389454  30586265.0
2016-12-29 AAPL    115.011672  115.663027  114.962290  115.288214  15039519.0
2016-12-28 AAPL    116.068456  116.558923  114.764760  115.317843  20905892.0
2016-12-27 AAPL    115.080808  116.344998  115.051178  115.81166

In [49]:
features = pd.DataFrame(index=prices.index)
features['volume_change_ratio'] = prices.groupby(level='symbol').volume\
.diff(1) / prices.groupby(level='symbol').shift(1).volume
features['momentum_5_day'] = prices.groupby(level='symbol').close\
.pct_change(5) 

features['intraday_chg'] = (prices.groupby(level='symbol').close\
                            .shift(0) - prices.groupby(level='symbol').open\
                            .shift(0))/prices.groupby(level='symbol').open.shift(0)

features['day_of_week'] = features.index.get_level_values('date').weekday

features['day_of_month'] = features.index.get_level_values('date').day

features.dropna(inplace=True)
print(features.tail(10))

MultiIndex([('2015-01-02', 'AAPL'),
            ('2015-01-02', 'CSCO'),
            ('2015-01-05', 'AAPL'),
            ('2015-01-05', 'CSCO'),
            ('2015-01-06', 'AAPL'),
            ('2015-01-06', 'CSCO'),
            ('2015-01-07', 'AAPL'),
            ('2015-01-07', 'CSCO'),
            ('2015-01-08', 'AAPL'),
            ('2015-01-08', 'CSCO'),
            ...
            ('2016-12-23', 'AAPL'),
            ('2016-12-23', 'CSCO'),
            ('2016-12-27', 'AAPL'),
            ('2016-12-27', 'CSCO'),
            ('2016-12-28', 'AAPL'),
            ('2016-12-28', 'CSCO'),
            ('2016-12-29', 'AAPL'),
            ('2016-12-29', 'CSCO'),
            ('2016-12-30', 'AAPL'),
            ('2016-12-30', 'CSCO')],
           names=['date', 'symbol'], length=1008)
                   volume_change_ratio  momentum_5_day  intraday_chg  \
date       symbol                                                      
2016-12-23 AAPL              -0.453747        0.004743      0.008046 

In [51]:
outcomes = pd.DataFrame(index=prices.index)
# next day's opening change
outcomes['open_1'] = prices.groupby(level='symbol').open.shift(-1)\
/prices.groupby(level='symbol').close.shift(0)-1
# next day's closing change
func_one_day_ahead = lambda x: x.pct_change(-1)
outcomes['close_1'] = prices.groupby(level='symbol').close\
.apply(func_one_day_ahead)
func_five_day_ahead = lambda x: x.pct_change(-5)
outcomes['close_5'] = prices.groupby(level='symbol').close\
.apply(func_five_day_ahead)

In [55]:
# a valid value exists for both y and X
y = outcomes.close_1
X = features
Xy = X.join(y).dropna()
y = Xy[y.name]
X = Xy[X.columns]

In [58]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X,y)
print("Model RSQ: "+ str(model.score(X,y)))

print(model.coef_)

print("Coefficients: ")
print(pd.Series(model.coef_, index=X.columns).sort_values(ascending=False))

Model RSQ: 0.01598347165537528
[ 9.75791921e-04 -5.54280366e-03  1.50481641e-01 -4.26758666e-04
  3.63597672e-05]
Coefficients: 
volume_change_ratio    0.000976
momentum_5_day        -0.005543
intraday_chg           0.150482
day_of_week           -0.000427
day_of_month           0.000036
dtype: float64


In [61]:
from sklearn.ensemble import RandomForestRegressor

y = outcomes.open_1
X = features
Xy = X.join(y).dropna()
y = Xy[y.name]
X = Xy[X.columns]
print(y.shape)
print(X.shape)

model = RandomForestRegressor(max_features=3)
model.fit(X,y)
print("Model Score: "+ str(model.score(X,y)))

print("Feature Importance: ")
print(pd.Series(model.feature_importances_,index=X.columns)\
.sort_values(ascending=False))

(996,)
(996, 5)
Model Score: 0.859886469880583
Feature Importance: 
intraday_chg           0.304055
momentum_5_day         0.259014
volume_change_ratio    0.247768
day_of_month           0.129304
day_of_week            0.059860
dtype: float64


In [62]:
print(pd.Series(model.predict(X),index=X.index).tail(10))

date        symbol
2016-12-22  AAPL     -0.004547
            CSCO      0.001259
2016-12-23  AAPL     -0.000201
            CSCO      0.001655
2016-12-27  AAPL      0.001574
            CSCO      0.001585
2016-12-28  AAPL     -0.001967
            CSCO     -0.000973
2016-12-29  AAPL     -0.000133
            CSCO      0.002231
dtype: float64
