In [329]:
%matplotlib inline

import requests
import pandas as pd 
import matplotlib.pyplot as plt
from taIndicators import basic, momentum

from pandas.plotting import scatter_matrix

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier

from sklearn.metrics import r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

import time

START_DATE = '2011-01-03'
END_DATE = '2019-04-03'
YEARLY_TRADING_DAYS = 252
MONTHLY_TRADING_DAYS = 21

In [330]:
# import files and convert to dataframe
file_name = "data/spy-momentum.csv"
df = pd.read_csv(file_name)

In [331]:
df.set_index(['Symbol', 'Date'], inplace=True)

In [332]:
df.drop(columns=['Pct_Change_Class','Rolling_Yearly_Mean_Price', 'Rolling_Monthly_Mean_Price', 'Rolling_Monthly_Mean_Positive_Days', 'Rolling_Yearly_Mean_Positive_Days', 'High', 'Low', 'Open', 'Close'], inplace=True)

In [333]:
target_AdjClose = df.groupby(level=0)['AdjClose'].shift(-1 * MONTHLY_TRADING_DAYS)
target_return_rank_monthly = df.groupby(level=0)['Monthly_Return_Rank'].shift(-1 * MONTHLY_TRADING_DAYS)
target_beat_SPY = df.groupby(level=0)['SPY_Trailing_Month_Return'].shift(-1 * MONTHLY_TRADING_DAYS)
target_SPY_return = df.groupby(level=0)['SPY_Trailing_Month_Return'].shift(-1 * MONTHLY_TRADING_DAYS)
target_month_return = df.groupby(level=0)['Pct_Change_Monthly'].shift(-1 * MONTHLY_TRADING_DAYS)

In [334]:
df['target_AdjClose'] = target_AdjClose.values
df['target_return_rank'] = target_return_rank_monthly
df['target_beat_SPY'] = target_beat_SPY.values
df['target_SPY_return'] = target_beat_SPY.values
df['target_month_return'] = target_month_return.values

In [335]:
df['target_isUp'] = df['target_AdjClose']
df['target_isTop100'] = df['target_return_rank']

In [336]:
topTile = 250
df['target_isUp'].where(df['target_AdjClose'] > df['AdjClose'], other=0, inplace=True)
df['target_isUp'].where(df['target_AdjClose'] < df['AdjClose'], other=1, inplace=True)
df['target_isTop100'].where(df['target_return_rank'] > topTile, other=1, inplace=True)
df['target_isTop100'].where(df['target_return_rank'] < topTile, other=0, inplace=True)

In [337]:
df['target_beat_SPY'].where(df['target_month_return'] > df['target_beat_SPY'], other=0, inplace=True)
df['target_beat_SPY'].where(df['target_month_return'] < df['target_beat_SPY'], other=1, inplace=True)

In [338]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Volume,AdjClose,Pct_Change_Daily,Pct_Change_Monthly,Pct_Change_Yearly,RSI,Volatility,Yearly_Return_Rank,Monthly_Return_Rank,Momentum_Quality_Monthly,Momentum_Quality_Yearly,SPY_Trailing_Month_Return,target_AdjClose,target_return_rank,target_beat_SPY,target_SPY_return,target_month_return,target_isUp,target_isTop100
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A,2011-01-03,4994000.0,27.591616,,,,,,,,,,,27.163389,348.0,0.0,0.027076,-0.01552,0.0,0.0
A,2011-01-04,5017200.0,27.334681,-0.009312,,,,,,,,,,27.005262,363.0,0.0,0.029926,-0.012051,0.0,0.0
A,2011-01-05,4519000.0,27.275387,-0.002169,,,,,,,,,,28.322918,210.0,1.0,0.027499,0.038406,1.0,1.0
A,2011-01-06,4699000.0,27.328091,0.001932,,,,,,,,,,29.278212,130.0,1.0,0.035953,0.07136,1.0,1.0
A,2011-01-07,3810900.0,27.420322,0.003375,,,,,,,,,,29.100332,183.0,1.0,0.042709,0.061269,1.0,1.0


# # Run Simple Random Forest Classifier Model on the momentum dataset

In [339]:
ticker = 'AMD' # choose ticker to run through the model

In [340]:
feature_list = [ "Pct_Change_Daily", "Pct_Change_Monthly", "Pct_Change_Yearly", "RSI", "Volatility",
               "Yearly_Return_Rank", "Monthly_Return_Rank", "Momentum_Quality_Monthly", "Momentum_Quality_Yearly",
               "SPY_Trailing_Month_Return"]
target_label_isTop100 = "target_isTop100"
target_label_isUp = "target_isUp"
target_label_beatSPY = "target_beat_SPY"

In [341]:
#df_stock = df.loc[ticker]
li = ['AAPL', 'AMD', 'MSFT', 'INTC', 'NFLX', 'AMZN', 'GOOG', 'FB', 'CRM', 'PYPL', 'ORCL']
#df_stock = df.loc[li]
df_stock = df
#df_stock.reset_index(inplace=True)
#df_stock.drop(columns=['Date'], inplace=True)
df_stock = df_stock.shift(-1 * MONTHLY_TRADING_DAYS)

In [342]:
df_stock.reset_index(inplace=True)
#df_stock.drop(columns=['Date', 'index'], inplace=True)
#df_stock.head()

In [343]:
df_stock.drop(columns=['Date'], inplace=True)

In [344]:
df_stock['Pct_Change_Yearly'].fillna(value=df_stock['Pct_Change_Monthly'], inplace=True)
df_stock['Yearly_Return_Rank'].fillna(value=df_stock['Monthly_Return_Rank'], inplace=True)
df_stock['Momentum_Quality_Yearly'].fillna(value=df_stock['Momentum_Quality_Monthly'], inplace=True)
df_stock['Volatility'].fillna(df_stock['Volatility'].mean(), inplace=True)
df_stock.fillna(0, inplace=True)
df_stock.head()

Unnamed: 0,Symbol,Volume,AdjClose,Pct_Change_Daily,Pct_Change_Monthly,Pct_Change_Yearly,RSI,Volatility,Yearly_Return_Rank,Monthly_Return_Rank,Momentum_Quality_Monthly,Momentum_Quality_Yearly,SPY_Trailing_Month_Return,target_AdjClose,target_return_rank,target_beat_SPY,target_SPY_return,target_month_return,target_isUp,target_isTop100
0,A,5551100.0,27.163389,-0.019501,-0.01552,-0.01552,46.658953,0.378358,348.0,348.0,0.073906,0.073906,0.027076,30.800106,29.0,1.0,0.015174,0.133883,1.0,1.0
1,A,4924800.0,27.005262,-0.005821,-0.012051,-0.012051,45.49263,0.378358,363.0,363.0,0.057387,0.057387,0.029926,30.220327,23.0,1.0,0.00497,0.119053,1.0,1.0
2,A,7265100.0,28.322918,0.048793,0.038406,0.038406,55.294661,0.378358,210.0,210.0,0.182884,0.182884,0.027499,30.286217,74.0,1.0,0.010904,0.069318,1.0,1.0
3,A,6388700.0,29.278212,0.033729,0.07136,0.07136,60.689525,0.378358,130.0,130.0,0.339807,0.339807,0.035953,30.338928,137.0,1.0,0.003182,0.036229,1.0,1.0
4,A,5220900.0,29.100332,-0.006075,0.061269,0.061269,59.287206,0.378358,183.0,183.0,-0.291756,-0.291756,0.042709,29.568096,132.0,1.0,-0.019838,0.016074,1.0,1.0


In [345]:
stock_features = df_stock[feature_list]
stock_labels_top100 = df_stock[target_label_isTop100]
stock_labels_isUp = df_stock[target_label_isUp]
stock_labels_beat_SPY = df_stock[target_label_beatSPY]

In [346]:
# normalize features: monthly return rank, rsi, yearly return rank
normalized_mr = stock_features['Monthly_Return_Rank'] / 500
stock_features['Monthly_Return_Rank'] = normalized_mr.values

normalized_yr = stock_features['Yearly_Return_Rank'] / 500
stock_features['Yearly_Return_Rank'] = normalized_yr.values

normalized_rsi = stock_features['RSI'] / 100
stock_features['RSI'] = normalized_rsi.values
stock_features.head()

Unnamed: 0,Pct_Change_Daily,Pct_Change_Monthly,Pct_Change_Yearly,RSI,Volatility,Yearly_Return_Rank,Monthly_Return_Rank,Momentum_Quality_Monthly,Momentum_Quality_Yearly,SPY_Trailing_Month_Return
0,-0.019501,-0.01552,-0.01552,0.46659,0.378358,0.696,0.696,0.073906,0.073906,0.027076
1,-0.005821,-0.012051,-0.012051,0.454926,0.378358,0.726,0.726,0.057387,0.057387,0.029926
2,0.048793,0.038406,0.038406,0.552947,0.378358,0.42,0.42,0.182884,0.182884,0.027499
3,0.033729,0.07136,0.07136,0.606895,0.378358,0.26,0.26,0.339807,0.339807,0.035953
4,-0.006075,0.061269,0.061269,0.592872,0.378358,0.366,0.366,-0.291756,-0.291756,0.042709


In [347]:
def basic_random_forest_classifier(stock_features, stock_labels, n_splits=2):
    #X_train, X_test, y_train, y_test = train_test_split(stock_features, stock_labels, test_size=test_size)
    tseries = TimeSeriesSplit(n_splits)
    test_results = []
    for train_index, test_index in tseries.split(stock_features):  
        X_train, X_test = stock_features.iloc[train_index], stock_features.iloc[test_index]
        y_train, y_test = stock_labels.iloc[train_index], stock_labels.iloc[test_index]
        model = RandomForestClassifier(n_estimators=16)
        model.fit(X_train, y_train)

        expected = y_test
        predicted = model.predict(X_test)
        
        false_positive_rate, true_positive_rate, thresholds = roc_curve(expected, predicted)
        roc_auc = auc(false_positive_rate, true_positive_rate)
        """
        print(roc_auc)
        print(false_positive_rate)
        print(true_positive_rate)
        print(thresholds)
        """

        print('Random Forest model')
        #print("Stock Percent Up Days: ", expected.mean(), "\n\n")
        print("F1:",f1_score(expected, predicted))
        print("accuracy: ", accuracy_score(expected, predicted))
        
        print(classification_report(expected, predicted))
        print(confusion_matrix(expected, predicted))
    #from matplotlib.legend_handler import HandlerLine2D


In [348]:
def tss_cross_validation(stock_features, stock_labels, model=RandomForestClassifier(n_estimators=16), n_splits=2, scoring='precision'):
    tseries = TimeSeriesSplit(n_splits)
    #X_train, X_test = stock_features.iloc[train_index], stock_features.iloc[test_index]
    #y_train, y_test = stock_labels.iloc[train_index], stock_labels.iloc[test_index]    
    scores = cross_val_score(model, stock_features, stock_labels, cv=tseries, scoring=scoring)
    print("{}: {}".format(model.__class__.__name__, scores.mean()))
    return scores


## Use Random Forest Classification to try to predict whether a stock's return will be ranked in the top 100 of the s&p 500

In [350]:
basic_random_forest_classifier(stock_features, stock_labels_beat_SPY)

Random Forest model
F1: 0.7171529443810042
accuracy:  0.6008297258297258
              precision    recall  f1-score   support

         0.0       0.41      0.27      0.32    120542
         1.0       0.66      0.79      0.72    217642

   micro avg       0.60      0.60      0.60    338184
   macro avg       0.53      0.53      0.52    338184
weighted avg       0.57      0.60      0.58    338184

[[ 32055  88487]
 [ 46506 171136]]
Random Forest model
F1: 0.7207797790933475
accuracy:  0.6062882927637026
              precision    recall  f1-score   support

         0.0       0.41      0.28      0.33    118315
         1.0       0.67      0.78      0.72    219869

   micro avg       0.61      0.61      0.61    338184
   macro avg       0.54      0.53      0.53    338184
weighted avg       0.58      0.61      0.58    338184

[[ 33184  85131]
 [ 48016 171853]]


In [304]:
models = [
    SVC(gamma='auto'), LinearSVC(), 
    SGDClassifier(max_iter=100, tol=1e-3), KNeighborsClassifier(), 
    LogisticRegression(solver='lbfgs'), LogisticRegressionCV(cv=3), 
    BaggingClassifier(), ExtraTreesClassifier(n_estimators=100), 
    RandomForestClassifier()
]

In [320]:
# Test the optimal number of time series splits

import warnings
warnings.filterwarnings('ignore')
splits = [2, 4, 8, 12, 16]
for n in splits:
    tss_cross_validation(stock_features, stock_labels_beat_SPY, n_splits=n)


RandomForestClassifier: 0.6767242665172374
RandomForestClassifier: 0.6731254892391498
RandomForestClassifier: 0.6575960999413994
RandomForestClassifier: 0.6541745778362187
RandomForestClassifier: 0.6505783610516205


In [321]:
# Test N_estimators
n_estimators = [1, 2, 4, 8, 16, 32]

for est in n_estimators:
    rf = RandomForestClassifier(n_estimators=est)
    tss_cross_validation(stock_features, stock_labels_beat_SPY, n_splits=n, model=rf)

RandomForestClassifier: 0.6637271323650095
RandomForestClassifier: 0.6532958091510843
RandomForestClassifier: 0.6588647336761889
RandomForestClassifier: 0.649651830258309
RandomForestClassifier: 0.6483400511410219
RandomForestClassifier: 0.6498221137901672


"""
OUTPUT:
RandomForestClassifier: 0.6430098832358393
RandomForestClassifier: 0.5119726051805669
RandomForestClassifier: 0.586813596047268
RandomForestClassifier: 0.6455814593350315
RandomForestClassifier: 0.6824951525466871
RandomForestClassifier: 0.6977027288607828
RandomForestClassifier: 0.7141188097330256
"""

In [322]:
# Test Max dept of the tree
max_depths = [2, 4, 8, 16, 25, 32]
n_estimators = 16
for max_depth in max_depths:
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    print("max_depth=", max_depth)
    tss_cross_validation(stock_features, stock_labels_isUp, n_splits=n, model=rf)
    print("-----------------------------------")

max_depth= 2
RandomForestClassifier: 0.6056004889345219
-----------------------------------
max_depth= 4
RandomForestClassifier: 0.6038358617690565
-----------------------------------
max_depth= 8
RandomForestClassifier: 0.6048758860109652
-----------------------------------
max_depth= 16
RandomForestClassifier: 0.6121072485074783
-----------------------------------
max_depth= 25
RandomForestClassifier: 0.608231389602127
-----------------------------------
max_depth= 32
RandomForestClassifier: 0.6065819080502513
-----------------------------------


## Use Random Forest Classifcation to try to predict whether a stock will have a positive return in the next month

In [715]:
basic_random_forest_classifier(stock_features, stock_labels_isUp)

Random Forest model
Accuracy: 0.5200974421437271
              precision    recall  f1-score   support

         0.0       0.52      0.54      0.53       814
         1.0       0.53      0.50      0.51       828

   micro avg       0.52      0.52      0.52      1642
   macro avg       0.52      0.52      0.52      1642
weighted avg       0.52      0.52      0.52      1642

[[442 372]
 [416 412]]
Random Forest model
Accuracy: 0.4823386114494519
              precision    recall  f1-score   support

         0.0       0.37      0.54      0.44       618
         1.0       0.62      0.45      0.52      1024

   micro avg       0.48      0.48      0.48      1642
   macro avg       0.49      0.49      0.48      1642
weighted avg       0.52      0.48      0.49      1642

[[331 287]
 [563 461]]
Random Forest model
Accuracy: 0.5548112058465287
              precision    recall  f1-score   support

         0.0       0.40      0.45      0.42       593
         1.0       0.66      0.61      0.64 

In [716]:
import warnings
warnings.filterwarnings('ignore')
rfc_score = tss_cross_validation(stock_features, stock_labels_isUp)
print(rfc_score)
print(rfc_score.mean())
#for model in models:
#    tss_cross_validation(stock_features, stock_labels_top100, model=model)

[0.58121019 0.49715909 0.57751938 0.59960552 0.73514431 0.6218638
 0.64840183 0.60631229 0.56862745 0.5945122  0.62820513 0.56116208
 0.5799373  0.681458   0.72713864 0.54561404 0.64171123 0.59587956
 0.55197657 0.70017331]
0.6121805962313976
