# Create and train ML models

In [3]:
from sklearn import base
import pandas as pd
import numpy as np
import dill
from datetime import datetime,timedelta
from sklearn.linear_model import LinearRegression, SGDClassifier, Ridge, SGDRegressor
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import matplotlib.pyplot as plt
from pandas.plotting import lag_plot
#from pandas import datetime
from statsmodels.tsa.arima_model import ARIMA
import os
from ediblepickle import checkpoint
import updates
from collections import Counter, defaultdict

## 1. Create news file with sentiment index

We only have sentiment labels for a portion of the news, so we use those labels to create train data, so the rest of the news is labeled using the trained model.

Only done once, do not re-run

In [4]:
sentiment_dict= dill.load(open('data/sentiment_dict.pkd', 'rb'))
tickers = dill.load(open('data/tickers.pkd', 'rb'))

today=datetime(datetime.now().year,datetime.now().month,datetime.now().day)
last_news_update=dill.load(open('data/last_news_update.pkd', 'rb'))
if last_news_update!=today:
    !rm data/BenzNewscache/*
    updates.update_all_news()
    last_news_update=today
    dill.dump(last_news_update, open('data/last_news_update.pkd', 'wb'))
    
all_ticker_news = dill.load(open('data/all_ticker_news.pkd', 'rb'))

In [19]:
dates=[]
sentiments=[]
ticks=[]
for date,ticker_dict in sentiment_dict.items():
    for ticker,value in ticker_dict.items():
        if value>0:
            dates.append(date)
            sentiments.append(value)
            ticks.append(ticker)
df_sentiment=pd.DataFrame()
df_sentiment['Ticker']=ticks
df_sentiment['Date']=dates
df_sentiment['Sentiment']=sentiments
df_sentiment=df_sentiment.set_index(['Ticker','Date']).sort_index()
df_sentiment

Unnamed: 0_level_0,Unnamed: 1_level_0,Sentiment
Ticker,Date,Unnamed: 2_level_1
A,2020-10-21,71.68
A,2020-10-23,71.68
A,2020-10-26,71.68
A,2020-10-27,71.68
A,2020-10-28,71.73
...,...,...
ZTS,2020-11-05,77.24
ZTS,2020-11-06,77.36
ZTS,2020-11-09,80.70
ZTS,2020-11-10,80.70


In [20]:
df_sentiment.loc['MSFT']

Unnamed: 0_level_0,Sentiment
Date,Unnamed: 1_level_1
2020-10-21,78.35
2020-10-23,77.48
2020-10-26,77.76
2020-10-27,79.89
2020-10-28,77.71
2020-10-29,78.03
2020-10-30,78.75
2020-11-03,78.89
2020-11-04,79.83
2020-11-05,80.87


In [21]:
def read_word_file(word_type):
    l=[]
    filename=word_type+'_words.txt'
    with open(filename,'r') as wf:
        for line in wf:
            if line[0] != ';' and line[0] != '\n':
                l.append(line.strip())
    return l

l_pos_words=read_word_file('positive')
l_neg_words=read_word_file('negative')

def pos_sentiment(txt):
    word_list=[x.lower() for x in txt.split()]
    return len([x for x in word_list if x in l_pos_words])

def neg_sentiment(txt):
    word_list=[x.lower() for x in txt.split()]
    return -len([x for x in word_list if x in l_neg_words])

In [22]:
def prep_traindata():
    train_data=pd.DataFrame(columns=['Ticker','Date','NewsText','PosSentiment'
                                   ,'NegSentiment','SentimentIndex','Sentiment','SentimentClass'])
    for ticker,date in df_sentiment.index:
        data=pd.DataFrame()
        news=[]
        dates=[]
        sentiments=[]
        if all_ticker_news[ticker]==None:
            continue
        elif all_ticker_news[ticker].get(date,0)==0:
            continue
        for item in all_ticker_news[ticker][date]:
            news.append(item)
            dates.append(date)
            sentiments.append(df_sentiment.loc[ticker].loc[date][0])
        data['Ticker']=[ticker]*len(news)
        data['Date']=dates
        data['NewsText']=news   
        data['PosSentiment']=data['NewsText'].apply(pos_sentiment)
        data['NegSentiment']=data['NewsText'].apply(neg_sentiment)
        upper_bound=max(data['PosSentiment'].max(),data['NegSentiment'].max())
        data['SentimentIndex']=50*(data['PosSentiment']-data['NegSentiment'])/upper_bound+50
        data['SentimentIndex']=data['SentimentIndex'].fillna(50)
        data['Sentiment']=sentiments
        data['SentimentClass']=['Positive' if x>70 else 'Negative' if x<30 else 'Neutral' for x in sentiments]
        train_data=train_data.append(data)
    train_data=train_data.set_index(['Ticker','Date']).sort_index()
    return train_data
all_data=prep_traindata()

In [23]:
all_data.loc['MRK']

Unnamed: 0_level_0,NewsText,PosSentiment,NegSentiment,SentimentIndex,Sentiment,SentimentClass
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10-26,"FAANGs In Focus: Amazon, Facebook, Apple, Alph...",0,0,50.0,74.28,Positive
2020-10-27,The Daily Biotech Pulse: Catabasis Halts Duche...,0,0,50.0,73.42,Positive
2020-10-27,Recap: Merck & Co Q3 Earnings,0,0,50.0,73.42,Positive
2020-10-27,A Peek Into The Markets: US Stock Futures Edge...,0,0,50.0,73.42,Positive
2020-10-27,"7 Stocks To Watch For October 27, 2020",0,0,50.0,73.42,Positive
2020-10-27,"Earnings Scheduled For October 27, 2020",0,0,50.0,73.42,Positive
2020-10-29,The Daily Biotech Pulse: Novartis Acquires Gen...,0,0,50.0,74.23,Positive
2020-10-30,Attention Biotech Investors: Mark Your Calenda...,0,0,50.0,74.17,Positive
2020-11-03,Attention Biotech Investors: Mark Your Calenda...,0,0,50.0,74.17,Positive
2020-11-04,Rich Saperstein And Joe Terranova Share Their ...,1,0,100.0,74.6,Positive


In [24]:
all_data[['NewsText','PosSentiment','NegSentiment']]

Unnamed: 0_level_0,Unnamed: 1_level_0,NewsText,PosSentiment,NegSentiment
Ticker,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,2020-11-04,Stocks That Hit 52-Week Highs On Wednesday,0,0
A,2020-11-05,Stocks That Hit 52-Week Highs On Thursday,0,0
A,2020-11-06,Stocks That Hit 52-Week Highs On Friday,0,0
A,2020-11-09,Stocks That Hit 52-Week Highs On Monday,0,0
AAP,2020-11-10,Recap: Advance Auto Parts Q3 Earnings,0,0
...,...,...,...,...
ZION,2020-11-09,74 Stocks Moving In Monday's Mid-Day Session,0,0
ZION,2020-11-10,100 Biggest Movers From Yesterday,0,0
ZTS,2020-11-04,Stocks That Hit 52-Week Highs On Wednesday,0,0
ZTS,2020-11-05,Stocks That Hit 52-Week Highs On Thursday,0,0


In [26]:
class AddTfIdfVect(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.tfidf=TfidfVectorizer(stop_words=STOP_WORDS.union({'ll', 've'}))
    
    def fit(self, X, y=None):
        self.tfidf.fit(X['NewsText'])
        return self
    
    def transform(self, X):
        X_transformed=pd.DataFrame(self.tfidf.transform(X['NewsText']).todense())
        X_transformed['PosSentiment']=X['PosSentiment'].values
        X_transformed['NegSentiment']=X['NegSentiment'].values
        return X_transformed
    
X=all_data[['NewsText','PosSentiment','NegSentiment']]
y=all_data['SentimentClass'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
classifiers={'KNN':KNeighborsClassifier(n_neighbors=7),
            'SGD':SGDClassifier(max_iter=1000),
            'RFC':RandomForestClassifier(n_estimators=100)}
for name,classifier in classifiers.items():
    pipe = Pipeline([('vectorizer', AddTfIdfVect()),
                    ('classifier', classifier)])
    pipe.fit(X_train,y_train)
    print(name,pipe.score(X_test,y_test) )

KNN 0.6208425720620843
SGD 0.6430155210643016
RFC 0.6252771618625277


In [27]:
sgdpipe1 = Pipeline([('vectorizer', AddTfIdfVect()),
                ('classifier', SGDClassifier(max_iter=1000))])

sgdpipe1.fit(all_data[['NewsText','PosSentiment','NegSentiment']]
             ,all_data['SentimentClass'].values)

Pipeline(memory=None,
         steps=[('vectorizer', AddTfIdfVect()),
                ('classifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=None,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False)

In [38]:
vectorizer=AddTfIdfVect()
vectorizer.fit_transform(all_data[['NewsText','PosSentiment','NegSentiment']])
news_classifier=SGDClassifier(max_iter=1000)
news_classifier.fit(vectorizer.fit_transform(all_data[['NewsText','PosSentiment','NegSentiment']])
                    ,all_data['SentimentClass'].values)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [39]:
dill.dump(news_classifier, open('predict_data/news_classifier.pkd', 'wb'))

In [35]:
def prep_old_news():
    old_news=pd.DataFrame(columns=['Ticker','Date','NewsText','PosSentiment','NegSentiment'])
    for ticker,newsdict in all_ticker_news.items():
        data=pd.DataFrame()
        news=[]
        dates=[]
        if newsdict==None:
            continue
        for date,newslist in newsdict.items():
            if date<datetime.datetime(2020,10,21):
                for item in newslist:
                    news.append(item)
                    dates.append(date)
        data['Ticker']=[ticker]*len(news)
        data['Date']=dates
        data['NewsText']=news
        data['PosSentiment']=data['NewsText'].apply(pos_sentiment)
        data['NegSentiment']=data['NewsText'].apply(neg_sentiment)
        old_news=old_news.append(data)
    return old_news

In [36]:
news=prep_old_news()
news['SentimentClass']=sgdpipe1.predict(news[['NewsText','PosSentiment','NegSentiment']])

In [39]:
news=news.set_index(['Ticker','Date']).sort_index()
news=news.append(all_data[['NewsText','PosSentiment','NegSentiment','SentimentClass']])
news['SentimentIndex']=news['SentimentClass'].apply(lambda x: 1 if x=='Positive' else -1 if x=='Negative' else 0)

In [43]:
dill.dump(news, open('data/news.pkd', 'wb'))

In [45]:
len(news)

54361

In [46]:
news= dill.load(open('data/news.pkd', 'rb'))
for ticker in all_ticker_news.keys():
    dates=[]
    newsl=[]
    possents=[]
    negsents=[]  
    if all_ticker_news[ticker]:
        for date,news_list in all_ticker_news[ticker].items():
            if date not in news.loc[ticker].index:
                for item in news_list:
                    dates.append(date)
                    newsl.append(item)
                    possents.append(pos_sentiment(item))
                    negsents.append(neg_sentiment(item))
    if dates:
        df=pd.DataFrame([ticker]*len(dates),columns=['Ticker'])
        df['Date']=dates
        df['NewsText']=newsl
        df['PosSentiment']=df['NewsText'].apply(pos_sentiment)
        df['NegSentiment']=df['NewsText'].apply(neg_sentiment)
        df['SentimentClass']=sgdpipe1.predict(df[['NewsText','PosSentiment','NegSentiment']])
        df['SentimentIndex']=df['SentimentClass'].apply(lambda x: 1 if x=='Positive' else -1 if x=='Negative' else 0)
        df=df.set_index(['Ticker','Date'])
        news=news.append(df[['NewsText','PosSentiment','NegSentiment','SentimentClass','SentimentIndex']])
        print(ticker,len(news))

SPY 54419
ABT 54421
ABMD 54422
ATVI 54423
AAP 54425
AFL 54426
A 54427
APD 54436
ALK 54442
ALB 54444
ALXN 54445
ALGN 54452
ALLE 54454
LNT 54455
GOOGL 54531
GOOG 54555
AMZN 54594
AMCR 54596
AEP 54599
AXP 54602
AIG 54603
AMP 54604
ABC 54605
AME 54606
AMGN 54611
ANTM 54613
AON 54614
AAPL 54653
AMAT 54659
APTV 54660
ANET 54666
AIZ 54667
T 54678
ATO 54679
BKR 54680
BAC 54684
BK 54686
BAX 54687
BDX 54689
BRK.B 54690
BBY 54692
BIO 54696
BIIB 54700
BLK 54703
BA 54715
BKNG 54717
BXP 54718
BSX 54721
BMY 54729
CDNS 54730
COF 54731
CAH 54732
CCL 54735
CARR 54742
CTLT 54753
CAT 54754
CBOE 54756
CNC 54757
CERN 54758
SCHW 54759
CHTR 54760
CVX 54764
CMG 54772
CB 54773
CI 54776
CSCO 54789
C 54792
CTXS 54794
CLX 54799
KO 54811
CTSH 54812
CL 54815
CMCSA 54820
CPRT 54821
CTVA 54826
COST 54829
CCI 54831
CSX 54841
CMI 54843
DHI 54844
DHR 54848
DRI 54849
DVA 54852
DE 54853
DAL 54857
DXCM 54858
DFS 54861
DISCK 54862
DPZ 54863
DOW 54866
EMN 54867
EBAY 54870
EW 54874
EL 54885
EXPE 54888
EXR 54889
XOM 54893
FFIV 

In [47]:
news=news.sort_index()
news.loc['SPY']

Unnamed: 0_level_0,NewsText,PosSentiment,NegSentiment,SentimentClass,SentimentIndex
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-02-02,The GOP Surveillance Memo: What You Need To Know,0,0,Positive,1
2018-02-05,The Market In 5 Minutes: Stocks And Cryptos Ma...,0,0,Positive,1
2018-02-06,2 Pros Break Down The Market Meltdown,1,-2,Positive,1
2018-02-07,"The Market In 5 Minutes: Snap Beats, Wynn Resi...",0,-1,Positive,1
2018-02-08,Another Rough Day For The Markets,0,-1,Positive,1
...,...,...,...,...,...
2020-11-19,Robinhood Co-Founder Vladimir Tenev Talks Youn...,0,0,Positive,1
2020-11-19,This Is How Much Prediction Markets Made On Pr...,0,0,Positive,1
2020-11-19,Investor Optimism Dented Globally On Rising CO...,1,-1,Positive,1
2020-11-20,"The Week In Cannabis: Stocks Outperform S&P, M...",1,0,Positive,1


In [48]:
dill.dump(news, open('data/news.pkd', 'wb'))

## 2. Select stocks that have enough news on days before unemployment announcements
At least 50% of the time

In [45]:
df_UR_Releases = dill.load(open('data/df_UR_Releases.pkd', 'rb'))
tickers = dill.load(open('data/tickers.pkd', 'rb'))

news= dill.load(open('data/news.pkd', 'rb'))

start_date=datetime(2018,3,8) #the day before the first release to consider

pre_rel_dates=[x-timedelta(days=1) for x in df_UR_Releases['Release Date'] if x>=start_date]
pre_rel_dates=[x-timedelta(days=1) if x==datetime(2019,7,4) else x for x in pre_rel_dates]

def validate_ticker(ticker):
    invalid_count=0
    valid_count=0
    try:
        df_ticker_news=news.groupby(['Ticker','Date']).agg({'SentimentIndex':'sum'}).loc[ticker]
        for date in pre_rel_dates:
            if date in df_ticker_news.index:
                valid_count+=1
            else:
                invalid_count+=1
        if invalid_count>valid_count:
            #print ("not enough news for "+ticker)
            return False
    except KeyError:
        #print ("no news for "+ticker)
        return False
    return True

valid_tickers=[]
for ticker in tickers:
    if validate_ticker(ticker):
        valid_tickers.append(ticker)
dill.dump(valid_tickers, open('data/valid_tickers.pkd', 'wb'))

## 3. Train ARIMA models for selected stocks

In [7]:
d_all_EOD = dill.load(open('data/d_all_EOD.pkd', 'rb'))
valid_tickers = dill.load(open('data/valid_tickers.pkd', 'rb'))
start_date=datetime(2018,3,8)

def train_arima(ticker):
     
    df=d_all_EOD[ticker][['date','adjClose']]
    df=df.set_index('date')

    train_data=df.loc[:start_date].values
    test_data=df.loc[start_date:].values
    train_data=[x for x in train_data]
    predictions=[]
    for i in range(len(test_data)):
        model=ARIMA(train_data,order=(4,1,0))
        output=model.fit(disp=0).forecast()
        predictions.append(output[0])
        train_data.append(test_data[i])
    print(ticker, mean_squared_error(test_data, predictions))
    return model,[x[0] for x in predictions]

arima_models={}
for ticker in valid_tickers:
    arima_models[ticker]=train_arima(ticker)
dill.dump(arima_models, open('data/arima_models.pkd', 'wb'))



SPY 16.06914218954535
AMZN 2127.3218644956855




AAPL 2.7900025127680186




FB 23.207296996444736
GM 0.5988619306148006
MSFT 9.567525400998367
TWTR 1.5504917904703395


In [6]:
valid_tickers

['SPY', 'AMZN', 'AAPL', 'FB', 'GM', 'MSFT', 'TWTR']

## 4. Create machine learning datasets

In [11]:
df_UR_Releases = dill.load(open('data/df_UR_Releases.pkd', 'rb'))
news = dill.load(open('data/news.pkd', 'rb'))

rates=df_UR_Releases[df_UR_Releases['Release Date']>=start_date]['Announced Value'].values
changes=df_UR_Releases[df_UR_Releases['Release Date']>=start_date]['Announced Percent Change'].values

pre_rel_dates=[x-timedelta(days=1) for x in df_UR_Releases['Release Date'] if x>=start_date]
pre_rel_dates=[x-timedelta(days=1) if x==datetime(2019,7,4) else x for x in pre_rel_dates]

def create_dataframe(ticker, arima_predictions):
    
    df=d_all_EOD[ticker][['date','adjClose']]
    df=df.set_index('date')
    df['nextClose']=df['adjClose'].shift(-1)
    
    df1=df.loc[start_date:]
    df1['predicted']=arima_predictions
    
    pre_rel_data=[row for row in df1.to_records() if row[0] in pre_rel_dates]

    df2=pd.DataFrame()
    df2['date']=[x[0] for x in pre_rel_data]
    df2['rate']=rates
    df2['change']=changes
    df2['close']=[x[1] for x in pre_rel_data]
    
    sentiment=[]
    df_news_sentiment=news.groupby(['Ticker','Date']).agg({'SentimentIndex':'sum'}).loc[ticker]
    for date in pre_rel_dates:
        if date in df_news_sentiment.index:
            sentiment.append(df_news_sentiment.loc[date][0])
        else:
            sentiment.append(0)
    df2['sentiment']=sentiment
    
    df2['predicted']=[x[2] for x in pre_rel_data]
    df2['nextClose']=[x[3] for x in pre_rel_data ]
    df2['nextChange']=[1 if x[3]-x[1]>0 else -1 for x in pre_rel_data ]
    
    
    return df2.set_index('date')

valid_datasets={}
for ticker in valid_tickers:
    valid_datasets[ticker]=create_dataframe(ticker,arima_models[ticker][1])
dill.dump(valid_datasets, open('data/valid_datasets.pkd', 'wb'))    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [13]:
valid_datasets['SPY']

Unnamed: 0_level_0,rate,change,close,sentiment,predicted,nextClose,nextChange
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-03-08,4.1,0.0,260.141486,1,264.668574,260.147569,1
2018-04-05,4.1,0.0,253.120741,1,247.479743,250.824045,-1
2018-05-03,3.9,-4.88,250.24307,2,253.482833,250.881168,1
2018-05-31,3.8,-2.56,258.17096,2,260.705597,259.878082,1
2018-07-05,4.0,5.26,261.40837,5,263.619396,259.369652,-1
2018-08-02,3.9,-2.5,270.290761,2,271.448917,268.891432,-1
2018-09-06,3.9,0.0,275.81354,1,275.277534,276.803669,1
2018-10-04,3.7,-5.13,278.293574,0,276.73596,280.589327,1
2018-11-01,3.7,0.0,262.977043,1,261.41943,260.152856,-1
2018-12-06,3.7,0.0,259.448376,2,253.419836,260.108057,1


## 5. Train and predict

We use forward chaining cross validation and the last six months values are predicted values. After cross-validating, we select the best regressor for each stock.

In [23]:
valid_tickers = dill.load(open('data/valid_tickers.pkd', 'rb'))
valid_datasets = dill.load(open('data/valid_datasets.pkd', 'rb'))
def mape(ytrue,ypred):
    return abs(ytrue-ypred)/ytrue*100

def train_model(ticker,df):
    
    print(ticker)
    cut_date=datetime(2020,6,4)
    val_dates=df.loc[cut_date:].index
    
    regressors={'LR':LinearRegression(),'RF':RandomForestRegressor(),
                'KR':KNeighborsRegressor(n_neighbors=5),'RR':Ridge(alpha=10)}
    
    classifiers={'KNN':KNeighborsClassifier(n_neighbors=5),
            'SGD':SGDClassifier(max_iter=1000),
            'RFC':RandomForestClassifier(n_estimators=50)}
    
    best_regressors=[]
    best_classifiers=[]
    reg_predictions=defaultdict(list)
    clf_predictions=defaultdict(list)
    for date in val_dates:
        X_train=df.loc[:date][['rate','change','predicted','sentiment']]
        y_train_reg=df.loc[:date]['nextClose']
        y_train_clf=df.loc[:date]['nextChange']
        
        min_error=100000
        for name,regressor in regressors.items():
            reg = regressor
            trained_reg=reg.fit(X_train,y_train_reg)
            prediction=trained_reg.predict(np.array(df.loc[date][['rate','change','predicted','sentiment']]).reshape(1,-1))
            reg_predictions[name].append((date,prediction[0]
                                ,np.sign(prediction[0]-df.loc[date]['close'])==y_train_clf.loc[date]))
            score=reg.score(X_train,y_train_reg)
            print(name,score,df.loc[date]['close'],prediction[0],df.loc[date]['nextClose'],df.loc[date]['nextChange'])
            if np.sign(df.loc[date]['nextChange'])==np.sign(prediction[0]-df.loc[date]['close']):
                error=mape(df.loc[date]['nextClose'],prediction[0])
                if error<min_error:
                    min_error=error
                    best_regressor=name
        print(best_regressor,'error=',min_error,'%')   
        best_regressors.append(regressors[best_regressor])
        
        clf_scores=defaultdict(list)
        for name,classifier in classifiers.items():
            clf=classifier
            trained_clf=clf.fit(X_train,y_train_clf)
            prediction=trained_clf.predict(np.array(df.loc[date][['rate','change','predicted','sentiment']]).reshape(1,-1))
            clf_predictions[name].append(prediction[0])
            score=clf.score(X_train,y_train_clf)
            print(name,score,df.loc[date]['close'],prediction[0],df.loc[date]['nextClose'],df.loc[date]['nextChange'])
            if np.sign(df.loc[date]['nextChange'])==np.sign(prediction[0]):
                clf_scores[name].append(1)
            else:
                clf_scores[name].append(0)
                
    clf_scores=dict([(item[0],sum(item[1]) )for item in clf_scores.items()])   
    print(clf_scores)
    best_clf=max(clf_scores,key=clf_scores.get)#classifiers[max(clf_scores,key=clf_scores.get)]
    
    print('best classifier: ' ,best_clf,clf_scores[best_clf])
    
    best_reg_count=Counter(best_regressors)
    best_reg=max(best_reg_count,key=best_reg_count.get)
    
    best_acc=0
    print(reg_predictions)
    for reg in reg_predictions.keys():
        accuracy=sum([x[2] for x in reg_predictions[reg]])/len(reg_predictions[reg]) 
        if accuracy>best_acc:
            best_acc=accuracy
            best_acc_reg=reg
    print('best based on mape:',best_reg,'best based on accuracy:',best_acc_reg,best_acc)
    best_reg.fit(X_train,y_train_reg)
    mape_error=np.mean(mape(y_train_reg[cut_date:],[x[1] for x in reg_predictions[best_regressor]]))
    return best_reg,mape_error,reg_predictions[best_regressor]
    
predictions={}
for ticker in valid_tickers:
    predictions[ticker]=train_model(ticker,valid_datasets[ticker])
dill.dump(predictions, open('data/predictions/valid_predictions.pkd', 'wb'))  

SPY
defaultdict(<class 'list'>, {'LR': [(Timestamp('2020-06-04 00:00:00'), 309.1152432067128, True), (Timestamp('2020-07-01 00:00:00'), 305.5923611425665, True), (Timestamp('2020-08-06 00:00:00'), 328.932228484557, True), (Timestamp('2020-09-03 00:00:00'), 341.6200463005004, False), (Timestamp('2020-10-01 00:00:00'), 334.35937647502436, True), (Timestamp('2020-11-05 00:00:00'), 350.0829148849252, True)], 'RF': [(Timestamp('2020-06-04 00:00:00'), 309.6383871617386, True), (Timestamp('2020-07-01 00:00:00'), 306.74360734764184, True), (Timestamp('2020-08-06 00:00:00'), 325.11482921803315, True), (Timestamp('2020-09-03 00:00:00'), 343.29905044033666, False), (Timestamp('2020-10-01 00:00:00'), 331.7798456013591, True), (Timestamp('2020-11-05 00:00:00'), 342.99440980925783, True)], 'KR': [(Timestamp('2020-06-04 00:00:00'), 312.203620489414, True), (Timestamp('2020-07-01 00:00:00'), 307.90699873419214, True), (Timestamp('2020-08-06 00:00:00'), 318.8435893711752, True), (Timestamp('2020-09-03 

defaultdict(<class 'list'>, {'LR': [(Timestamp('2020-06-04 00:00:00'), 184.7659133759073, True), (Timestamp('2020-07-01 00:00:00'), 202.67234429786328, True), (Timestamp('2020-08-06 00:00:00'), 210.2937310444717, True), (Timestamp('2020-09-03 00:00:00'), 225.09661278522947, True), (Timestamp('2020-10-01 00:00:00'), 204.17005404205887, True), (Timestamp('2020-11-05 00:00:00'), 226.07338432106843, False)], 'RF': [(Timestamp('2020-06-04 00:00:00'), 180.9908252617555, False), (Timestamp('2020-07-01 00:00:00'), 193.9894614957231, True), (Timestamp('2020-08-06 00:00:00'), 204.86914429371856, True), (Timestamp('2020-09-03 00:00:00'), 219.11294871923633, True), (Timestamp('2020-10-01 00:00:00'), 209.2427032666044, True), (Timestamp('2020-11-05 00:00:00'), 213.47226602542818, True)], 'KR': [(Timestamp('2020-06-04 00:00:00'), 168.29444355575146, False), (Timestamp('2020-07-01 00:00:00'), 178.75564827859307, True), (Timestamp('2020-08-06 00:00:00'), 189.6680634921981, True), (Timestamp('2020-09-0

In [7]:
predictions

{'SPY': (RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                        max_depth=None, max_features='auto', max_leaf_nodes=None,
                        max_samples=None, min_impurity_decrease=0.0,
                        min_impurity_split=None, min_samples_leaf=1,
                        min_samples_split=2, min_weight_fraction_leaf=0.0,
                        n_estimators=100, n_jobs=None, oob_score=False,
                        random_state=None, verbose=0, warm_start=False),
  0.9030456479152223,
  [(Timestamp('2020-06-04 00:00:00'), 309.29851024866286),
   (Timestamp('2020-07-01 00:00:00'), 307.44861710358947),
   (Timestamp('2020-08-06 00:00:00'), 326.1911672209862),
   (Timestamp('2020-09-03 00:00:00'), 345.0863741604417),
   (Timestamp('2020-10-01 00:00:00'), 331.9784855110338),
   (Timestamp('2020-11-05 00:00:00'), 344.1379858897966)]),
 'AMZN': (LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False),
  4.997082903785

In [50]:
d_train_dfs={}
for ticker in tickers:
    tdf=create_train_df(ticker)
    if type(tdf)==pd.DataFrame:
        d_train_dfs[ticker]=create_train_df(ticker)

In [51]:
d_train_dfs['SPY']

Unnamed: 0,date,rate,change,close,predicted,sentiment
0,2018-02-02,4.1,0.0,261.422737,267.508507,1
1,2018-03-09,4.1,0.0,264.668574,260.147569,0
2,2018-04-06,4.1,0.0,247.479743,253.216681,6
3,2018-05-04,3.9,-4.88,253.482833,250.462649,4
4,2018-06-01,3.8,-2.56,260.705597,258.109977,1
5,2018-07-06,4.0,5.26,263.619396,261.45698,1
6,2018-08-03,3.9,-2.5,271.448917,270.385252,1
7,2018-09-07,3.9,0.0,275.277534,275.954526,1
8,2018-10-05,3.7,-5.13,276.73596,278.432623,2
9,2018-11-02,3.7,0.0,261.41943,262.971745,3


In [38]:
dill.dump(d_train_dfs, open('data/d_train_dfs.pkd', 'wb'))

In [59]:
d_trained_models={}
d_predictions={}
for ticker in d_train_dfs.keys():
    X=d_train_dfs[ticker][['rate','change','predicted','sentiment']]
    y=d_train_dfs[ticker]['close']
    regressors={'LR':LinearRegression(),'RF':RandomForestRegressor(),
                'KR':KNeighborsRegressor(n_neighbors=5),'RR':Ridge(alpha=10),
               'SGDR':SGDRegressor()}
    max_score=0
    for name,regressor in regressors.items():
        reg = regressor
        trained_model=reg.fit(X,y)
        score=reg.score(X,y)
        if score>max_score:
            max_score=score
            best_model=trained_model
    print(ticker,max_score)
    d_trained_models[ticker]=best_model
    predictions=pd.DataFrame(best_model.predict(X)
                                       ,d_train_dfs[ticker]['date'],columns=['prediction'])
    d_predictions[ticker]=predictions
    dill.dump(predictions, open('data/predictions/'+ticker+'.pkd', 'wb'))

SPY 0.995418572351244
GOOGL 0.9893671030271621
GOOG 0.9824830291594422
AMZN 0.9957285887579254
AAPL 0.9971915867831391
BA 0.9951737619883345
DAL 0.9950362121375359
XOM 0.9936167618267246
FB 0.993936899308937
FDX 0.9927569182464169
F 0.9880338702695479
GM 0.9857433476365604
MRK 0.9966533282113345
MSFT 0.9980369610447868
NFLX 0.9944497893987981
UPS 0.996392740132636
WMT 0.9970021392388302
DIS 0.9974164579254955


In [57]:
dill.dump(d_predictions, open('data/d_predictions.pkd', 'wb'))