In [1]:
import tweepy
import nltk
import json
import os
import pandas as pd
from pmaw import PushshiftAPI
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from twitter_auth import consumer_key, consumer_secret, access_token, access_token_secret
from tweepy.streaming import StreamListener
import yfinance
import numpy as np
from sklearn.metrics import accuracy_score


# Render every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

# Build the Twitter client from the credentials imported from twitter_auth.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Load the comment table (one row per comment, with a 'Sentiment' score)
# and preview the first rows.
historic_sentiment = pd.read_csv('wallstreetbets_big_df.csv')
historic_sentiment.head(5)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\desha\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Comment,Timestamp,Link,Time,Year,Month,Day,Hour,Sentiment
0,0,Enjoy you salty bastard :),1599700428,/r/wallstreetbets/comments/ipnztr/what_are_you...,2020-09-09 21:13:48,2020,9,9,21,0.4019
1,1,"""A red sun rises, blood has been spilled this ...",1599700430,/r/wallstreetbets/comments/ipniz0/hey_if_its_f...,2020-09-09 21:13:50,2020,9,9,21,-0.1761
2,2,this information was not useful to me,1599700430,/r/wallstreetbets/comments/ipnztr/what_are_you...,2020-09-09 21:13:50,2020,9,9,21,-0.3412
3,3,3am “they” are going to buy the shit out of an...,1599700436,/r/wallstreetbets/comments/ipnztr/what_are_you...,2020-09-09 21:13:56,2020,9,9,21,0.6124
4,4,"Nah, you didn’t know. Now you know 😁",1599700437,/r/wallstreetbets/comments/ipnztr/what_are_you...,2020-09-09 21:13:57,2020,9,9,21,-0.1027


In [258]:
def _build_candidate_models():
    """Return fresh, unfitted ``(label, estimator)`` pairs in tie-break order."""
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.naive_bayes import GaussianNB

    def grid(estimator, params):
        return GridSearchCV(estimator, params, cv=5, scoring='accuracy')

    return [
        ('Logistic Regression',
         grid(LogisticRegression(), [{'solver': ['newton-cg', 'lbfgs', 'liblinear']}])),
        # Seeded so that re-running the notebook reproduces the same model.
        ('Random Forest',
         grid(RandomForestClassifier(random_state=42), [{'max_depth': list(range(10, 20))}])),
        ('Naive Bayes', GaussianNB()),
        # NOTE(review): 'mse' was renamed 'squared_error' in newer scikit-learn
        # releases -- confirm against the installed version.
        ('Gradient Boosting',
         grid(GradientBoostingClassifier(random_state=42),
              [{'learning_rate': [0.0001, 0.001, 0.01, 0.1],
                'n_estimators': [50, 100, 200],
                'criterion': ['friedman_mse', 'mse']}])),
    ]


def _select_best_model(X_train, X_test, y_train, y_test):
    """Fit every candidate and return ``(accuracy, label, fitted_model)`` of the best.

    Ties are resolved in favour of the candidate listed first.
    """
    best = None
    for label, model in _build_candidate_models():
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        if best is None or accuracy > best[0]:
            best = (accuracy, label, model)
    return best


def get_best_models(user_stock, return_price_frame=False):
    """Train classifiers that map daily comment sentiment to daily price moves.

    The mean sentiment of all comments in ``historic_sentiment`` that mention
    ``user_stock`` is computed per calendar day and paired with that day's
    percent change in closing price (from yfinance).  Two targets are learned
    from the single sentiment feature:

    * severity  -- quartile bucket of the percent change
      ('Very Low', 'Low', 'High', 'Very High');
    * direction -- 'Positive' if the percent change is > 0, else 'Negative'.

    For each target, logistic regression, random forest, naive Bayes and
    gradient boosting are fitted and the one with the highest hold-out
    accuracy (20% test split, random_state=42) is kept.  The winners and their
    accuracies are printed.

    Parameters
    ----------
    user_stock : str
        Ticker symbol; also the text searched for in the comments.
    return_price_frame : bool, default False
        When True, additionally return the per-day price frame (indexed by
        'YYYY-MM-DD', with 'Close', 'Pct Change', 'All Classes' and 'Positive'
        columns) restricted to the days used for training.

    Returns
    -------
    tuple
        ``(severity_model, direction_model)``, or
        ``(severity_model, direction_model, price_frame)`` when
        ``return_price_frame`` is True.

    Raises
    ------
    ValueError
        If no comment mentions the stock or no price history is returned.
    """
    from sklearn.model_selection import train_test_split

    time_keys = ['Year', 'Month', 'Day']

    # --- Daily mean sentiment of comments mentioning the stock -------------
    # NOTE(review): this is a plain substring match, so short tickers also hit
    # ordinary words (e.g. 'ba' inside 'bastard') -- confirm this is intended.
    mentions_stock = historic_sentiment['Comment'].str.lower().str.contains(
        user_stock.lower(), na=False, regex=False)
    mentions = historic_sentiment.loc[mentions_stock]
    if mentions.empty:
        raise ValueError(f'No comments mention {user_stock!r}')

    # Select the column explicitly; the mean is taken per (Year, Month, Day).
    daily_sentiment = mentions.groupby(time_keys)['Sentiment'].mean()

    sentiment_df = daily_sentiment.reset_index()
    sentiment_df['Date'] = pd.to_datetime(
        sentiment_df[time_keys].rename(columns=str.lower)).dt.strftime('%Y-%m-%d')
    sentiment_df = sentiment_df.set_index('Date')

    # --- Daily closing prices over the same date range ----------------------
    first_day, last_day = daily_sentiment.index[0], daily_sentiment.index[-1]
    start_time = '{:04d}-{:02d}-{:02d}'.format(*(int(part) for part in first_day))
    end_time = '{:04d}-{:02d}-{:02d}'.format(*(int(part) for part in last_day))

    close = yfinance.Ticker(user_stock).history(
        start=start_time, end=end_time, interval='1d')['Close']
    if close.empty:
        raise ValueError(
            f'No price history for {user_stock!r} between {start_time} and {end_time}')

    close_df = close.to_frame('Close').reset_index()
    close_df['Date'] = close_df['Date'].dt.strftime('%Y-%m-%d')
    price_df = close_df.groupby('Date')[['Close']].mean()
    price_df['Pct Change'] = price_df['Close'].pct_change()
    price_df = price_df.dropna()  # the first day has no previous close

    # --- Targets ---------------------------------------------------------------
    pct = price_df['Pct Change']
    q25, q50, q75 = pct.quantile([0.25, 0.5, 0.75])
    # Conditions are evaluated in order, so a value on a quartile boundary
    # falls into the higher bucket.
    price_df['All Classes'] = np.select(
        [pct > q75, pct >= q50, pct >= q25],
        ['Very High', 'High', 'Low'],
        default='Very Low',
    )
    price_df['Positive'] = np.where(pct > 0, 'Positive', 'Negative')

    # --- Align features and targets on the shared trading days ---------------
    dataset = sentiment_df[['Sentiment']].dropna().join(
        price_df[['All Classes', 'Positive']], how='inner')

    X = dataset['Sentiment'].to_numpy().reshape(-1, 1)
    X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
        X, dataset['All Classes'], dataset['Positive'], test_size=0.2, random_state=42)

    # --- Model selection --------------------------------------------------------
    best_acc1, best_name1, best_model1 = _select_best_model(
        X_train, X_test, y1_train, y1_test)
    best_acc2, best_name2, best_model2 = _select_best_model(
        X_train, X_test, y2_train, y2_test)

    print('The best severity class model is:')
    print(best_name1)
    print()
    print('The accuracy of the severity model is:')
    print(best_acc1)
    print()
    print('The best direction class model is:')
    print(best_name2)
    print()
    print('The accuracy of the direction model is:')
    print(best_acc2)

    if return_price_frame:
        return best_model1, best_model2, price_df.loc[dataset.index]
    return best_model1, best_model2
    

In [281]:
# Ordered trading rules for every (direction, severity) prediction pair.
# Each rule is (band, action, strongest_conviction); the first rule whose band
# contains the current price change wins.  Bands are defined in _trade_signal.
_SIGNAL_RULES = {
    ('Positive', 'Very High'): (('upper', 'buy', False), ('lower', 'buy', False), ('bottom', 'buy', True)),
    ('Positive', 'High'): (('top', 'sell', False), ('lower', 'buy', False), ('bottom', 'buy', False)),
    ('Positive', 'Low'): (('top', 'sell', False), ('upper', 'sell', False), ('bottom', 'buy', False)),
    ('Positive', 'Very Low'): (('top', 'sell', False), ('upper', 'sell', False), ('bottom', 'buy', False)),
    ('Negative', 'Very High'): (('top', 'sell', False), ('lower', 'buy', False), ('bottom', 'buy', False)),
    ('Negative', 'High'): (('top', 'sell', False), ('lower', 'buy', False), ('bottom', 'buy', False)),
    ('Negative', 'Low'): (('top', 'sell', False), ('upper', 'sell', False), ('bottom', 'buy', False)),
    ('Negative', 'Very Low'): (('top', 'sell', True), ('upper', 'sell', False), ('lower', 'sell', False)),
}


def _trade_signal(direction, severity, change, q25, q50, q75):
    """Return ``(action, strongest_conviction)`` for one observation.

    ``action`` is 'buy', 'sell' or None (ignore).
    """
    in_band = {
        'top': change >= q75,
        'upper': q50 <= change <= q75,
        'lower': q25 <= change <= q50,
        'bottom': change <= q25,
    }
    for band, action, strong in _SIGNAL_RULES.get((direction, severity), ()):
        if in_band[band]:
            return action, strong
    return None, False


def _resolve_quartiles(reference_pct_change):
    """Return the 25%, 50% and 75% quantiles of the reference percent changes."""
    if reference_pct_change is None:
        # Backward-compatible fallback: use the notebook-level frame if present.
        try:
            reference_pct_change = close_df_user_time_period
        except NameError as exc:
            raise NameError(
                "back_test needs historical percent changes: pass "
                "reference_pct_change or define a global "
                "'close_df_user_time_period' with a 'Pct Change' column"
            ) from exc
    if isinstance(reference_pct_change, pd.DataFrame):
        reference_pct_change = reference_pct_change['Pct Change']
    quartiles = pd.Series(reference_pct_change).quantile([0.25, 0.5, 0.75])
    return quartiles.iloc[0], quartiles.iloc[1], quartiles.iloc[2]


def _liquidate(total, buys, sells, last_price):
    """Close the net open position at ``last_price`` and return the final cash total."""
    net_position = buys - sells
    if net_position == 0:
        return total
    # Surplus buys are sold (cash in); surplus sells (shorts) are bought back.
    return total + last_price * net_position


def back_test(user_stock, backtest_start, backtest_end, bestmodel1, bestmodel2, strategy,
              reference_pct_change=None):
    """Replay one period of tweets and price data through the trading rules.

    Tweets mentioning ``$user_stock`` are scored with VADER, averaged per
    minute and turned into an expanding (cumulative) mean sentiment.  That
    sentiment is fed to the two classifiers; together with the price change
    since the first observed minute, the predictions select a buy, sell or
    ignore action for each minute.  One share is traded per signal and any net
    position left at the end is closed at the last observed price.

    Parameters
    ----------
    user_stock : str
        Ticker symbol.
    backtest_start, backtest_end : str
        Date bounds passed to both the tweet search and the 1-minute price query.
    bestmodel1 : fitted classifier
        Severity model; predicts 'Very Low' / 'Low' / 'High' / 'Very High'.
    bestmodel2 : fitted classifier
        Direction model; predicts 'Positive' / 'Negative'.
    strategy : str
        'All Suggestions' returns the result of acting on every signal;
        'Strongest Convictions' returns the result of acting only on the
        highest-conviction signals.  Any other value returns None.
    reference_pct_change : Series, array-like or DataFrame, optional
        Historical percent changes whose quartiles define the price bands
        (a DataFrame must have a 'Pct Change' column).  When omitted, the
        global ``close_df_user_time_period`` is used.

    Returns
    -------
    number or None
        Cash total after end-of-period liquidation for the chosen strategy.
    """
    # Resolve the thresholds once, up front, so a missing reference fails
    # before any network request is made.
    q25, q50, q75 = _resolve_quartiles(reference_pct_change)

    # Search for tweets of the user's stock in the period (limited to 2000).
    # Uses `api.search`, the method name the original cell called.
    search = tweepy.Cursor(api.search, q=f'${user_stock}', lang='en',
                           since=backtest_start, until=backtest_end)
    dates = []
    comments = []
    for tweet in search.items(2000):
        dates.append(tweet.created_at)
        comments.append(tweet.text)

    # Tweet timestamps are UTC; truncate to the minute and express them in
    # US/Eastern so they can be matched against the 1-minute price bars.
    minute_index = pd.DatetimeIndex(pd.to_datetime(dates, utc=True)).floor('min')
    minute_index = minute_index.tz_convert('US/Eastern')
    tweet_df = pd.DataFrame({'Comments': comments}, index=minute_index).sort_index()
    tweet_df.index.name = 'Dates'

    # Use the same period as the tweet search.
    back_test_close = yfinance.Ticker(user_stock).history(
        start=backtest_start, end=backtest_end, interval='1m')['Close']

    # Keep only the minutes present in both sources.
    back_test_close = back_test_close.loc[back_test_close.index.isin(tweet_df.index)].dropna()
    tweet_df = tweet_df.loc[tweet_df.index.isin(back_test_close.index)].dropna()
    tweet_df = tweet_df.assign(Close=back_test_close)

    # Sentiment of each tweet.
    sid = SentimentIntensityAnalyzer()
    tweet_df['Sentiment'] = tweet_df['Comments'].apply(
        lambda text: sid.polarity_scores(text)['compound'])

    # The first row has no previous price, so dropna() discards it.
    tweet_df['Pct Change'] = tweet_df['Close'].pct_change()
    tweet_df = tweet_df.dropna()

    # One row per minute; average only the numeric columns that are used.
    sentiment_df = tweet_df[['Close', 'Sentiment']].groupby(level=0).mean()

    # Mean of every sentiment observed up to each minute.
    rolling_avg_sentiment = sentiment_df['Sentiment'].expanding().mean()

    closes = sentiment_df['Close'].to_numpy()
    n_obs = len(closes)
    if n_obs:
        features = rolling_avg_sentiment.to_numpy().reshape(-1, 1)
        severity_pred = bestmodel1.predict(features)
        direction_pred = bestmodel2.predict(features)
        # Change of each price relative to the first observed price.
        open_change = (closes - closes[0]) / closes[0]
        last_close = closes[-1]
    else:
        severity_pred = direction_pred = open_change = ()
        last_close = 0.0

    all_total = 0
    buy_low_sell_high_total = 0
    all_buys = all_sells = 0
    buy_low_sell_high_buys = buy_low_sell_high_sells = 0

    for price, change, direction, severity in zip(closes, open_change, direction_pred, severity_pred):
        action, strongest = _trade_signal(direction, severity, change, q25, q50, q75)
        if action == 'buy':
            # Buying: subtract the current price from the running total.
            all_total = all_total - price
            all_buys += 1
            if strongest:
                buy_low_sell_high_total = buy_low_sell_high_total - price
                buy_low_sell_high_buys += 1
        elif action == 'sell':
            # Selling: add the current price to the running total.
            all_total = all_total + price
            all_sells += 1
            if strongest:
                buy_low_sell_high_total = buy_low_sell_high_total + price
                buy_low_sell_high_sells += 1

    # Balance any surplus of buys or sells at the last observed price.
    all_buys_value = _liquidate(all_total, all_buys, all_sells, last_close)
    buy_low_sell_high_value = _liquidate(
        buy_low_sell_high_total, buy_low_sell_high_buys, buy_low_sell_high_sells, last_close)

    print('Number of buy suggestions:', all_buys)
    print('Number of sell suggestions:', all_sells)
    print('Number of ignore suggestions:', n_obs - (all_buys + all_sells))
    print('Value after end-of-day liquidation:')
    # These values represent the money earned by following the suggestions.
    if strategy == 'All Suggestions':
        return all_buys_value
    elif strategy == 'Strongest Convictions':
        return buy_low_sell_high_value


In [270]:
# Train and select the severity / direction models for Boeing.
user_stock = 'BA'
severity_model, direction_model = get_best_models(user_stock=user_stock)

The best severity class model is:
Naive Bayes

The accuracy of the severity model is:
0.3125

The best direction class model is:
Naive Bayes

The accuracy of the direction model is:
0.4583333333333333


In [279]:
# Back-test the current stock for 2021-12-02, acting on every suggestion.
backtest_start, backtest_end = '2021-12-02', '2021-12-03'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'All Suggestions'
ba_1202 = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
ba_1202

Number of buy suggestions: 60
Number of sell suggestions: 151
Number of ignore suggestions: 90


156.2085723876953

In [282]:
# Back-test BA for 2021-12-03, acting on every suggestion.
user_stock = 'BA'
backtest_start, backtest_end = '2021-12-03', '2021-12-04'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'All Suggestions'
ba_1203 = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
ba_1203

Number of buy suggestions: 192
Number of sell suggestions: 0
Number of ignore suggestions: 6
Value after end-of-day liquidation:


213.77743530273438

In [284]:
# Train the MSFT models, then back-test 2021-12-02 acting on every suggestion.
user_stock = 'MSFT'
severity_model, direction_model = get_best_models(user_stock=user_stock)

backtest_start, backtest_end = '2021-12-02', '2021-12-03'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'All Suggestions'
msft_1202 = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
msft_1202

The best severity class model is:
Naive Bayes

The accuracy of the severity model is:
0.2765957446808511

The best direction class model is:
Naive Bayes

The accuracy of the direction model is:
0.5319148936170213
Number of buy suggestions: 58
Number of sell suggestions: 0
Number of ignore suggestions: 250
Value after end-of-day liquidation:


30.408172607421875

In [290]:
# Back-test MSFT for 2021-12-03, acting on every suggestion.
user_stock = 'MSFT'
backtest_start, backtest_end = '2021-12-03', '2021-12-04'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'All Suggestions'
msft_1203 = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
msft_1203

Number of buy suggestions: 318
Number of sell suggestions: 0
Number of ignore suggestions: 8
Value after end-of-day liquidation:


529.4197082519531

In [294]:
# Back-test MSFT for 2021-12-02, acting only on the strongest-conviction signals.
user_stock = 'MSFT'
backtest_start, backtest_end = '2021-12-02', '2021-12-03'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'Strongest Convictions'
msft_1202_sc = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
msft_1202_sc

Number of buy suggestions: 58
Number of sell suggestions: 0
Number of ignore suggestions: 250
Value after end-of-day liquidation:


0

In [295]:
# Train and select the severity / direction models for Tesla.
user_stock = 'TSLA'
severity_model, direction_model = get_best_models(user_stock=user_stock)

The best severity class model is:
Random Forest

The accuracy of the severity model is:
0.2708333333333333

The best direction class model is:
Logistic Regression

The accuracy of the direction model is:
0.5416666666666666


In [298]:
# Back-test the current stock for 2021-12-02, acting on every suggestion.
backtest_start, backtest_end = '2021-12-02', '2021-12-03'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'All Suggestions'
tsla_1202 = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
tsla_1202

Number of buy suggestions: 1
Number of sell suggestions: 0
Number of ignore suggestions: 66
Value after end-of-day liquidation:


13.06005859375

In [300]:
# Back-test the current stock for 2021-12-03, acting on every suggestion.
backtest_start, backtest_end = '2021-12-03', '2021-12-04'
bestmodel1, bestmodel2 = severity_model, direction_model
strategy = 'All Suggestions'
tsla_1203 = back_test(
    user_stock=user_stock,
    backtest_start=backtest_start,
    backtest_end=backtest_end,
    bestmodel1=bestmodel1,
    bestmodel2=bestmodel2,
    strategy=strategy,
)
tsla_1203

Number of buy suggestions: 21
Number of sell suggestions: 0
Number of ignore suggestions: 8
Value after end-of-day liquidation:


126.87554931640625