In [102]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from pytrends import dailydata
import requests
import collections
import os

from sklearn import linear_model
import statsmodels.api as sm

from backtesting import Backtest

from scipy.optimize import minimize


In [103]:
x = os.walk("trends")
for word in x:
    tuplee = word
words = tuplee[2]

In [122]:
def getStats(key_word, start_year, start_month, end_year, end_month, index):
    #trend = pd.DataFrame(dailydata.get_daily_data(key_word, start_year, start_month, end_year, end_month, wait_time = 0)[key_word])
    trend = pd.read_csv("trends/"+key_word)
    key_word = key_word[:-4]
    trend['date'] = pd.to_datetime(trend['date'])
    trend = trend.set_index('date')
    trend["change"] = trend[key_word].pct_change()

    # grab index data
    index_data = yf.download(index, start = str(start_year)+"-"+str(start_month)+"-01", end = str(end_year)+"-"+str(end_month)+"-01")

    # join trend data with DJIA data
    joined = trend.merge(index_data, left_on = trend.index, right_on = index_data.index)
    joined = joined.rename(columns = {"key_0": "Date"})

    # grab the adj close price difference for each day
    #joined['Diff'] = joined['Adj Close'].diff()
    joined = joined.set_index(joined['Date'])

    joined["change moving avg"] = joined["change"].rolling("14d", min_periods = 1).mean()
    joined = joined[joined.index.dayofweek == 1]
    
    return joined


In [138]:
# grab data from yfinance and merge with la
start_year = 2010
start_month = 1
end_year = 2014
end_month = 12
index = "NDAQ"
training = {}

#for word in words:
for word in words:
    if word[0] == ".": # extra thing in there
        continue
    training[word] = getStats(word, start_year, start_month, end_year, end_month, index)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [139]:
# testing data
start_year = 2015
start_month = 1
end_year = 2015
end_month = 12
index = "NDAQ"

testing = {}

#for word in words:
for word in words:
    if word[0] == ".": # extra thing in there
        continue
    testing[word] = getStats(word, start_year, start_month, end_year, end_month, index)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

# Training

## We begin backtesting our model using the python backtesting library. 
## Our algorithm is as follows:
###   1. We have already calculated the change moving average. If that value is above some "high" threshold, then we buy at the current value. The intuition being that an increase in search volume on positive words will lead to a positive impact on the market and a increased price in our index. Vice versa for "low" threshold.
###   2. Now, we need to define some threshold at which to execute our buy and sell trades. We optimize our backtest (on all words available) to find the best values for our "high" and "low" parameters to optimize returns. 
###   3. We get our 80-20 split by grabbing the first 8 years (2010-2018) as our training data and the next two years (2019-2020) as our testing.
###   4. We run our testing model to find the optimized high and low thresholds.

In [140]:
def getMovingAvg(df):
    return pd.Series(df['change moving avg'])

In [141]:
from backtesting import Strategy
from backtesting.lib import crossover

class trainingStrat(Strategy):
    high = .1
    low = -.01
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        high = self.high
        low = self.low

    def next(self):
        if self.change[-1] > self.high:
            self.position.close()
            #print("buying")
            self.buy()

        elif self.change[-1] < self.low:
            self.position.close()
            #print("selling")
            self.sell()
        else:
            self.position.close()

In [142]:
# optimizing each word and saving best parameters high and low

optimized_stats = {}
bts = {}

for word in training.keys():
    bt = Backtest(training[word], trainingStrat, cash=100_000_000, commission=0)
    """
    stats = bt.optimize(low = list(np.asarray(range(-100, 100))/100),
                        high = list(np.asarray(range(-100, 100))/100), 
                        constraint = lambda param: param.low > param.high, # flipped?!
                        maximize = 'Return [%]', 
                        return_optimization = True, method = 'skopt')
    """
    stats = bt.run()
    optimized_stats[word] = stats
    bts[word] = bt
    print("Finished "+word)
    
optimized_stats

Finished chance.csv
Finished politics.csv
Finished invest.csv
Finished dividend.csv
Finished hedge.csv
Finished gold.csv
Finished bonds.csv
Finished culture.csv
Finished economy.csv
Finished leverage.csv
Finished risk.csv
Finished nasdaq.csv
Finished banking.csv
Finished derivatives.csv
Finished sell.csv
Finished office.csv
Finished dow jones.csv
Finished food.csv
Finished revenue.csv
Finished economics.csv
Finished fine.csv
Finished holiday.csv
Finished society.csv
Finished crash.csv
Finished profit.csv
Finished money.csv
Finished tourism.csv
Finished growth.csv
Finished arts.csv


  s.loc['Sortino Ratio'] = np.clip(annualized_return / (np.sqrt(np.mean(day_returns.clip(-np.inf, 0)**2)) * np.sqrt(annual_trading_days)), 0, np.inf)  # noqa: E501


Finished rich.csv
Finished investment.csv
Finished debt.csv
Finished earnings.csv
Finished present.csv
Finished house.csv
Finished financial markets.csv
Finished markets.csv
Finished crisis.csv
Finished inflation.csv
Finished train.csv
Finished ore.csv
Finished buy.csv
Finished freedom.csv
Finished gains.csv
Finished world.csv
Finished metals.csv
Finished war.csv
Finished unemployment.csv
Finished health.csv
Finished return.csv
Finished returns.csv
Finished stocks.csv


{'chance.csv': Start                     2010-01-05 00:00:00
 End                       2014-11-25 00:00:00
 Duration                   1785 days 00:00:00
 Exposure Time [%]                     73.1225
 Equity Final [$]                  7.86161e+07
 Equity Peak [$]                   1.23557e+08
 Return [%]                           -21.3839
 Buy & Hold Return [%]                 119.606
 Return (Ann.) [%]                    -21.3091
 Volatility (Ann.) [%]                 34.7268
 Sharpe Ratio                                0
 Sortino Ratio                               0
 Calmar Ratio                                0
 Max. Drawdown [%]                    -38.5947
 Avg. Drawdown [%]                    -14.4429
 Max. Drawdown Duration     1547 days 00:00:00
 Avg. Drawdown Duration      434 days 00:00:00
 # Trades                                  140
 Win Rate [%]                               45
 Best Trade [%]                        12.5786
 Worst Trade [%]                      -11.7506

In [143]:
# unpack the various values held within the optimized stats
returns = {}
sharpe = {}
win_rate = {}
avg_trade = {}
#optimized_parameters = {}

for word in training.keys():
    returns[word] = optimized_stats[word][6]#[0][6]
    returns['BENCHMARK'] = optimized_stats[word][7]#[0][7]
    sharpe[word] = optimized_stats[word][10]#[0][10]
    win_rate[word] = optimized_stats[word][18]#[0][18]
    avg_trade[word] = optimized_stats[word][2]#[0][21]
    #optimized_parameters[word] = optimized_stats[word][1].x

In [144]:
# our best words
optimized_words = [i[0] for i in sorted(returns.items(), key = lambda x: x[1], reverse = True)[:20]]
optimized_words_returns = sorted(returns.items(), key = lambda x: x[1], reverse = True)[:20]
optimized_words_returns

[('dividend.csv', 161.2408223220501),
 ('derivatives.csv', 148.5492659604702),
 ('revenue.csv', 140.58041204367257),
 ('nasdaq.csv', 121.13095097949599),
 ('dow jones.csv', 121.13095097949599),
 ('BENCHMARK', 119.60592860352197),
 ('gains.csv', 115.52424213109971),
 ('debt.csv', 112.13434160151672),
 ('health.csv', 111.9238229294796),
 ('metals.csv', 98.05842782024193),
 ('investment.csv', 91.70232634330559),
 ('economics.csv', 86.157782956007),
 ('office.csv', 84.66624001600837),
 ('inflation.csv', 56.69914625979233),
 ('bonds.csv', 50.93179614742851),
 ('politics.csv', 46.532472429828644),
 ('invest.csv', 42.73654016172409),
 ('stocks.csv', 41.45171914568329),
 ('earnings.csv', 33.84717300044632),
 ('financial markets.csv', 21.067433241443634)]

In [145]:
optimized_words_sharpe = sorted(sharpe.items(), key = lambda x: x[1], reverse = True)[:20]
optimized_words_sharpe

[('growth.csv', 1.5845771024552862),
 ('society.csv', 1.548169013768821),
 ('economics.csv', 1.4437320900483759),
 ('debt.csv', 1.421649076212633),
 ('gains.csv', 1.3963100373572448),
 ('dividend.csv', 1.2085982455367912),
 ('inflation.csv', 1.1334284974423194),
 ('derivatives.csv', 1.1293048980334108),
 ('bonds.csv', 1.1099068011939097),
 ('politics.csv', 1.0696247846389628),
 ('revenue.csv', 0.9896660541723554),
 ('health.csv', 0.8953810110292972),
 ('nasdaq.csv', 0.8937225984218718),
 ('dow jones.csv', 0.8937225984218718),
 ('investment.csv', 0.8705712033610485),
 ('metals.csv', 0.8230796146513839),
 ('invest.csv', 0.7962063310971591),
 ('office.csv', 0.7819952875799256),
 ('stocks.csv', 0.7314428413425497),
 ('present.csv', 0.7041615097982288)]

In [132]:
optimized_words_parameters = {word:optimized_parameters[word] for word in optimized_words if word != "BENCHMARK"}
optimized_words_parameters

KeyError: 'revenue.csv'

In [115]:
"""
# non optimized 
returns = {}
sharpe = {}
win_rate = {}
avg_trade = {}
bts = {}

count = 0

for word in word_trends.keys():
    bt = Backtest(word_trends[word], trainingStrat, cash=100_000_000, commission=0)
    stats = bt.run()
    bts[word] = bt
    returns[word] = stats[6]
    returns['BENCHMARK'] = stats[7]
    sharpe[word] = stats[10]
    win_rate[word] = stats[18]
    avg_trade[word] = stats[21]

returns
"""

"\n# non optimized \nreturns = {}\nsharpe = {}\nwin_rate = {}\navg_trade = {}\nbts = {}\n\ncount = 0\n\nfor word in word_trends.keys():\n    bt = Backtest(word_trends[word], trainingStrat, cash=100_000_000, commission=0)\n    stats = bt.run()\n    bts[word] = bt\n    returns[word] = stats[6]\n    returns['BENCHMARK'] = stats[7]\n    sharpe[word] = stats[10]\n    win_rate[word] = stats[18]\n    avg_trade[word] = stats[21]\n\nreturns\n"

# Testing

In [146]:
# all positive words here have sharpe ratio > 1
testing_words = ["invest.csv", "money.csv", "financial markets.csv", "bonds.csv", "stocks.csv", "gains.csv", "derivatives.csv", "gold.csv", "dividend.csv", "rich.csv","revenue.csv", "returns.csv", "nasdaq.csv", "dow jones.csv"]
testing_words

['invest.csv',
 'money.csv',
 'financial markets.csv',
 'bonds.csv',
 'stocks.csv',
 'gains.csv',
 'derivatives.csv',
 'gold.csv',
 'dividend.csv',
 'rich.csv',
 'revenue.csv',
 'returns.csv',
 'nasdaq.csv',
 'dow jones.csv']

In [147]:
# slight modifications to testing dfs to let backtesting work
for word in testing_words:
    testing[word]['word'] = str(word)

In [148]:
from backtesting import Strategy
from backtesting.lib import crossover

def getMovingAvg(df):
    return pd.Series(df['change moving avg'])

def getOptimizedParameters(key_word):
    return optimized_parameters[key_word]

def getOptimizedParameters2(df):
    return optimized_parameters[df['word'][0]]

class testingStrat(Strategy):
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        #self.optimized_parameters = getOptimizedParameters2(self.data) # grabs the word from the df, had to add a new column with name
        #print(self.optimized_parameters)
        #self.high = self.optimized_parameters[0]
        #self.low = self.optimized_parameters[1]
        self.high = .1
        self.low = -.01
        
    def next(self):
        if self.change[-1] > self.high:
            self.position.close()
            #print("buying")
            self.buy()

        elif self.change[-1] < self.low:
            self.position.close()
            #print("selling")
            self.sell()
        else:
            self.position.close()

In [149]:
# testing
testing_stats = {}
for word in testing_words:
    bt = Backtest(testing[word], testingStrat, cash=100_000_000, commission=0)
    stats = bt.run() # we no longer optimize, but we run and grab the optimized parameters from training
    testing_stats[word] = stats
    
testing_stats

{'invest.csv': Start                     2015-01-06 00:00:00
 End                       2015-11-24 00:00:00
 Duration                    322 days 00:00:00
 Exposure Time [%]                     17.0213
 Equity Final [$]                  1.04985e+08
 Equity Peak [$]                   1.04985e+08
 Return [%]                            4.98488
 Buy & Hold Return [%]                 22.9298
 Return (Ann.) [%]                     29.8002
 Volatility (Ann.) [%]                 13.4561
 Sharpe Ratio                          2.21462
 Sortino Ratio                         15.9573
 Calmar Ratio                          28.0074
 Max. Drawdown [%]                    -1.06401
 Avg. Drawdown [%]                    -0.76295
 Max. Drawdown Duration      203 days 00:00:00
 Avg. Drawdown Duration      137 days 00:00:00
 # Trades                                    5
 Win Rate [%]                               80
 Best Trade [%]                        2.62605
 Worst Trade [%]                      -1.15914

In [150]:
# unpack the various values held within the stats
testing_returns = {}
testing_sharpe = {}
testing_win_rate = {}
testing_avg_trade = {}

for word in testing_words:
    testing_returns[word] = testing_stats[word][6]
    testing_returns["BENCHMARK"] = testing_stats[word][7]
    testing_sharpe[word] = testing_stats[word][10]
    testing_win_rate[word] = testing_stats[word][18]
    testing_avg_trade[word] = testing_stats[word][21]

In [151]:
testing_optimized_words = [i[0] for i in sorted(testing_returns.items(), key = lambda x: x[1], reverse = True)[:20]]
testing_optimized_words_returns = sorted(testing_returns.items(), key = lambda x: x[1], reverse = True)[:20]
testing_optimized_words_returns

[('nasdaq.csv', 25.51501945627594),
 ('dow jones.csv', 25.51501945627594),
 ('revenue.csv', 25.21910511155319),
 ('stocks.csv', 24.223442227752688),
 ('BENCHMARK', 22.929801234877907),
 ('dividend.csv', 18.079125001037596),
 ('derivatives.csv', 10.315100855960846),
 ('financial markets.csv', 7.886743749134063),
 ('returns.csv', 7.359276527778626),
 ('invest.csv', 4.9848772309532166),
 ('bonds.csv', 1.161872679458618),
 ('rich.csv', -3.323451942848205),
 ('gains.csv', -3.541814113433838),
 ('money.csv', -3.616499239665985),
 ('gold.csv', -10.904461430130006)]

In [152]:
testing_sharpe_words = sorted(testing_sharpe.items(), key = lambda x: x[1], reverse = True)[:20]
testing_sharpe_words

[('stocks.csv', 2.52306135024861),
 ('invest.csv', 2.2146238647150853),
 ('nasdaq.csv', 1.6212384944175682),
 ('dow jones.csv', 1.6212384944175682),
 ('revenue.csv', 1.6085267594392785),
 ('dividend.csv', 1.545751429059997),
 ('returns.csv', 1.4354997242266192),
 ('derivatives.csv', 1.2225886354466986),
 ('financial markets.csv', 0.8325771800632812),
 ('bonds.csv', 0.5983898333256525),
 ('money.csv', 0.0),
 ('gains.csv', 0.0),
 ('gold.csv', 0.0),
 ('rich.csv', 0.0)]

### Random Alg

In [None]:
import random

# randomly buy and sell 
class randomStrat(Strategy):
    def init(self):
        self.change = self.I(ourFunc1, self.data)
    def next(self):
        if random.choice([True, False]):
            self.position.close()
            self.buy()
        else:
            self.position.close()
            self.sell()

In [None]:
random_returns = []
for x in range(201):
    bt = Backtest(word_trends["debt"], randomStrat, cash=100_000_000, commission=0) # initializes
    random_stats = bt.run()
    # returns["RANDOM"] = random_stats[6]
    random_returns.append(random_stats[6])

In [None]:
np.mean(random_returns)

In [None]:
stats['Return [%]']

In [21]:
# by the paper
from backtesting import Strategy
from backtesting.lib import crossover

class trainingStrat(Strategy):
    high = 0
    low = 0
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        high = self.high
        low = self.low
        self.buy_next_week = False
        self.sell_next_week = False

    def next(self):
        assert not (self.buy_next_week == True and self.sell_next_week == True)
        
        if self.buy_next_week:
            self.buy()
            self.buy_next_week = False
        
        if self.sell_next_week:
            self.sell()
            self.sell_next_week = False
        
        if self.change[-1] > self.high:
            #self.position.close()
            #print("selling this week, buying next week")
            self.sell()
            self.buy_next_week = True
            self.sell_next_week = False

        elif self.change[-1] < self.low:
            #self.position.close()
            #print("buying this week, selling next week")
            self.buy()
            self.sell_next_week = True
            self.buy_next_week = False