In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from pytrends import dailydata
import requests
import collections
import os

from sklearn import linear_model
import statsmodels.api as sm

from backtesting import Backtest

from scipy.optimize import minimize




In [3]:
x = os.walk("trends")
for word in x:
    tuplee = word
words = tuplee[2]

In [17]:
def getStats(key_word, start_year, start_month, end_year, end_month, index):
    #trend = pd.DataFrame(dailydata.get_daily_data(key_word, start_year, start_month, end_year, end_month, wait_time = 0)[key_word])
    trend = pd.read_csv("trends/"+key_word)
    key_word = key_word[:-4]
    trend['date'] = pd.to_datetime(trend['date'])
    trend = trend.set_index('date')
    trend["change"] = trend[key_word].pct_change()

    # grab index data
    index_data = yf.download(index, start = str(start_year)+"-"+str(start_month)+"-01", end = str(end_year)+"-"+str(end_month)+"-01")

    # join trend data with DJIA data
    joined = trend.merge(index_data, left_on = trend.index, right_on = index_data.index)
    joined = joined.rename(columns = {"key_0": "Date"})

    # grab the adj close price difference for each day
    #joined['Diff'] = joined['Adj Close'].diff()
    joined = joined.set_index(joined['Date'])

    joined["change moving avg"] = joined["change"].rolling("14d", min_periods = 1).mean()
    joined = joined[joined.index.dayofweek == 1]
    
    return joined


In [18]:
# training data
start_year = 2010
start_month = 1
end_year = 2018
end_month = 1
index = "NDAQ"
training = {}

#for word in words:
for word in words:
    if word[0] == ".": # extra thing in there
        continue
    training[word] = getStats(word, start_year, start_month, end_year, end_month, index)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [None]:
# testing data
start_year = 2018
start_month = 1
end_year = 2020
end_month = 11
index = "NDAQ"

testing = {}

#for word in words:
for word in words:
    if word[0] == ".": # extra thing in there
        continue
    testing[word] = getStats(word, start_year, start_month, end_year, end_month, index)

# Training

## We begin backtesting our model using the python backtesting library. 
## Our algorithm is as follows:
###   1. We have already calculated the change moving average. If that value is above some "high" threshold, then we buy at the current value. The intuition being that an increase in search volume on positive words will lead to a positive impact on the market and a increased price in our index. Vice versa for "low" threshold.
###   2. Now, we need to define some threshold at which to execute our buy and sell trades. We optimize our backtest (on all words available) to find the best values for our "high" and "low" parameters to optimize returns. 
###   3. We get our 80-20 split by grabbing the first 8 years (2010-2018) as our training data and the next two years (2019-2020) as our testing.
###   4. We run our testing model to find the optimized high and low thresholds.

In [105]:
def getMovingAvg(df):
    return pd.Series(df['change moving avg'])

In [108]:
from backtesting import Strategy
from backtesting.lib import crossover

class trainingStrat(Strategy):
    high = 0
    low = 0
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        high = self.high
        low = self.low

    def next(self):
        if self.change[-1] > self.high:
            self.position.close()
            #print("buying")
            self.buy()

        elif self.change[-1] < self.low:
            self.position.close()
            #print("selling")
            self.sell()
        else:
            self.position.close()

In [75]:
# optimizing each word and saving best parameters high and low

optimized_stats = {}
bts = {}

for word in training.keys():
    bt = Backtest(training[word], trainingStrat, cash=100_000_000, commission=0)
    stats = bt.optimize(low = list(np.asarray(range(-100, 100))/100),
                        high = list(np.asarray(range(-100, 100))/100), 
                        constraint = lambda param: param.low > param.high, # flipped?!
                        maximize = 'Return [%]', 
                        return_optimization = True, method = 'skopt')
    optimized_stats[word] = stats
    bts[word] = bt
    print("Finished "+word)
    
optimized_stats

Finished chance.csv
Finished politics.csv
Finished invest.csv
Finished dividend.csv
Finished hedge.csv
Finished gold.csv
Finished bonds.csv
Finished culture.csv
Finished economy.csv
Finished leverage.csv
Finished risk.csv
Finished nasdaq.csv
Finished banking.csv
Finished derivatives.csv
Finished sell.csv
Finished office.csv
Finished dow jones.csv
Finished food.csv
Finished revenue.csv
Finished economics.csv
Finished fine.csv
Finished holiday.csv
Finished society.csv
Finished crash.csv
Finished profit.csv
Finished money.csv
Finished tourism.csv
Finished growth.csv
Finished arts.csv
Finished rich.csv
Finished investment.csv
Finished debt.csv
Finished earnings.csv
Finished present.csv
Finished house.csv
Finished financial markets.csv
Finished markets.csv
Finished crisis.csv
Finished inflation.csv
Finished train.csv
Finished ore.csv
Finished buy.csv
Finished freedom.csv
Finished gains.csv
Finished world.csv
Finished metals.csv
Finished war.csv
Finished unemployment.csv
Finished health.csv


{'chance.csv': (Start                     2010-01-05 00:00:00
  End                       2020-10-27 00:00:00
  Duration                   3948 days 00:00:00
  Exposure Time [%]                     99.6422
  Equity Final [$]                  7.39059e+08
  Equity Peak [$]                   7.96482e+08
  Return [%]                            639.059
  Buy & Hold Return [%]                 516.256
  Return (Ann.) [%]                      146.38
  Volatility (Ann.) [%]                  129.05
  Sharpe Ratio                          1.13429
  Sortino Ratio                         4.47043
  Calmar Ratio                          4.86852
  Max. Drawdown [%]                    -30.0666
  Avg. Drawdown [%]                    -5.56314
  Max. Drawdown Duration      686 days 00:00:00
  Avg. Drawdown Duration       68 days 00:00:00
  # Trades                                  557
  Win Rate [%]                          58.7074
  Best Trade [%]                         29.398
  Worst Trade [%]         

In [118]:
# unpack the various values held within the optimized stats
returns = {}
sharpe = {}
win_rate = {}
avg_trade = {}
optimized_parameters = {}

for word in word_trends.keys():
    returns[word] = optimized_stats[word][0][6]
    returns['BENCHMARK'] = optimized_stats[word][0][7]
    sharpe[word] = optimized_stats[word][0][10]
    win_rate[word] = optimized_stats[word][0][18]
    avg_trade[word] = optimized_stats[word][0][21]
    optimized_parameters[word] = optimized_stats[word][1].x

In [119]:
# our best words
optimized_words = [i[0] for i in sorted(returns.items(), key = lambda x: x[1], reverse = True)[:20]]
optimized_words_returns = sorted(returns.items(), key = lambda x: x[1], reverse = True)[:20]
optimized_words_returns

[('rich.csv', 841.9246971428547),
 ('gains.csv', 829.6334024404543),
 ('money.csv', 801.6426059977359),
 ('financial markets.csv', 771.5518638041611),
 ('unemployment.csv', 734.9189102488156),
 ('debt.csv', 729.4948095697613),
 ('bonds.csv', 724.4984900575829),
 ('stocks.csv', 720.8293785937939),
 ('derivatives.csv', 687.1315155655251),
 ('crisis.csv', 674.787580125639),
 ('invest.csv', 674.6626307986888),
 ('culture.csv', 647.8005087729873),
 ('revenue.csv', 644.684179927391),
 ('chance.csv', 639.0586592940635),
 ('arts.csv', 629.549687693943),
 ('ore.csv', 628.3324245995102),
 ('world.csv', 613.7002681739731),
 ('inflation.csv', 609.1887817375431),
 ('gold.csv', 589.3485308846931),
 ('dividend.csv', 586.4598460686378)]

In [120]:
optimized_words_sharpe = sorted(sharpe.items(), key = lambda x: x[1], reverse = True)[:20]
optimized_words_sharpe

[('rich.csv', 1.22174407607611),
 ('gains.csv', 1.1811178092450931),
 ('stocks.csv', 1.1741564802488127),
 ('money.csv', 1.1685073216665627),
 ('crisis.csv', 1.1530160371535711),
 ('bonds.csv', 1.1524258737332733),
 ('invest.csv', 1.145652863372182),
 ('derivatives.csv', 1.1435173989751541),
 ('debt.csv', 1.1360327443050875),
 ('chance.csv', 1.134288193968425),
 ('financial markets.csv', 1.1334296496572807),
 ('dividend.csv', 1.1288389543381385),
 ('revenue.csv', 1.1256715364506604),
 ('unemployment.csv', 1.1220495150930743),
 ('inflation.csv', 1.1215208807563304),
 ('ore.csv', 1.1197600593010713),
 ('culture.csv', 1.1138896729822119),
 ('world.csv', 1.1135517687100154),
 ('gold.csv', 1.1106275421936864),
 ('arts.csv', 1.1057843882630458)]

In [121]:
optimized_words_parameters = {word:optimized_parameters[word] for word in optimized_words if word != "BENCHMARK"}
optimized_words_parameters

{'rich.csv': [0.2883767460193285, -0.04570598645097723],
 'gains.csv': [0.59879120559446, -0.013266551637252655],
 'money.csv': [0.8055477118704932, -0.007679051936578363],
 'financial markets.csv': [0.6848490919447632, -0.11244418072522144],
 'unemployment.csv': [0.8411904360115949, -0.1460326462762669],
 'debt.csv': [0.9374844694869853, -0.008949933126740195],
 'bonds.csv': [0.7624137028516973, -0.0029252172010015842],
 'stocks.csv': [0.4282367136398806, 0.023611281894326597],
 'derivatives.csv': [0.10555841425229162, -0.01836732467462654],
 'crisis.csv': [0.1496723642453488, -0.03767019844220154],
 'invest.csv': [0.8134556178336052, -0.04568038370694516],
 'culture.csv': [0.3548556511227823, -0.054349499304523596],
 'revenue.csv': [0.7115059282533849, 0.10708342199137921],
 'chance.csv': [0.24026654288695592, -0.06616867858108189],
 'arts.csv': [0.9362461208135318, -0.06867829565875738],
 'ore.csv': [0.21320446519147485, -0.06921075593795445],
 'world.csv': [0.41395275095383854, -0.

In [110]:
"""
# non optimized 
returns = {}
sharpe = {}
win_rate = {}
avg_trade = {}
bts = {}

count = 0

for word in word_trends.keys():
    bt = Backtest(word_trends[word], trainingStrat, cash=100_000_000, commission=0)
    stats = bt.run()
    bts[word] = bt
    returns[word] = stats[6]
    returns['BENCHMARK'] = stats[7]
    sharpe[word] = stats[10]
    win_rate[word] = stats[18]
    avg_trade[word] = stats[21]

returns
"""

"\n# non optimized \nreturns = {}\nsharpe = {}\nwin_rate = {}\navg_trade = {}\nbts = {}\n\ncount = 0\n\nfor word in word_trends.keys():\n    bt = Backtest(word_trends[word], trainingStrat, cash=100_000_000, commission=0)\n    stats = bt.run()\n    bts[word] = bt\n    returns[word] = stats[6]\n    returns['BENCHMARK'] = stats[7]\n    sharpe[word] = stats[10]\n    win_rate[word] = stats[18]\n    avg_trade[word] = stats[21]\n\nreturns\n"

# Train/Test Split

In [122]:
optimized_words

['rich.csv',
 'gains.csv',
 'money.csv',
 'financial markets.csv',
 'unemployment.csv',
 'debt.csv',
 'bonds.csv',
 'stocks.csv',
 'derivatives.csv',
 'crisis.csv',
 'invest.csv',
 'culture.csv',
 'revenue.csv',
 'chance.csv',
 'arts.csv',
 'ore.csv',
 'world.csv',
 'inflation.csv',
 'gold.csv',
 'dividend.csv']

In [None]:
from backtesting import Strategy
from backtesting.lib import crossover


def getOptimizedParameters(key_word):
    return optimized_parameters[key_word]

class testingStrat(Strategy):
    
    def init(self):
        self.change = self.I(ourFunc1, self.data)
        self.optimized_parameters = getOptimizedParameters(self.data.columns[5]) # grabs the word from the df
        self.high = self.optimized_parameters[0]
        self.low = self.optimized_parameters[1]
        
    def next(self):
        if self.change[-1] > self.high:
            self.position.close()
            #print("buying")
            self.buy()

        elif self.change[-1] < self.low:
            self.position.close()
            #print("selling")
            self.sell()
        else:
            self.position.close()

In [None]:
# testing
testing_stats = {}
for word in optimized_words:
    bt = Backtest(testing[word], testingStrat, cash=100_000_000, commission=0)
    stats = bt.optimize(low = list(np.asarray(range(-100, 100))/100),
                        high = list(np.asarray(range(-100, 100))/100), 
                        constraint = lambda param: param.low > param.high, # flipped?!
                        maximize = 'Return [%]', 
                        return_optimization = True, method = 'skopt')
    testing_stats[word] = stats
    
testing_stats

### Random Alg

In [10]:
import random

# randomly buy and sell 
class randomStrat(Strategy):
    def init(self):
        self.change = self.I(ourFunc1, self.data)
    def next(self):
        if random.choice([True, False]):
            self.position.close()
            self.buy()
        else:
            self.position.close()
            self.sell()

In [11]:
random_returns = []
for x in range(201):
    bt = Backtest(word_trends["debt"], randomStrat, cash=100_000_000, commission=0) # initializes
    random_stats = bt.run()
    # returns["RANDOM"] = random_stats[6]
    random_returns.append(random_stats[6])

KeyError: 'debt'

In [None]:
np.mean(random_returns)

In [None]:
stats['Return [%]']

In [None]:
# by the paper
from backtesting import Strategy
from backtesting.lib import crossover

class trainingStrat(Strategy):
    high = 0
    low = 0
    
    def init(self):
        self.change = self.I(ourFunc1, self.data)
        high = self.high
        low = self.low
        self.buy_next_week = False
        self.sell_next_week = False

    def next(self):
        assert not (self.buy_next_week == True and self.sell_next_week == True)
        
        if self.buy_next_week:
            self.buy()
            self.buy_next_week = False
        
        if self.sell_next_week:
            self.sell()
            self.sell_next_week = False
        
        if self.change[-1] > self.high:
            #self.position.close()
            #print("selling this week, buying next week")
            self.sell()
            self.buy_next_week = True
            self.sell_next_week = False

        elif self.change[-1] < self.low:
            #self.position.close()
            #print("buying this week, selling next week")
            self.buy()
            self.sell_next_week = True
            self.buy_next_week = False