In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pytrends.request import TrendReq
from pytrends import dailydata
import requests
import collections
import os

from sklearn import linear_model
import statsmodels.api as sm

from backtesting import Backtest

from scipy.optimize import minimize




## Intro
We will implement [this paper](https://www.nature.com/articles/srep01684.pdf) by Preis, Moat, and Stanley and make modifications to their algorithm logic that will be detailed later.

## Data Collection and Processing
We sourced our alternative data from [Google Trends](https://trends.google.com/trends/?geo=US). Downloading data over a long time frame from the Google Trends website gives weekly datapoints with dates that are hard to control. To work around this, we used the third-party [pytrends](https://pypi.org/project/pytrends/) API for Python. This API allows us to request daily data for any time interval.

For our financial data we used yfinance because of its ease of use. Because our trading intervals will be on the scale of days-weeks,  

### Problems
Because the API is third-party, you will get 429 (too many requests) errors after about 3 years of daily data. We created a script that downloads and saves Google search trend data and ran it in a Kaggle Docker Container to save time.

Another problem is how the search volume values are computed by Google. Google samples a subset of its servers for the volume data. As a result, there are small variations between different requests with the same parameters. We determined that the difference was negligible to the overall trends although this could be looked into more thoroughly.


In [11]:
x = os.walk("trends")
for word in x:
    tuplee = word
words = tuplee[2]

In [12]:
def getStats(key_word, start_year, start_month, end_year, end_month, index):
    #trend = pd.DataFrame(dailydata.get_daily_data(key_word, start_year, start_month, end_year, end_month, wait_time = 0)[key_word])
    trend = pd.read_csv("trends/"+key_word)
    key_word = key_word[:-4]
    trend['date'] = pd.to_datetime(trend['date'])
    trend = trend.set_index('date')
    trend["change"] = trend[key_word].pct_change()

    # grab index data
    index_data = yf.download(index, start = str(start_year)+"-"+str(start_month)+"-01", end = str(end_year)+"-"+str(end_month)+"-01")

    # join trend data with DJIA data
    joined = trend.merge(index_data, left_on = trend.index, right_on = index_data.index)
    joined = joined.rename(columns = {"key_0": "Date"})

    # grab the adj close price difference for each day
    #joined['Diff'] = joined['Adj Close'].diff()
    joined = joined.set_index(joined['Date'])

    joined["change moving avg"] = joined["change"].rolling("14d", min_periods = 1).mean()
    joined = joined[joined.index.dayofweek == 1]
    
    return joined


In [13]:
# grab data from yfinance and merge with la
start_year = 2010
start_month = 1
end_year = 2018
end_month = 12
index = "NDAQ"
training = {}

#for word in words:
for word in words:
    if word[0] == ".": # extra thing in there
        continue
    training[word] = getStats(word, start_year, start_month, end_year, end_month, index)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [14]:
# testing data
start_year = 2019
start_month = 1
end_year = 2020
end_month = 11
index = "NDAQ"

testing = {}

#for word in words:
for word in words:
    if word[0] == ".": # extra thing in there
        continue
    testing[word] = getStats(word, start_year, start_month, end_year, end_month, index)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

# EDA
Our downloaded Google Trends DataFrame looks like this.
```
word_trends["debt"].head()
```
We are primarily interested in the last column which gives the scaled search volume. The other columns are used to calculate the scaling.

# Training

### We begin backtesting our model using the python backtesting library. 
### Our algorithm is as follows:
1. We have already calculated the change moving average. If that value is above some "high" threshold, then we buy at the current value. The intuition being that an increase in search volume on positive words will lead to a positive impact on the market and a increased price in our index. Vice versa for "low" threshold.
2. Now, we need to define some threshold at which to execute our buy and sell trades. We optimize our backtest (on all words available) to find the best values for our "high" and "low" parameters to optimize returns. 
3. We get our 80-20 split by grabbing the first 8 years (2010-2018) as our training data and the next two years (2019-2020) as our testing.
4. We run our testing model to find the optimized high and low thresholds.

In [15]:
def getMovingAvg(df):
    return pd.Series(df['change moving avg'])

In [16]:
from backtesting import Strategy
from backtesting.lib import crossover

class trainingStrat(Strategy):
    high = 0
    low = 0
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        high = self.high
        low = self.low

    def next(self):
        if self.change[-1] > self.high:
            self.position.close()
            #print("buying")
            self.buy()

        elif self.change[-1] < self.low:
            self.position.close()
            #print("selling")
            self.sell()
        else:
            self.position.close()

In [17]:
# optimizing each word and saving best parameters high and low

optimized_stats = {}
bts = {}

for word in training.keys():
    bt = Backtest(training[word], trainingStrat, cash=100_000_000, commission=0)
    stats = bt.optimize(low = list(np.asarray(range(-10, 20))/100),
                        high = list(np.asarray(range(-10, 20))/100), 
                        constraint = lambda param: param.low > param.high, # flipped?!
                        maximize = 'Return [%]', 
                        return_optimization = True, method = 'skopt')
    #stats = bt.run()
    optimized_stats[word] = stats
    bts[word] = bt
    print("Finished "+word)
    
optimized_stats

Finished housing.csv
Finished chance.csv
Finished stock market.csv
Finished politics.csv
Finished cash.csv
Finished invest.csv
Finished dividend.csv
Finished hedge.csv
Finished gold.csv
Finished bonds.csv
Finished finance.csv
Finished culture.csv
Finished economy.csv
Finished leverage.csv
Finished conflict.csv
Finished risk.csv
Finished nasdaq.csv
Finished banking.csv
Finished derivatives.csv
Finished sell.csv
Finished office.csv
Finished portfolio.csv
Finished dow jones.csv
Finished food.csv
Finished revenue.csv
Finished economics.csv
Finished fine.csv
Finished holiday.csv
Finished credit.csv
Finished society.csv
Finished crash.csv
Finished profit.csv
Finished money.csv
Finished tourism.csv
Finished growth.csv
Finished arts.csv
Finished rich.csv
Finished fed.csv
Finished energy.csv
Finished investment.csv
Finished debt.csv
Finished earnings.csv
Finished present.csv
Finished loss.csv
Finished house.csv
Finished financial markets.csv
Finished markets.csv
Finished consumption.csv
Finishe

{'housing.csv': (Start                     2010-01-05 00:00:00
  End                       2018-11-27 00:00:00
  Duration                   3248 days 00:00:00
  Exposure Time [%]                     99.5662
  Equity Final [$]                  4.86939e+08
  Equity Peak [$]                   5.30909e+08
  Return [%]                            386.939
  Buy & Hold Return [%]                  340.69
  Return (Ann.) [%]                     137.574
  Volatility (Ann.) [%]                 122.233
  Sharpe Ratio                          1.12551
  Sortino Ratio                          4.4305
  Calmar Ratio                          6.75321
  Max. Drawdown [%]                    -20.3717
  Avg. Drawdown [%]                     -6.4773
  Max. Drawdown Duration      497 days 00:00:00
  Avg. Drawdown Duration       75 days 00:00:00
  # Trades                                  459
  Win Rate [%]                           56.427
  Best Trade [%]                        12.9267
  Worst Trade [%]        

In [19]:
# unpack the various values held within the optimized stats
returns = {}
sharpe = {}
win_rate = {}
avg_trade = {}
optimized_parameters = {}

for word in training.keys():
    returns[word] = optimized_stats[word][0][6]
    returns['BENCHMARK'] = optimized_stats[word][0][7]
    sharpe[word] = optimized_stats[word][0][10]
    win_rate[word] = optimized_stats[word][0][18]
    avg_trade[word] = optimized_stats[word][0][21]
    optimized_parameters[word] = optimized_stats[word][1].x

In [20]:
# our best words
optimized_words = [i[0] for i in sorted(returns.items(), key = lambda x: x[1], reverse = True)[:20]]
optimized_words_returns = sorted(returns.items(), key = lambda x: x[1], reverse = True)[:20]
optimized_words_returns

[('debt.csv', 795.0575856304988),
 ('gains.csv', 776.8546632537784),
 ('returns.csv', 742.30168151054),
 ('cash.csv', 722.2443648875751),
 ('chance.csv', 721.9696428815842),
 ('ore.csv', 650.0741230444451),
 ('rich.csv', 634.5444777629891),
 ('inflation.csv', 609.8613088584976),
 ('bonds.csv', 586.6496114617042),
 ('stocks.csv', 565.9995778417606),
 ('crisis.csv', 553.2511713751069),
 ('culture.csv', 550.6657504408055),
 ('money.csv', 534.4526466465225),
 ('invest.csv', 515.8156430075932),
 ('markets.csv', 486.4963841792564),
 ('economics.csv', 485.96380880803684),
 ('financial markets.csv', 481.39991105744934),
 ('derivatives.csv', 477.10938211500354),
 ('revenue.csv', 472.6969002535534),
 ('gold.csv', 458.7998455067787)]

In [21]:
optimized_words_sharpe = sorted(sharpe.items(), key = lambda x: x[1], reverse = True)[:20]
optimized_words_sharpe

[('returns.csv', 1.361172605424319),
 ('gains.csv', 1.325912481124905),
 ('cash.csv', 1.311952409501982),
 ('debt.csv', 1.304656954064381),
 ('chance.csv', 1.3004967933211076),
 ('rich.csv', 1.3001859124693909),
 ('ore.csv', 1.2872323893811022),
 ('inflation.csv', 1.2792329011730368),
 ('stocks.csv', 1.2707112852583289),
 ('crisis.csv', 1.2515639939385765),
 ('bonds.csv', 1.2415465423785443),
 ('money.csv', 1.2263825608840988),
 ('culture.csv', 1.2104223090351542),
 ('gold.csv', 1.1953806296506626),
 ('revenue.csv', 1.188780357274654),
 ('derivatives.csv', 1.185968742204323),
 ('portfolio.csv', 1.174702542542661),
 ('invest.csv', 1.1701941400188778),
 ('markets.csv', 1.1527564435301911),
 ('train.csv', 1.1519137878968904)]

In [22]:
optimized_words_parameters = {word:optimized_parameters[word] for word in optimized_words if word != "BENCHMARK"}
optimized_words_parameters

{'debt.csv': [0.10637565512064695, 0.022288941572030208],
 'gains.csv': [0.064259552980693, -0.0003766892728374882],
 'returns.csv': [-0.012044210839731409, -0.021446214502672173],
 'cash.csv': [0.04272270538905243, -0.0015497219816388297],
 'chance.csv': [0.10227483411440266, -0.03167647488061659],
 'ore.csv': [0.13738686433831318, -0.05364414807046748],
 'rich.csv': [0.15440884960589887, -0.06290898795966182],
 'inflation.csv': [-0.018714118798630844, -0.02468613004595792],
 'bonds.csv': [0.11513946014069493, -4.6371130172112585e-05],
 'stocks.csv': [0.11978702370503039, 0.030456134074797275],
 'crisis.csv': [0.1676045055550012, -0.005307253059356948],
 'culture.csv': [-0.004014058326115955, -0.0066484579701269475],
 'money.csv': [0.0497830685273678, -0.013277044778375408],
 'invest.csv': [0.06477677982690558, 0.00441196244537112],
 'markets.csv': [0.11539729024036696, -0.01769341380187643],
 'economics.csv': [0.17013482389157583, 0.031255883303678955],
 'financial markets.csv': [-0.

In [115]:
"""
# non optimized 
returns = {}
sharpe = {}
win_rate = {}
avg_trade = {}
bts = {}

count = 0

for word in word_trends.keys():
    bt = Backtest(word_trends[word], trainingStrat, cash=100_000_000, commission=0)
    stats = bt.run()
    bts[word] = bt
    returns[word] = stats[6]
    returns['BENCHMARK'] = stats[7]
    sharpe[word] = stats[10]
    win_rate[word] = stats[18]
    avg_trade[word] = stats[21]

returns
"""

"\n# non optimized \nreturns = {}\nsharpe = {}\nwin_rate = {}\navg_trade = {}\nbts = {}\n\ncount = 0\n\nfor word in word_trends.keys():\n    bt = Backtest(word_trends[word], trainingStrat, cash=100_000_000, commission=0)\n    stats = bt.run()\n    bts[word] = bt\n    returns[word] = stats[6]\n    returns['BENCHMARK'] = stats[7]\n    sharpe[word] = stats[10]\n    win_rate[word] = stats[18]\n    avg_trade[word] = stats[21]\n\nreturns\n"

# Testing

In [23]:
# all positive words here have sharpe ratio > 1
testing_words = ["invest.csv", "money.csv", "financial markets.csv", "bonds.csv", "stocks.csv", "gains.csv", "derivatives.csv", "gold.csv", "dividend.csv", "rich.csv","revenue.csv", "returns.csv", "nasdaq.csv", "dow jones.csv"]
testing_words

['invest.csv',
 'money.csv',
 'financial markets.csv',
 'bonds.csv',
 'stocks.csv',
 'gains.csv',
 'derivatives.csv',
 'gold.csv',
 'dividend.csv',
 'rich.csv',
 'revenue.csv',
 'returns.csv',
 'nasdaq.csv',
 'dow jones.csv']

In [24]:
# slight modifications to testing dfs to let backtesting work
for word in testing_words:
    testing[word]['word'] = str(word)

In [37]:
from backtesting import Strategy
from backtesting.lib import crossover

def getMovingAvg(df):
    return pd.Series(df['change moving avg'])

def getOptimizedParameters(key_word):
    return optimized_parameters[key_word]

def getOptimizedParameters2(df):
    return optimized_parameters[df['word'][0]]

class testingStrat(Strategy):
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        self.optimized_parameters = getOptimizedParameters2(self.data) # grabs the word from the df, had to add a new column with name
        #print(self.optimized_parameters)
        self.high = self.optimized_parameters[0]
        self.low = self.optimized_parameters[1]
        
    def next(self):
        if self.change[-1] > self.high:
            self.position.close()
            #print("buying")
            self.buy()

        elif self.change[-1] < self.low:
            self.position.close()
            #print("selling")
            self.sell()
        else:
            self.position.close()

In [38]:
# testing
testing_stats = {}
for word in testing_words:
    bt = Backtest(testing[word], testingStrat, cash=100_000_000, commission=0)
    stats = bt.run() # we no longer optimize, but we run and grab the optimized parameters from training
    testing_stats[word] = stats
    
testing_stats

{'invest.csv': Start                     2019-01-08 00:00:00
 End                       2020-10-27 00:00:00
 Duration                    658 days 00:00:00
 Exposure Time [%]                     48.4211
 Equity Final [$]                  7.99077e+07
 Equity Peak [$]                   1.04475e+08
 Return [%]                           -20.0923
 Buy & Hold Return [%]                 57.2003
 Return (Ann.) [%]                    -44.8426
 Volatility (Ann.) [%]                 26.0073
 Sharpe Ratio                                0
 Sortino Ratio                               0
 Calmar Ratio                                0
 Max. Drawdown [%]                    -37.0354
 Avg. Drawdown [%]                    -37.0354
 Max. Drawdown Duration      623 days 00:00:00
 Avg. Drawdown Duration      623 days 00:00:00
 # Trades                                   31
 Win Rate [%]                          51.6129
 Best Trade [%]                         29.398
 Worst Trade [%]                      -19.2485

In [39]:
# unpack the various values held within the stats
testing_returns = {}
testing_sharpe = {}
testing_win_rate = {}
testing_avg_trade = {}

for word in testing_words:
    testing_returns[word] = testing_stats[word][6]
    testing_returns["BENCHMARK"] = testing_stats[word][7]
    testing_sharpe[word] = testing_stats[word][10]
    testing_win_rate[word] = testing_stats[word][18]
    testing_avg_trade[word] = testing_stats[word][21]

In [40]:
testing_optimized_words = [i[0] for i in sorted(testing_returns.items(), key = lambda x: x[1], reverse = True)[:20]]
testing_optimized_words_returns = sorted(testing_returns.items(), key = lambda x: x[1], reverse = True)[:20]
testing_optimized_words_returns

[('revenue.csv', 81.82694515669252),
 ('BENCHMARK', 57.20029604887874),
 ('nasdaq.csv', 50.70472424835205),
 ('dow jones.csv', 50.70472424835205),
 ('dividend.csv', 46.41957499150086),
 ('financial markets.csv', 46.09239384083557),
 ('gains.csv', 44.91937185003662),
 ('returns.csv', 32.270215500839235),
 ('derivatives.csv', 29.67904024394989),
 ('stocks.csv', 26.51364630142975),
 ('bonds.csv', 12.306358339920044),
 ('rich.csv', 0.0),
 ('gold.csv', -1.4491724515533446),
 ('money.csv', -3.217528088401794),
 ('invest.csv', -20.09227675113678)]

In [36]:
testing_sharpe_words = sorted(testing_sharpe.items(), key = lambda x: x[1], reverse = True)[:20]
testing_sharpe_words

[('bonds.csv', 1.159740710369638),
 ('stocks.csv', 1.159740710369638),
 ('dividend.csv', 1.159740710369638),
 ('revenue.csv', 1.159740710369638),
 ('nasdaq.csv', 1.159740710369638),
 ('dow jones.csv', 1.159740710369638),
 ('gains.csv', 1.1212821565966455),
 ('money.csv', 1.0465091565277076),
 ('financial markets.csv', 0.9652204973200323),
 ('derivatives.csv', 0.8825918758438688),
 ('invest.csv', 0.87613931701041),
 ('gold.csv', 0.1998419507119276),
 ('rich.csv', 0.0),
 ('returns.csv', 0.0)]

### Random Alg

In [None]:
import random

# randomly buy and sell 
class randomStrat(Strategy):
    def init(self):
        self.change = self.I(ourFunc1, self.data)
    def next(self):
        if random.choice([True, False]):
            self.position.close()
            self.buy()
        else:
            self.position.close()
            self.sell()

In [None]:
random_returns = []
for x in range(201):
    bt = Backtest(word_trends["debt"], randomStrat, cash=100_000_000, commission=0) # initializes
    random_stats = bt.run()
    # returns["RANDOM"] = random_stats[6]
    random_returns.append(random_stats[6])

In [None]:
np.mean(random_returns)

In [None]:
stats['Return [%]']

In [21]:
# by the paper
from backtesting import Strategy
from backtesting.lib import crossover

class trainingStrat(Strategy):
    high = 0
    low = 0
    
    def init(self):
        self.change = self.I(getMovingAvg, self.data)
        high = self.high
        low = self.low
        self.buy_next_week = False
        self.sell_next_week = False

    def next(self):
        assert not (self.buy_next_week == True and self.sell_next_week == True)
        
        if self.buy_next_week:
            self.buy()
            self.buy_next_week = False
        
        if self.sell_next_week:
            self.sell()
            self.sell_next_week = False
        
        if self.change[-1] > self.high:
            #self.position.close()
            #print("selling this week, buying next week")
            self.sell()
            self.buy_next_week = True
            self.sell_next_week = False

        elif self.change[-1] < self.low:
            #self.position.close()
            #print("buying this week, selling next week")
            self.buy()
            self.sell_next_week = True
            self.buy_next_week = False