# **Combine LSTM model with Google Trends Data**

In [None]:
!pip install pytrends

Collecting pytrends
  Downloading https://files.pythonhosted.org/packages/74/a4/c1b1242be7d31650c6d9128a776c753db18f0e83290aaea0dd80dd31374b/pytrends-4.7.2.tar.gz
Building wheels for collected packages: pytrends
  Building wheel for pytrends (setup.py) ... [?25l[?25hdone
  Created wheel for pytrends: filename=pytrends-4.7.2-cp36-none-any.whl size=14261 sha256=82e10f4edf21594e14160528fffe6ab5dd01acdf611c73f3f8c0f64b9a9e59ae
  Stored in directory: /root/.cache/pip/wheels/64/ae/af/51d48fbbca0563036c6f80999b7ce3f097fa591fd165047baf
Successfully built pytrends
Installing collected packages: pytrends
Successfully installed pytrends-4.7.2


In [None]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
import joblib
from pytrends.exceptions import ResponseError
from pytrends.request import TrendReq
from datetime import datetime
from datetime import date, timedelta
from functools import partial
from time import sleep
from calendar import monthrange


Using TensorFlow backend.


**Fetch Google Trends search-interest data with pytrends**


In [None]:
def preprocess(list_of_files:list):
  """Load the bitcoin price file(s) and download the Google Trends series.

  The price CSV files are read (columns at positions 0 and 4, which must be
  named 'date' and 'close'), concatenated, sorted chronologically and
  forward-filled. Daily Google Trends search interest for the keyword
  'bitcoin' is then downloaded with pytrends (network access required; the
  download sleeps between requests and can take several minutes).

  Arguments:
    list_of_files: a list of names of the price data (csv) files. The list
      must contain at least one name; it can have length 1.
  Returns:
    X1: the bitcoin price as input to model(): every row of the price frame
      except the last one (columns 'date' and 'close').
    X2: the additional data, pulled from the internet: the scaled daily
      search interest in column 'bitcoin', with the date index of the
      Trends frame turned into a regular column by reset_index().
    y: the bitcoin price as targets, i.e. the true bitcoin price (to be used
      for training and evaluation): every row of the price frame except the
      first one.
  Raises:
    ValueError: if list_of_files is empty.
    ResponseError: if Google Trends keeps rejecting a request after the
      retries are exhausted.
  """
  if not list_of_files:
    raise ValueError('list_of_files must contain at least one csv file name')

  # Read every price file and stack them into one frame with a fresh index.
  price_frames = [pd.read_csv(name, usecols=[0, 4]) for name in list_of_files]
  df = pd.concat(price_frames, ignore_index=True)

  # Sort chronologically (stable sort) so that forward-filling and the
  # one-row shift between X1 and y below operate on consecutive dates.
  df['date'] = pd.to_datetime(df['date'])
  df = df.sort_values(by='date', kind='mergesort').reset_index(drop=True)

  # Impute missing closing prices with the previous available value.
  df['close'] = df['close'].ffill()

  # Row i of X1 is paired with row i + 1 of the price frame as its target.
  X1, y = df.iloc[:-1], df.iloc[1:]


  # Pull data from Google Trends

  def get_last_date_of_month(year: int, month: int) -> date:
      """Given a year and a month returns an instance of the date class
      containing the last day of the corresponding month.
      Source: https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python
      """
      return date(year, month, monthrange(year, month)[1])


  def convert_dates_to_timeframe(start: date, stop: date) -> str:
      """Given two dates, returns a stringified version of the interval between
      the two dates ('YYYY-MM-DD YYYY-MM-DD') which is used to retrieve data
      for a specific time frame from Google Trends.
      """
      return f"{start.strftime('%Y-%m-%d')} {stop.strftime('%Y-%m-%d')}"


  def _fetch_data(pytrends, build_payload, timeframe: str) -> pd.DataFrame:
      """Builds the payload for `timeframe` and returns interest_over_time().

      A ResponseError triggers a retry after 60, 65 and 70 seconds. If the
      request still fails after these retries the ResponseError is re-raised,
      because calling interest_over_time() without a freshly built payload
      would not return data for the requested time frame.
      """
      max_retries = 3
      failures = 0
      while True:
          try:
              build_payload(timeframe=timeframe)
          except ResponseError as err:
              print(err)
              failures += 1
              if failures > max_retries:
                  print(f'Failed after {max_retries} retries, abort fetching.')
                  raise
              wait_seconds = 60 + 5 * (failures - 1)
              print(f'Trying again in {wait_seconds} seconds.')
              sleep(wait_seconds)
          else:
              return pytrends.interest_over_time()


  def get_daily_data(word: str,
                  start_year: int,
                  start_mon: int,
                  stop_year: int,
                  stop_mon: int,
                  geo: str = 'US',
                  verbose: bool = True,
                  wait_time: float = 5.0) -> pd.DataFrame:
      """Given a word, fetches daily search volume data from Google Trends and
      returns results in a pandas DataFrame.
      Details: Due to the way Google Trends scales and returns data, special
      care needs to be taken to make the daily data comparable over different
      months. To do that, we download daily data on a month by month basis,
      and also data for the whole period in a single request. The whole-period
      data is downloaded in one go, so that its values are comparable amongst
      themselves and can be used to scale the daily data. The daily data is
      scaled by multiplying the daily value by the whole-period search volume
      divided by 100.
      For a more detailed explanation see http://bit.ly/trendsscaling
      Args:
          word (str): Word to fetch daily data for.
          start_year (int): the start year
          start_mon (int): start 1st day of the month
          stop_year (int): the end year
          stop_mon (int): end at the last day of the month
          geo (str): geolocation
          verbose (bool): If True, then prints the word and current time frame
              we are fetching the data for.
          wait_time (float): seconds to sleep after each monthly request.
      Returns:
          complete (pd.DataFrame): indexed like the concatenated daily frames.
              The column named after the word argument contains the daily search
              volume already scaled and comparable through time.
              The column f'{word}_unscaled' is the original daily data fetched
              month by month, and it is not comparable across different months
              (but is comparable within a month).
              The column f'{word}_monthly' contains the data fetched at once
              for the whole period, forward-filled onto the daily index.
              The column 'scale' contains the scale used to obtain the scaled
              daily data.
              Any other column of the whole-period frame (the caller drops
              'isPartial') is carried over unchanged by the join.
      Raises:
          ResponseError: if a request keeps failing after the retries.
      """

      # Set up start and stop dates
      start_date = date(start_year, start_mon, 1)
      stop_date = get_last_date_of_month(stop_year, stop_mon)

      # NOTE(review): tz=360 is passed to pytrends as the time-zone offset in
      # minutes - confirm the intended zone and sign convention.
      pytrends = TrendReq(hl='en-US', tz=360)
      # Initialize build_payload with the word we need data for
      build_payload = partial(pytrends.build_payload,
                              kw_list=[word], cat=0, geo=geo, gprop='')

      # Obtain the reference data for the whole period in a single request
      monthly = _fetch_data(pytrends, build_payload,
                          convert_dates_to_timeframe(start_date, stop_date))

      # Get daily data, month by month
      results = {}
      current = start_date
      while current < stop_date:
          last_date_of_month = get_last_date_of_month(current.year, current.month)
          timeframe = convert_dates_to_timeframe(current, last_date_of_month)
          if verbose:
              print(f'{word}:{timeframe}')
          results[current] = _fetch_data(pytrends, build_payload, timeframe)
          current = last_date_of_month + timedelta(days=1)
          sleep(wait_time)  # don't go too fast or Google will send 429s

      daily = pd.concat(results.values()).drop(columns=['isPartial'])
      complete = daily.join(monthly, lsuffix='_unscaled', rsuffix='_monthly')

      # Scale daily data by the whole-period weights so the data is comparable.
      # Days without a reference value take the most recent earlier one.
      monthly_col = f'{word}_monthly'
      complete[monthly_col] = complete[monthly_col].ffill()
      complete['scale'] = complete[monthly_col] / 100
      complete[word] = complete[f'{word}_unscaled'] * complete['scale']

      return complete

  X2 = get_daily_data('bitcoin', 2017, 1, 2020, 2, geo='US')
  # Keep only the scaled daily series; the index becomes a regular column.
  X2 = X2.drop(columns=['bitcoin_unscaled', 'bitcoin_monthly', 'isPartial', 'scale'])
  X2 = X2.reset_index()

  return X1, X2, y
  


**Retrieve the LSTM model**

In [None]:
def model(filename_a1:str, X1):
  """Run the saved first-stage model on the price data.

  Arguments:
    filename_a1: path of the model file to load with joblib.
    X1: DataFrame with a 'date' column; all remaining columns are scaled and
      fed to the model. The reshape to (n, 1, 1) below requires exactly one
      remaining column.
  Returns:
    DataFrame with the de-normalised predictions in column 'y1_hat' and the
    dates of the corresponding X1 rows (as datetimes) in column 'date'.
  """
  # NOTE: joblib.load unpickles the file, which can execute arbitrary code -
  # only load model files from a trusted source.
  a1_model = joblib.load(filename_a1)

  features = X1.drop(columns=['date'])
  # NOTE(review): the shared module-level scaler is re-fitted on the data
  # passed in here - confirm this matches the scaling used to train the model.
  X1_norm = min_max_scaler.fit_transform(features.values)
  X1_norm = np.reshape(X1_norm, (len(X1_norm), 1, 1))

  result = a1_model.predict(X1_norm)
  y1 = min_max_scaler.inverse_transform(result)

  y1_hat = pd.DataFrame(y1,columns=['y1_hat'])
  # Attach the dates by position: assigning the Series itself would align on
  # index labels and yield NaT whenever X1 does not carry a 0..n-1 index.
  y1_hat['date'] = pd.to_datetime(X1['date'].to_numpy())
  return y1_hat

In [None]:
# Upload the saved model files into the Colab workspace. The returned dict
# (file name -> raw bytes) is bound to a name so that the binary file content
# is not echoed into the notebook output.
from google.colab import files
uploaded = files.upload()


Saving combined_model_Xi Kang.sav to combined_model_Xi Kang.sav


{'combined_model_Xi Kang.sav': b'\x80\x03csklearn.linear_model._base\nLinearRegression\nq\x00)\x81q\x01}q\x02(X\r\x00\x00\x00fit_interceptq\x03\x88X\t\x00\x00\x00normalizeq\x04\x89X\x06\x00\x00\x00copy_Xq\x05\x88X\x06\x00\x00\x00n_jobsq\x06NX\x05\x00\x00\x00coef_q\x07cjoblib.numpy_pickle\nNumpyArrayWrapper\nq\x08)\x81q\t}q\n(X\x08\x00\x00\x00subclassq\x0bcnumpy\nndarray\nq\x0cX\x05\x00\x00\x00shapeq\rK\x02\x85q\x0eX\x05\x00\x00\x00orderq\x0fX\x01\x00\x00\x00Cq\x10X\x05\x00\x00\x00dtypeq\x11cnumpy\ndtype\nq\x12X\x02\x00\x00\x00f8q\x13K\x00K\x01\x87q\x14Rq\x15(K\x03X\x01\x00\x00\x00<q\x16NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00tq\x17bX\n\x00\x00\x00allow_mmapq\x18\x88ub\xab\xd1\xa5Bq6\xf6?X!\x13.e\x13\xf0?X\t\x00\x00\x00_residuesq\x19cnumpy.core.multiarray\nscalar\nq\x1ah\x15C\x08\xdf\xcce4\x86\x1f\xf6@q\x1b\x86q\x1cRq\x1dX\x05\x00\x00\x00rank_q\x1eK\x02X\t\x00\x00\x00singular_q\x1fh\x08)\x81q }q!(h\x0bh\x0ch\rK\x02\x85q"h\x0fh\x10h\x11h\x15h\x18\x88ub\xd0-s\xf4\xdb<\xe8@\x12\xcd/\x81\

In [None]:
# Build the model inputs from the daily BTC/USD price file in the Colab
# workspace; this also downloads the Google Trends data.
price_csv_files = ['/content/BTCUSD_1d_2011-09-13_to_2019-10-23_bitstamp.csv']
X1, X2, y = preprocess(list_of_files=price_csv_files)

bitcoin:2017-01-01 2017-01-31
bitcoin:2017-02-01 2017-02-28
bitcoin:2017-03-01 2017-03-31
bitcoin:2017-04-01 2017-04-30
bitcoin:2017-05-01 2017-05-31
bitcoin:2017-06-01 2017-06-30
bitcoin:2017-07-01 2017-07-31
bitcoin:2017-08-01 2017-08-31
bitcoin:2017-09-01 2017-09-30
bitcoin:2017-10-01 2017-10-31
bitcoin:2017-11-01 2017-11-30
bitcoin:2017-12-01 2017-12-31
bitcoin:2018-01-01 2018-01-31
bitcoin:2018-02-01 2018-02-28
bitcoin:2018-03-01 2018-03-31
bitcoin:2018-04-01 2018-04-30
bitcoin:2018-05-01 2018-05-31
bitcoin:2018-06-01 2018-06-30
bitcoin:2018-07-01 2018-07-31
bitcoin:2018-08-01 2018-08-31
bitcoin:2018-09-01 2018-09-30
bitcoin:2018-10-01 2018-10-31
bitcoin:2018-11-01 2018-11-30
bitcoin:2018-12-01 2018-12-31
bitcoin:2019-01-01 2019-01-31
bitcoin:2019-02-01 2019-02-28
bitcoin:2019-03-01 2019-03-31
bitcoin:2019-04-01 2019-04-30
bitcoin:2019-05-01 2019-05-31
bitcoin:2019-06-01 2019-06-30
bitcoin:2019-07-01 2019-07-31
bitcoin:2019-08-01 2019-08-31
bitcoin:2019-09-01 2019-09-30
bitcoin:20

In [None]:
# First-stage forecast: run the saved model on the price inputs.
y1_hat = model(filename_a1='model_a1_Xi Kang.sav', X1=X1)











Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




**Train the combined model**

In [None]:
# Join the first-stage predictions with the Google Trends series on 'date'
# (only dates present in both are kept), then attach the closing price.
# NOTE(review): y1_hat rows carry the date of their X1 input row, so joining
# y on 'date' pairs each prediction with the close of that same date -
# confirm this is the intended target alignment.
merged_df = (
    y1_hat
    .merge(X2, on='date', how='inner')
    .merge(y, on='date', how='left')
)
print(merged_df)

           y1_hat       date  bitcoin    close
0     1000.261658 2017-01-01     2.58   997.75
1     1015.111084 2017-01-02     4.32  1012.54
2     1037.899780 2017-01-03     4.08  1035.24
3     1117.898438 2017-01-04     4.86  1114.92
4     1007.279541 2017-01-05     6.00  1004.74
...           ...        ...      ...      ...
1020  7959.320312 2019-10-18     2.52  7955.08
1021  7964.690430 2019-10-19     2.34  7960.49
1022  8237.872070 2019-10-20     3.04  8235.74
1023  8215.957031 2019-10-21     3.36  8213.65
1024  8029.629883 2019-10-22     3.28  8025.90

[1025 rows x 4 columns]


In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Fraction of the merged rows used for fitting; the rest is held out.
train_size = 0.8
regr = linear_model.LinearRegression()

# Features: scaled search interest and first-stage prediction; target: close.
X = merged_df[['bitcoin', 'y1_hat']].values
yt = merged_df['close'].values

# Pass the fraction as train_size (not test_size) so that 80% of the rows are
# used for training. random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, yt, train_size=train_size, random_state=0)

regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
print('coefficients:\n',regr.coef_)
print('Mean squared error: %.2f'%metrics.mean_squared_error(y_test, y_pred))

# Side-by-side view of held-out targets and predictions.
comp_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comp_df.head(25))

coefficients:
 [1.351937   1.00473551]
Mean squared error: 486.90
      Actual     Predicted
0    3998.13   4002.245392
1     918.60    891.876552
2    1017.97    993.562172
3    8067.00   8083.585294
4    7806.07   7816.301637
5    1012.54    991.059693
6    4099.55   4111.657795
7    5503.36   5512.085276
8   10850.00  10852.414790
9    3885.87   3890.933994
10   8946.95   8964.003281
11   9345.11   9355.847503
12   4822.01   4832.582708
13  15155.62  15084.973795
14   9648.00   9663.376774
15   9230.00   9252.208291
16   6437.29   6450.051029
17   3835.79   3836.062675
18  10808.99  10815.097819
19   7617.98   7631.578915
20   1039.92   1014.742537
21   1071.02   1047.130067
22   7396.60   7413.179002
23   1170.34   1147.196908
24   3576.93   3577.233553


**Forecast using combined model**

In [None]:
def combiner(y1_hat, X2, filename_comb):
  """Produce the second-stage forecast from the saved combined model.

  Arguments:
    y1_hat: DataFrame with columns 'y1_hat' and 'date'.
    X2: DataFrame with columns 'bitcoin' and 'date'.
    filename_comb: path of the combined model file to load with joblib.
  Returns:
    DataFrame with the predictions in column 'y2_hat' and the matching dates
    in column 'date', one row per date present in both inputs.
  """
  # Only dates available in both inputs can be scored.
  joined = pd.merge(y1_hat, X2, on='date', how= 'inner')
  feature_matrix = joined[['bitcoin','y1_hat']].values

  # NOTE: joblib.load unpickles the file - only load trusted model files.
  estimator = joblib.load(filename_comb)
  forecast = estimator.predict(feature_matrix)

  return pd.DataFrame(forecast, columns=['y2_hat']).assign(date=joined['date'])

In [None]:
# Second-stage forecast: combine the first-stage prediction with the trends data.
y2_hat = combiner(y1_hat=y1_hat, X2=X2,
                  filename_comb='combined_model_Xi Kang.sav')



In [None]:
# Persist the combined forecast without the row index.
y2_hat.to_csv(path_or_buf='y2_hat.csv', index=False)