This notebook uses the `pytrends` library to download Google Trends data for a list of keywords.

In [1]:
from pytrends.request import TrendReq
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import math
from tqdm.auto import tqdm

In [2]:
# --- Query settings -------------------------------------------------------

# Search terms to request from Google Trends.
kw_list = ["dengue"]

# Date range of the request, both in YYYY-MM-DD format.
date_start = '2009-01-01'
date_end = '2022-12-01'

# Geography code; 'SG' restricts results to Singapore.
geo = 'SG'

# Timezone offset in minutes relative to UTC, passed to TrendReq as `tz`.
tz = -480

# Host language, passed to TrendReq as `hl`.
hl = "en-US"

# --- Retry settings -------------------------------------------------------

# Backoff factor applied between attempts, starting after the second try
# (most errors are resolved immediately by a second try without a delay).
# urllib3 sleeps for:
#     {backoff factor} * (2 ^ ({number of total retries} - 1)) seconds
# e.g. a factor of 0.1 gives sleeps of [0.0s, 0.2s, 0.4s, …] between retries,
# never longer than Retry.BACKOFF_MAX. Backoff is disabled by default (0).
backoff_factor = 0.5

In [3]:
# Build the pytrends client used for every request in this notebook.
# `retries` is set explicitly: pytrends defaults it to 0, and with no retries
# allowed the backoff_factor configured above would never be applied.
pt=TrendReq(hl=hl,tz=tz,retries=3,backoff_factor=backoff_factor)

In [4]:
def call_query(pt,date_start,date_end,progress,keywords=None,region=None,chunk_months=6):
    """
    Download interest-over-time data for a date range, one fixed-size window at a time.

    The range is covered by consecutive windows of `chunk_months` months, starting
    at `date_start`. One request is made per window; the results are concatenated
    and then trimmed to [date_start, date_end], both ends inclusive.

    pt: pytrends object (anything exposing build_payload() and interest_over_time())
    date_start: start date in YYYY-MM-DD
    date_end: end date in YYYY-MM-DD (inclusive)
    progress: progress bar; progress.update(1) is called once per request
    keywords: list of search terms; defaults to the notebook-level `kw_list`
    region: geography code; defaults to the notebook-level `geo`
    chunk_months: length of each request window in months (default 6)
    return: dataframe covering the requested date range (empty if no window
            returned any rows)
    raises: ValueError if a date is not in YYYY-MM-DD format or chunk_months < 1

    NOTE(review): Google Trends presumably scales every request independently,
    so values coming from different windows may not be directly comparable.
    Confirm before analysing across window boundaries.
    """
    if chunk_months<1:
        # A non-positive step would never advance past date_end.
        raise ValueError("chunk_months must be at least 1")

    # Fall back to the notebook-level settings when not given explicitly.
    if keywords is None:
        keywords=kw_list
    if region is None:
        region=geo

    # strptime enforces the YYYY-MM-DD format; Timestamp allows month arithmetic.
    start=pd.Timestamp(datetime.strptime(date_start,"%Y-%m-%d"))
    end=pd.Timestamp(datetime.strptime(date_end,"%Y-%m-%d"))
    step=pd.DateOffset(months=chunk_months)
    one_day=pd.Timedelta(days=1)

    frames=[]
    q_start=start
    while True:
        # Each window spans [q_start, q_end - 1 day]; the next one starts at q_end.
        q_end=q_start+step
        timeframe=q_start.strftime("%Y-%m-%d")+" "+(q_end-one_day).strftime("%Y-%m-%d")

        progress.update(1)
        pt.build_payload(keywords,timeframe=timeframe,geo=region)
        chunk=pt.interest_over_time()
        if not chunk.empty:
            frames.append(chunk)

        # Stop only once the windows fetched so far reach past date_end, so the
        # end date itself is always inside a requested window.
        if q_end>end:
            break
        q_start=q_end

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of pairwise at every step.
    df=pd.concat(frames)

    # The last window may extend beyond date_end; keep only the requested range.
    if isinstance(df.index,pd.DatetimeIndex):
        df=df[(df.index>=start)&(df.index<=end)]

    return df

First, work out how many 6-month chunks will be requested, so that a progress bar can be shown.

In [5]:
# Estimate the number of 6-month requests from the count of whole months in the range.
start=datetime.strptime(date_start,"%Y-%m-%d")
end=datetime.strptime(date_end,"%Y-%m-%d")
r = relativedelta(end, start)
mth = (r.years * 12) + r.months
# call_query always issues at least one request, so never size the bar at 0
# (which is what ceil() alone gives for ranges shorter than one month).
rounds=max(1,math.ceil(mth/6))

# The context manager closes the progress bar even if a request raises.
with tqdm(total=rounds) as bar:
    df=call_query(pt,date_start,date_end,progress=bar)

# Drop the "isPartial" flag column. errors="ignore" keeps this cell from raising
# KeyError when the result has no such column (e.g. nothing was returned).
df=df.drop(columns="isPartial",errors="ignore")

  0%|          | 0/28 [00:00<?, ?it/s]

Check for missing dates in the dataframe.
If there are any missing dates, they will be listed below.

In [6]:
# Every calendar day between the configured start and end dates (inclusive).
expected_days=pd.date_range(start=date_start,end=date_end)

# Days that are expected but absent from the downloaded index.
missing_days=expected_days.difference(df.index)
print(missing_days.to_list())

[Timestamp('2011-01-01 00:00:00'), Timestamp('2011-01-02 00:00:00'), Timestamp('2011-01-03 00:00:00'), Timestamp('2011-01-04 00:00:00'), Timestamp('2011-01-05 00:00:00'), Timestamp('2011-01-06 00:00:00'), Timestamp('2011-01-07 00:00:00'), Timestamp('2011-01-08 00:00:00'), Timestamp('2011-01-09 00:00:00'), Timestamp('2011-01-10 00:00:00'), Timestamp('2011-01-11 00:00:00'), Timestamp('2011-01-12 00:00:00'), Timestamp('2011-01-13 00:00:00'), Timestamp('2011-01-14 00:00:00'), Timestamp('2011-01-15 00:00:00'), Timestamp('2011-01-16 00:00:00'), Timestamp('2011-01-17 00:00:00'), Timestamp('2011-01-18 00:00:00'), Timestamp('2011-01-19 00:00:00'), Timestamp('2011-01-20 00:00:00'), Timestamp('2011-01-21 00:00:00'), Timestamp('2011-01-22 00:00:00'), Timestamp('2011-01-23 00:00:00'), Timestamp('2011-01-24 00:00:00'), Timestamp('2011-01-25 00:00:00'), Timestamp('2011-01-26 00:00:00'), Timestamp('2011-01-27 00:00:00'), Timestamp('2011-01-28 00:00:00'), Timestamp('2011-01-29 00:00:00'), Timestamp('20

In [7]:
# Write the result to a CSV file named after the first keyword.
output_path=f"../assets/gtrends/gtrends_{kw_list[0]}.csv"
df.to_csv(output_path)