forked from GeneralMills/pytrends
-
Notifications
You must be signed in to change notification settings - Fork 3
/
renormalize.py
134 lines (114 loc) · 4.44 KB
/
renormalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
## INITIAL COMMENTS ##
# Re-scale Trends data so that all values are on the same scale.
# Why do we do this?
# To obtain data at a certain resolution from Google Trends,
# it must be requested in batches (for example, 1-hour data
# needs to be requested in 7-day intervals).
# However, the values that the Trends API returns are scaled
# from 0-100 within that batch, so it is impossible to compare
# data points from different batches.
# This script re-scales the data by requesting batches with
# overlapping time periods.
## SETUP ##
from datetime import datetime, timedelta
from pytrends.request import TrendReq
import pandas as pd
import os
import sys
import time
# Parse the command-line mode selector: exactly one of 'hourly' or 'daily'.
daily = False
hourly = False
USAGE = "Usage: python scaledata.py <hourly/daily>"
if len(sys.argv) != 2 or sys.argv[1] not in ('hourly', 'daily'):
    # Wrong argument count or unrecognized mode: one guard instead of two
    # duplicated usage branches.
    print(USAGE)
    # sys.exit with a small positive code; exit(-1) wraps to 255 on POSIX
    # and `exit` is the interactive helper, not the script idiom.
    sys.exit(1)
elif sys.argv[1] == 'hourly':
    hourly = True
else:
    daily = True
path = '.'
os.chdir(path)
# Output CSV written at the very end of the script.
filename = 'pichai.csv'
# Google Trends serves daily data only for timeframes up to ~270 days,
# so a request can step back at most 269 days. Because zero values in
# the overlap region break rescaling, keep a generous multi-period
# overlap rather than a single shared point:
# step = 269 - 40 + 1 = 230 days per request, 40 days of overlap.
if daily:
    maxstep=269
    overlap=40
    step = maxstep - overlap + 1
    dt = timedelta(days=step)
    time_fmt = '%Y-%m-%d'
# Hourly resolution is only served for 7-day (168-hour) windows.
elif hourly:
    overlap = 18
    step = 168
    dt = timedelta(hours=step)
    time_fmt = '%Y-%m-%dT%H'
    # NOTE(review): the anchor dates below are datetime.date objects, and
    # date +/- timedelta keeps only whole days, so hour-granularity
    # offsets silently truncate -- confirm hourly mode against real data
    # or switch the anchors to datetime before relying on it.
# Search terms to download (one renormalized column per keyword).
kw_list = ['pichai']
# Earliest date to fetch back to; the loop below stops here.
start_date = datetime(2017, 1, 1).date()
## FIRST RUN ##
# Start a pytrends session; the same session is reused for every request.
pytrend = TrendReq()
# Anchor the most recent window at a fixed end date so runs are
# reproducible; uncomment the next line to anchor at the current date.
today = datetime(2019, 3, 1).date()
#today = datetime.today().date()
old_date = today
# Step one window into the past to get the start of the first timeframe.
new_date = today - dt #timedelta(hours=step)
# Timeframe string in the "<start> <end>" format Google Trends expects.
timeframe = new_date.strftime(time_fmt)+' '+old_date.strftime(time_fmt)
print(timeframe)
# Download the first batch; every later batch is rescaled onto its scale.
pytrend.build_payload(kw_list=kw_list, timeframe = timeframe)
interest_over_time_df = pytrend.interest_over_time()
## RUN ITERATIONS
# Walk backwards from the most recent window towards start_date,
# downloading overlapping batches and rescaling each one onto the scale
# of the data already collected in interest_over_time_df.
while new_date > start_date:
    ### End the next (older) window overlap-1 periods after its start,
    # so consecutive windows share exactly `overlap` data points.
    # BUG FIX: the offset must use the unit of the data resolution.
    # date + timedelta keeps only whole days, so timedelta(hours=39) on a
    # date silently became +1 day in daily mode, shrinking the real
    # overlap to ~1 day while temp_df[:-overlap] still dropped 40 rows.
    if daily:
        old_date = new_date + timedelta(days=overlap - 1)
    else:
        # NOTE(review): with date (not datetime) anchors this still
        # truncates for hourly mode -- see the config section note.
        old_date = new_date + timedelta(hours=overlap - 1)
    ### Step one overlap-reduced window further into the past.
    new_date = new_date - dt
    # Clamp so the final window does not overshoot start_date.
    if new_date < start_date:
        new_date = start_date
    # Request the new window.
    timeframe = new_date.strftime(time_fmt) + ' ' + old_date.strftime(time_fmt)
    print(timeframe)
    pytrend.build_payload(kw_list=kw_list, timeframe=timeframe)
    temp_df = pytrend.interest_over_time()
    if temp_df.empty:
        raise ValueError(
            'Google sent back an empty dataframe. Possibly there were no '
            'searches at all during this period! Set start_date to a later date.')
    # Rescale the new batch onto the existing scale via the overlap region.
    for kw in kw_list:
        beg = new_date
        end = old_date - timedelta(hours=1)
        # Zeros cannot serve as a reference, so scan the overlap
        # (newest point first) until a non-zero element is found.
        for t in range(1, overlap + 1):
            if temp_df[kw].iloc[-t] != 0:
                # BUG FIX: pair matching timestamps. temp_df.iloc[-t] is
                # the t-th NEWEST shared point, whose partner in the
                # already-collected frame is .iloc[overlap - t] (the
                # original .iloc[t - 1] picked the t-th OLDEST point,
                # comparing values from different timestamps).
                scaling = float(interest_over_time_df[kw].iloc[overlap - t]) / temp_df[kw].iloc[-t]
                print(scaling)
                break
            elif t == overlap:
                print('Did not find non-zero overlap, set scaling to zero! Increase Overlap!')
                scaling = 0
        # Apply the scale factor to the whole new window.
        temp_df.loc[beg:end, kw] = temp_df.loc[beg:end, kw] * scaling
    # Keep only the non-overlapping part of the new batch and prepend it.
    interest_over_time_df = pd.concat([temp_df[:-overlap], interest_over_time_df])
    # Be polite to the API to avoid rate limiting.
    time.sleep(1)
# Save dataset
interest_over_time_df.to_csv(filename)