In [10]:
####-----------------------------------------------------------------------------####
#   title: "IEOR 135 Group Project -- commodity trends (Built for RiskEx)"          #
#   author: Elias Castro Hernandez                                                  #
#   date: "March 2018"                                                              # 
#   purpose: Iterate over daily Google API to extract trend data relative to topic  #
####-----------------------------------------------------------------------------####

In [8]:
# Import Packages and Libraries
import requests # HTTP parser
import html5lib

# Web parsing, scraping, etc.
import csv
import time

# data frames and math
import pandas as pd
import numpy as np

# Import output related packages 
import pprint

# Google trends related
import datetime

In [9]:
# make it run on py2 and py3
# Kept as the first statement of the cell: outside IPython a __future__ import
# anywhere but the top of the source is a SyntaxError (e.g. when this notebook
# is exported to a plain .py script).
from __future__ import division, print_function

# stretch Jupyter coding blocks to fit screen
# `IPython.display` is the public import location for display/HTML; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

## Google API related 

### Using the pytrends package from GitHub:
https://github.com/GeneralMills/pytrends

### Connect to Google


In [18]:
# Create the pytrends client that every later cell in this notebook calls.
# tz is the timezone offset described under "API Parameters" (360 is US CST);
# hl is presumably the interface language -- TODO confirm against pytrends docs.
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)


### Build Payload

In [31]:
queries = ["Bitcoin", "bitcoin"] # keyword list
# Register the keywords with the client: category 77, the last hour ('now 1-H'),
# United States only, restricted to the Google News property.
# NOTE(review): the two keywords differ only in capitalisation, and the
# interest_over_time output shown further down has identical values in both
# columns -- confirm whether the duplicate keyword is intentional.
pytrends.build_payload(queries, cat=77, timeframe='now 1-H', geo='US', gprop='news')

#__NOTE:__
#Up to five terms can be included in the list: ['Pizza', 'Italian', 'Spaghetti', 'Breadsticks', 'Sausage']

---
### API Parameters:

__cat__
Category to narrow results
- Find available categories by inspecting the URL when manually using Google Trends: the category starts after cat= and ends before the next &. Alternatively, view the wiki page listing all available categories.
- For examples: https://github.com/pat310/google-trends-api/wiki/Google-Trends-Categories
- '71' is the category
- Defaults to no category


__geo__
Two letter country abbreviation
- For example United States is 'US'
- Defaults to World
- More detail is available for States/Provinces by specifying additional abbreviations
- For example: Alabama would be 'US-AL'
- For example: England would be 'GB-ENG'


__tz__
Timezone Offset
- For example US CST is '360'


__timeframe__
Date to start from

- Defaults to last 5yrs, 'today 5-y'.
- Everything 'all'
- Specific dates, 'YYYY-MM-DD YYYY-MM-DD' 
 - example '2016-12-14 2017-01-25'
- Specific datetimes, 'YYYY-MM-DDTHH YYYY-MM-DDTHH' 
 - example '2017-02-06T10 2017-02-12T07'
 - Note Time component is based off UTC
- Current Time Minus Time Pattern:
 - __By Month:__ 'today #-m' where # is the number of months from that date to pull data for
   - For example: 'today 3-m' would get data from today back to 3 months ago
   - __NOTE__ Google uses UTC date as 'today'
   - Seems to work for 1, 2, or 3 months only
 - __By Day:__ 'now #-d' where # is the number of days from that date to pull data for
   - For example: 'now 7-d' would get data from the last week
   - Seems to work for 1 or 7 days only
 - __By Hourly:__ 'now #-H' where # is the number of hours from that date to pull data for
   - For example: 'now 1-H' would get data from the last hour
   - Seems to work for 1 or 4 hours only


__gprop__
What Google property to filter to
- Example 'images'
- Defaults to web searches
- Can be 'images', 'news', 'youtube' or 'froogle' (for Google Shopping results)
---

---
### Accessing object
The following API methods are available:

<br />
__Interest Over Time:__ returns historical, indexed data for when the keyword was searched most as shown on Google Trends' Interest Over Time section.

- __pytrends.interest_over_time()__ 

__Interest by Region:__ returns data for where the keyword is most searched as shown on Google Trends' Interest by Region section.

- __pytrends.interest_by_region(resolution='US')__
 - Parameters
  - resolution
   - 'CITY' returns city level data
   - 'COUNTRY' returns country level data 
   - 'DMA' returns Metro level data 
   - 'REGION' returns Region level data
   
__Related Topics:__ returns data for the related keywords to a provided keyword shown on Google Trends' Related Topics section.

- __pytrends.related_topics()__

__Related Queries:__ returns data for the related keywords to a provided keyword shown on Google Trends' Related Queries section.

- __pytrends.related_queries()__

__Trending Searches:__ returns data for latest trending searches shown on Google Trends' Trending Searches section.

- __pytrends.trending_searches(pn='p1')__ # in English
- __pytrends.trending_searches(pn='p4')__ # in Japanese


__Top Charts:__ returns the data for a given topic shown in Google Trends' Top Charts section.

- __pytrends.top_charts(date, cid, geo='US', cat='')__
 - Parameters
  - date
   - YYYYMM integer or string value (required)
   - Example '201611' for November 2016 Top Chart data
 - cid
  - Topic to get data for (required)
  - Only able to choose from those listed on https://www.google.com/trends/topcharts
  - Example: the cid for the chart 'Basketball players' is 'basketball_players'


__Suggestions:__ returns a list of additional suggested keywords that can be used to refine a trend search.
- __pytrends.suggestions(keyword)__

---

### Extracting hourly trend data 
__Note:__ To simplify and automate the process, a function will be called at random intervals of time.

In [34]:
# Fetch the interest-over-time data for the payload built earlier; as the last
# expression in the cell, the result is rendered as the cell output.
pytrends.interest_over_time()

Unnamed: 0_level_0,Bitcoin,bitcoin,isPartial
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-03-17 21:11:00,33,33,False
2018-03-17 21:12:00,0,0,False
2018-03-17 21:13:00,36,36,False
2018-03-17 21:14:00,33,33,False
2018-03-17 21:15:00,33,33,False
2018-03-17 21:16:00,0,0,False
2018-03-17 21:17:00,0,0,False
2018-03-17 21:18:00,34,34,False
2018-03-17 21:19:00,0,0,False
2018-03-17 21:20:00,0,0,False


In [20]:
# Function pulls hourly trend data. Make sure you are in your desired directory prior to executing.
def gTrends_hr(keys, query, yr, mo, day, hr, geo):
    """Download the hourly Google Trends report for one keyword and collect it.

    Connects through ``pyGTrends``, requests a report for ``keys``, saves it as
    ``<path>_gTrends_Hourly_<keys>.csv`` (spaces in ``keys`` replaced by
    underscores), re-reads that file and keeps only the rows in which some
    field contains ``yr``. The kept rows become a two-column DataFrame
    (``"Hour"`` and ``keys``) that is appended to the module-level ``hr_Data``.

    Relies on names defined outside this function: ``pyGTrends``,
    ``google_username``, ``google_password``, ``date``, ``path`` and
    ``hr_Data``.

    Parameters
    ----------
    keys : str
        Search term to request; also used for the file name and column label.
    query, mo, day, hr
        Part of the signature but not read by the body.
    yr : str
        Substring a CSV row must contain (in any field) to be kept.
    geo : str
        Forwarded to ``request_report`` as ``geo``.

    Returns
    -------
    None
        The result is delivered by appending to ``hr_Data``.
    """
    # Local import: `randint` was used here without being imported.
    from random import randint

    # connect to Google
    g_connect = pyGTrends(google_username, google_password)
    # make request
    # NOTE(review): `date` is not a parameter of this function, so it must exist
    # as a module-level name -- presumably it should be derived from
    # yr/mo/day/hr; confirm the format request_report expects.
    g_connect.request_report(keys, date = date, geo = geo)
    # wait a random amount of time between requests to avoid bot detection
    time.sleep(randint(5, 10))
    # download file
    file_stem = '_' + "gTrends_Hourly" + '_' + keys.replace(' ', '_')
    g_connect.save_csv(path, file_stem)

    name = path + file_stem

    with open(name + '.csv', 'rt') as csvfile:
        # keep only the rows that mention the requested year
        data = [row for row in csv.reader(csvfile) if any(yr in s for s in row)]

    # Pass the labels to the constructor as a flat list: this also yields a
    # correctly labelled empty frame when no row matched, and avoids the
    # MultiIndex a nested list would create (which breaks merging on "Hour").
    cols = ["Hour", keys]
    hr_df = pd.DataFrame(data, columns=cols)
    # was `day_df`, a name that does not exist in this function
    hr_Data.append(hr_df)

In [None]:
## Work in progress, but halted due to the granularity constraint

# iterate process
from functools import reduce  # `reduce` is not a builtin on Python 3

query = ['Bitcoin', 'bitcoin']
yr = '2018'
geo = 'US'

# gTrends_hr appends one frame per keyword to the module-level `hr_Data`.
# Start from an empty list so re-running this cell does not accumulate
# duplicate frames from a previous run.
hr_Data = []

# Explicit loop instead of map(): map() is lazy on Python 3 (nothing would be
# downloaded), and with three iterables it would have passed three arguments to
# a one-argument lambda.
# gTrends_hr's signature also requires mo, day and hr; no values for them are
# defined in this cell and the function body does not read them, so None is
# passed -- TODO supply real values once hourly granularity is supported.
for key in query:
    gTrends_hr(key, query, yr, None, None, None, geo)

# Merge every collected frame rather than a hard-coded three: only two
# keywords are requested above, so hr_Data[2] would not exist.
rge = list(hr_Data)

hr_df_final = reduce(lambda left,right: pd.merge(left,right, on='Hour'), rge)
# drop columns whose every value is the string "0"
hr_df_final = hr_df_final.loc[:, (hr_df_final != "0").any(axis=0)]
hr_df_final.to_csv("Hourly_Trends_Data.csv", index=False)

hourly_Data = [ ]

#### Get everything on a topic, from given sources/domains, within a given range