Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
emhart committed Oct 4, 2012
0 parents commit 1294def
Show file tree
Hide file tree
Showing 13 changed files with 436 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.RData
.Rbuildignore
.Rhistory
.Rproj.user
rGtrends.Rproj
25 changes: 25 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Package: rGtrends
Type: Package
Title: A collection of functions to interface with the unofficial google trends
API (http://github.com/suryasev/unofficial-google-trends-api)
Version: 0.01
Date: 2012-09-26
Author: Edmund Hart <edmund.m.hart@gmail.com>
Maintainer: Edmund Hart <edmund.m.hart@gmail.com>
Description: An interface to download Google trends data into R
License: CC0
URL: http://github.com/emhart/rGtrends
BugReports: http://github.com/emhart/rGtrends/issues
Depends:
    rJava,
    rJython,
    stringr
Imports:
    rJava,
    rJython,
    stringr
Collate:
'add_char.R'
'format_py_date.R'
'rGtrends.R'
'strip_char.R'
20 changes: 20 additions & 0 deletions R/add_char.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#' Adds quote padding so terms can be searchable with python
#'
#' Wraps every element of a character vector in single quotes so the terms
#' can be interpolated into python source code (e.g. a pyGTrends call).
#'
#' @param my_string A vector of strings to be padded with single quotation
#'   marks so they can be passed to python. Required
#'
#' @return a character vector the same length as \code{my_string}, with each
#'   element wrapped in single quotes (e.g. \code{"foo"} becomes \code{"'foo'"})
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#'
add_char <- function(my_string){
  # paste0 is vectorised, so no element-wise loop is needed.  This also
  # handles the empty vector correctly, which the former 1:length() loop
  # did not, and removes the stringr dependency (str_pad with width + 2 on
  # "both" sides is exactly one quote on each side).
  paste0("'", my_string, "'")
}






17 changes: 17 additions & 0 deletions R/format_py_date.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#' Formats rJava strings of dates into text that can be converted to POSIX date
#'
#' @param date_vec A vector of dates returned and stripped of characters from
#'   rJava, e.g. \code{"Jan5 2012"} or \code{"Jan 5 2012"}
#' @return a vector of \code{Date}-class dates
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#'

format_py_date <- function(date_vec){
  # Insert a space between the month abbreviation and the day where the two
  # were fused by the upstream parsing ("Jan5" -> "Jan 5").
  date_vec <- gsub('([[:alpha:]])([0-9])', "\\1 \\2", date_vec)
  # Replace English month abbreviations with month numbers.  month.abb is a
  # built-in, locale-independent constant c("Jan", ..., "Dec"), so the month
  # name vector no longer needs to be hand-rolled.  fixed = TRUE because the
  # abbreviations are literal strings, not regular expressions.
  for (i in seq_along(month.abb)) {
    date_vec <- gsub(month.abb[i], i, date_vec, fixed = TRUE)
  }
  # "1 5 2012" -> "1-5-2012", then parse as month-day-year.
  date_vec <- gsub(" ", "-", date_vec, fixed = TRUE)
  as.Date(date_vec, format = "%m-%d-%Y")
}
111 changes: 111 additions & 0 deletions R/rGtrends.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#' Search google trends (http://trends.google.com)
#' using a python API.
#'
#' Drives the unofficial python Google Trends API (pyGTrends.py) through an
#' embedded Jython interpreter and parses the result into a data frame of
#' weekly search volume per keyword.
#'
#' @param keywords A vector of search strings, each term will be a separate search. Required
#' @param src_path Path to the directory holding pyGTrends.py and pyGparse.py;
#'   defaults to the "src" subdirectory of the current working directory
#' @param date A vector of dates coded a string in the form YYYY-MM, with the first element being the starting date and the second element the end date. If you want to return the entire range leave blank. If you want from a starting date to the last available point, leave the second element in vector as "all", or the first as "all" to search from the beginning until the specified end date
#' @return a data frame with weekly search output and search volume within the specified date range
#' @import rJython rJava stringr
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#' @export
#' @examples \dontrun{
#' my_search <- rGtrends("Bieber")
#' plot(my_search[,2],my_search[,1],type='l')
#' }
#'

rGtrends <- function(keywords,src_path = paste(getwd(),"/src",sep=""), date=c("all","all")){
  #require(rJava)
  #require(rJython)
  #require(stringr)
  ## Exception handling
  if(!is.character(keywords)) stop("Keywords must be strings")
  if(length(date)!=2) stop("Date must be a vector of length 2, see documentation")

  # Build python source strings that load the two helper modules from
  # src_path via imp.load_source; the single quotes around the path are
  # required by the python snippet, hence the add_char-style quoting here.
  pg_path <- paste("'",src_path,"/pyGTrends.py'",sep="")
  pyg_src <- paste("pg = imp.load_source('pyGTrends',",pg_path,")",sep="")

  pgp_path <- paste("'",src_path,"/pyGparse.py'",sep="")
  pygp_src <- paste("gp = imp.load_source('pyGparse',",pgp_path,")",sep="")

  # Start an embedded Jython interpreter and load both python modules into it.
  rJython <- rJython()
  rJython$exec("import imp")

  rJython$exec(pyg_src)
  rJython$exec(pygp_src)

  # Name and password for rGtrends. People may not want to send
  # their e-mail credentials unencrypted so I made a dummy email account
  # but I may have to change this at somepoint. Please don't send e-mails
  # from it :)

  # Logs in to Google inside the Jython interpreter; performs network I/O.
  rJython$exec("con = pg.pyGTrends('rgtrendsapi','ropensci')")

  # Build the python call "con.download_report(('kw1','kw2',...))".
  terms <- paste("(",paste(add_char(keywords),collapse=","),")",sep="")
  call <- paste("con.download_report(",terms,")",sep="")
  rJython$exec(call)
  #### Leaving this code in for when the API is fixed
  #if (language){
  #  data_call <- paste("data=gp.pyGparse(con.csv(section=",add_char("Language"),"))")
  #}

  #if (city){
  #  data_call <- paste("data=gp.pyGparse(con.csv(section=",add_char("Cities"),"))")
  #}

  #if (region){
  #  data_call <- paste("data=gp.pyGparse(con.csv(section=",add_char("Region"),"))")
  #}

  #if(sum(c(region,city,language)) == 0){
  data_call <- paste("data=gp.pyGparse(con.csv())")
  #}

  ###Extract data into a dataframe

  # Column 0 of the parsed python output is the date column; fetch it first
  # to size the result matrix and build the Date vector.  .jstrVal converts
  # the rJava string object into an R character string — presumably a single
  # comma-separated string, which strip_char then splits and cleans
  # (TODO confirm against pyGparse output).
  rJython$exec(paste(data_call,"[0]",sep=""))
  dat <- rJython$get("data")
  dat <- .jstrVal(dat)
  dat <- strip_char(dat)
  c_names <- vector()
  # One row per week (first element of dat is the column header, hence -1).
  output <- matrix(0,ncol=length(keywords),nrow=length(dat)-1)

  my_dates <- format_py_date(dat[2:length(dat)])

  # Columns 1..k of the python output hold the search-volume series, one per
  # keyword; fetch and clean each the same way as the date column.
  for(i in 1:(length(keywords))){
    rJython$exec(paste(data_call,"[",i,"]",sep=""))
    dat <- rJython$get("data")
    dat <- .jstrVal(dat)
    dat <- strip_char(dat)
    c_names[i] <- dat[1]
    output[,i] <- as.numeric(dat[2:length(dat)])

  }
  output <- data.frame(output)
  output <- cbind(output,my_dates)
  colnames(output) <- c(gsub(" ","",keywords),"Date")

  # Date-range filtering: "all"/"all" returns everything; otherwise grep the
  # requested YYYY-MM strings against the formatted dates to find the slice
  # boundaries.  NOTE(review): if a requested month is absent from my_dates,
  # grep returns integer(0) and min/max will warn and produce Inf/-Inf.
  if(date[1]=="all" && date[2]=="all"){
    return(output)
  }

  if(date[1]=="all" && date[2] != "all"){
    return(output[1:max(grep(date[2],my_dates)),])
  }

  if(date[1] !="all" && date[2] == "all"){
    return(output[min(grep(date[1],my_dates)):dim(output)[1],])
  }

  if(date[1]!="all" && date[2] != "all"){
    return(output[min(grep(date[1],my_dates)):max(grep(date[2],my_dates)),])
  }

}

Binary file added R/src/pyGTrends$py.class
Binary file not shown.
106 changes: 106 additions & 0 deletions R/src/pyGTrends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import httplib
import urllib
import urllib2
import re
import csv

from cookielib import CookieJar

class pyGTrends(object):
    """
    Google Trends API (Python 2).

    Recommended usage:
        from csv import DictReader
        r = pyGTrends(username, password)
        r.download_report(('pants', 'skirt'))
        d = DictReader(r.csv().split('\n'))
    """
    def __init__(self, username, password):
        """
        provide login and password to be used to connect to Google Analytics
        all immutable system variables are also defined here
        website_id is the ID of the specific site on google analytics

        Note: performs network I/O immediately (logs in via _connect).
        """
        self.login_params = {
            "continue": 'http://www.google.com/trends',
            "PersistentCookie": "yes",
            "Email": username,
            "Passwd": password,
        }
        # Spoof a browser user agent; Google's login form rejects obvious bots.
        self.headers = [("Referrer", "https://www.google.com/accounts/ServiceLoginBoxAuth"),
                        ("Content-type", "application/x-www-form-urlencoded"),
                        ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21'),
                        ("Accept", "text/plain")]
        self.url_ServiceLoginBoxAuth = 'https://accounts.google.com/ServiceLoginBoxAuth'
        self.url_Export = 'http://www.google.com/trends/viz'
        self.url_CookieCheck = 'https://www.google.com/accounts/CheckCookie?chtml=LoginDoneHtml'
        self.header_dictionary = {}
        self._connect()

    def _connect(self):
        """
        connect to Google Trends: fetch the login page, scrape the GALX
        anti-forgery token out of it, then POST the credentials and verify
        the session cookie was set.
        """
        self.cj = CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        self.opener.addheaders = self.headers

        galx = re.compile('<input type="hidden" name="GALX" value="(?P<galx>[a-zA-Z0-9_-]+)">')

        resp = self.opener.open(self.url_ServiceLoginBoxAuth).read()
        # Collapse runs of whitespace so the GALX regex matches regardless of
        # how the login page is formatted.
        resp = re.sub(r'\s\s+', ' ', resp)

        m = galx.search(resp)
        if not m:
            raise Exception("Cannot parse GALX out of login page")
        self.login_params['GALX'] = m.group('galx')
        params = urllib.urlencode(self.login_params)
        self.opener.open(self.url_ServiceLoginBoxAuth, params)
        self.opener.open(self.url_CookieCheck)

    def download_report(self, keywords, date='all', geo='all', geor='all', graph='all_csv', sort=0, scale=0, sa='N'):
        """
        download a specific report
        date, geo, geor, graph, sort, scale and sa
        are all Google Trends specific ways to slice the data

        The raw CSV response is stored in self.raw_data for later parsing
        by csv().
        """
        # Accept a single keyword string as well as a list/tuple of them.
        if type(keywords) not in (type([]), type(('tuple',))):
            keywords = [keywords]

        params = urllib.urlencode({
            'q': ",".join(keywords),
            'date': date,
            'graph': graph,
            'geo': geo,
            'geor': geor,
            'sort': str(sort),
            'scale': str(scale),
            'sa': sa
        })
        self.raw_data = self.opener.open('http://www.google.com/trends/viz?' + params).read()
        # BUG FIX: the original tested `self.raw_data in [<message>]`, i.e.
        # exact equality against the sign-in sentence, so a full HTML error
        # page that merely *contains* the message was never detected.  Use
        # substring containment instead.
        if 'You must be signed in to export data from Google Trends' in self.raw_data:
            raise Exception(self.raw_data)

    def csv(self, section="main", as_list=False):
        """
        Returns a CSV of a specific segment of the data.
        Available segments include Main, Language, City and Region.

        The raw report consists of blank-line-separated segments; each
        segment's first CSV field identifies it.  The "main" segment is the
        one whose first field is a time unit (Week/Year/Day/Month).
        """
        if section == "main":
            section = ("Week", "Year", "Day", "Month")
        else:
            section = (section,)

        segments = self.raw_data.split('\n\n\n')
        for s in segments:
            if s.partition(',')[0] in section:
                if as_list:
                    return [line for line in csv.reader(s.split('\n'))]
                else:
                    return s

        raise Exception("Could not find requested section")

Binary file added R/src/pyGparse$py.class
Binary file not shown.
12 changes: 12 additions & 0 deletions R/src/pyGparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def pyGparse(pyGtrendobj):
    # Parse the raw CSV text returned by pyGTrends.csv() into a list of
    # columns: output[0] collects the first field of every line (the date /
    # header column) and output[1..p_num] collect the value columns.
    #
    # NOTE(review): Python 2 only -- relies on xrange, integer division
    # from "/", and eager map() evaluated purely for its append side effect
    # (under Python 3 that map would never execute).
    my_lines = pyGtrendobj.splitlines()
    # Each row apparently looks like "date,val1,,val2,,...": only every
    # second field (odd index) carries a value, so the number of value
    # columns is (fields - 1) / 2 -- TODO confirm against the Trends CSV.
    p_num = ((len(my_lines[0].split(",")) - 1)/2)
    # p_num + 1 independent empty lists ("[0 for x in xrange(0)]" is just []).
    output = [[0 for x in xrange(0)] for x in xrange(p_num + 1)]

    for x in my_lines:
        t_string = x.split(",")
        # Pick the odd-indexed fields (the values), then prepend field 0
        # (the date) so my_dat lines up with the output columns.
        my_dat = map(lambda i: t_string[i],filter(lambda i: i%2 == 1,range(len(t_string))))
        my_dat.insert(0,t_string[0])
        # Side-effecting map: appends my_dat[i] onto column i.
        map(lambda i: output[i].append(my_dat[i]),range(len(my_dat)))

    return output
19 changes: 19 additions & 0 deletions R/strip_char.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' Strips rJava S4 class characters from text vector returned from python
#'
#' Splits a python-list-style string (e.g. \code{"['a', 'b', 'c']"}) on
#' commas and removes the brackets, single quotes and surrounding
#' whitespace from each element.
#'
#' @param Js4text a single python-list-style string obtained from rJava
#'
#' @return a character vector stripped of quotes and brackets
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#'
strip_char <- function(Js4text){
  # Split the comma-separated python list representation into elements.
  parts <- strsplit(Js4text, ",", fixed = TRUE)[[1]]
  # Remove ALL brackets and single quotes in one pass.  The original used
  # six chained sub() calls, each of which removed only the FIRST occurrence
  # of its pattern and depended on exact call order; gsub is robust to any
  # number of quotes, brackets and spaces.
  parts <- gsub("\\[|\\]|'", "", parts)
  # Trim leading/trailing whitespace left over from ", " separators.
  gsub("^[[:space:]]+|[[:space:]]+$", "", parts)
}
3 changes: 3 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
README
=========
rGtrends is an interface for a python [Google trends](http://www.google.com/trends) API created by [Sal Uryasev](http://github.com/suryasev/unofficial-google-trends-api). It works by downloading a temporary CSV within a python environment, parsing it, sending it to R, and parsing it a bit more into a data frame. As of 9/27/2012 Google changed its interface for Trends and merged it with Insights. At the moment only raw search volume data can be downloaded. Once the python code is updated I will restore the functionality of parsing search results by region and language.
Loading

0 comments on commit 1294def

Please sign in to comment.