Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
emhart committed Oct 4, 2012
0 parents commit 1294def
Show file tree
Hide file tree
Showing 13 changed files with 436 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.RData
.Rbuildignore
.Rhistory
.Rproj.user
rGtrends.Rproj
25 changes: 25 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
Package: rGtrends
Type: Package
Title: A collection of functions to interface with the unofficial google trends
API (http://github.com/suryasev/unofficial-google-trends-api)
Version: 0.01
Date: 2012-09-26
Author: Edmund Hart <edmund.m.hart@gmail.com>
Maintainer: Edmund Hart <edmund.m.hart@gmail.com>
Description: An interface to download Google trends data into R
License: CC0
URL: http://github.com/emhart/rGtrends
BugReports: http://github.com/emhart/rGtrends/issues
Depends:
    rJava,
    rJython,
    stringr
Imports:
    rJava,
    rJython,
    stringr
Collate:
'add_char.R'
'format_py_date.R'
'rGtrends.R'
'strip_char.R'
20 changes: 20 additions & 0 deletions R/add_char.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#' Adds quote padding so terms can be searchable with python
#'
#' Wraps every element of a character vector in single quotes so the terms
#' can be interpolated into python source code (e.g. a pyGTrends call).
#'
#' @param my_string A vector of strings to be padded with single quotation
#'   marks so they can be passed to python. Required
#'
#' @return a character vector the same length as \code{my_string}, with each
#'   element wrapped in single quotes (e.g. \code{"foo"} becomes \code{"'foo'"})
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#'
add_char <- function(my_string){
  # paste0 is vectorised, so no element-wise loop is needed.  This also
  # handles the empty vector correctly, which the former 1:length() loop
  # did not, and removes the stringr dependency (str_pad with width + 2 on
  # "both" sides is exactly one quote on each side).
  paste0("'", my_string, "'")
}






17 changes: 17 additions & 0 deletions R/format_py_date.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#' Formats rJava strings of dates into text that can be converted to POSIX date
#'
#' @param date_vec A vector of dates returned and stripped of characters from
#'   rJava, e.g. \code{"Jan5 2012"} or \code{"Jan 5 2012"}
#' @return a vector of \code{Date}-class dates
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#'

format_py_date <- function(date_vec){
  # Insert a space between the month abbreviation and the day where the two
  # were fused by the upstream parsing ("Jan5" -> "Jan 5").
  date_vec <- gsub('([[:alpha:]])([0-9])', "\\1 \\2", date_vec)
  # Replace English month abbreviations with month numbers.  month.abb is a
  # built-in, locale-independent constant c("Jan", ..., "Dec"), so the month
  # name vector no longer needs to be hand-rolled.  fixed = TRUE because the
  # abbreviations are literal strings, not regular expressions.
  for (i in seq_along(month.abb)) {
    date_vec <- gsub(month.abb[i], i, date_vec, fixed = TRUE)
  }
  # "1 5 2012" -> "1-5-2012", then parse as month-day-year.
  date_vec <- gsub(" ", "-", date_vec, fixed = TRUE)
  as.Date(date_vec, format = "%m-%d-%Y")
}
111 changes: 111 additions & 0 deletions R/rGtrends.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#' Search google trends (http://trends.google.com)
#' using a python API.
#'
#' Drives the unofficial python Google Trends API (pyGTrends.py) through an
#' embedded Jython interpreter and parses the result into a data frame of
#' weekly search volume per keyword.
#'
#' @param keywords A vector of search strings, each term will be a separate search. Required
#' @param src_path Path to the directory holding pyGTrends.py and pyGparse.py;
#'   defaults to the "src" subdirectory of the current working directory
#' @param date A vector of dates coded a string in the form YYYY-MM, with the first element being the starting date and the second element the end date. If you want to return the entire range leave blank. If you want from a starting date to the last available point, leave the second element in vector as "all", or the first as "all" to search from the beginning until the specified end date
#' @return a data frame with weekly search output and search volume within the specified date range
#' @import rJython rJava stringr
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#' @export
#' @examples \dontrun{
#' my_search <- rGtrends("Bieber")
#' plot(my_search[,2],my_search[,1],type='l')
#' }
#'

rGtrends <- function(keywords,src_path = paste(getwd(),"/src",sep=""), date=c("all","all")){
  #require(rJava)
  #require(rJython)
  #require(stringr)
  ## Exception handling
  if(!is.character(keywords)) stop("Keywords must be strings")
  if(length(date)!=2) stop("Date must be a vector of length 2, see documentation")

  # Build python source strings that load the two helper modules from
  # src_path via imp.load_source; the single quotes around the path are
  # required by the python snippet, hence the add_char-style quoting here.
  pg_path <- paste("'",src_path,"/pyGTrends.py'",sep="")
  pyg_src <- paste("pg = imp.load_source('pyGTrends',",pg_path,")",sep="")

  pgp_path <- paste("'",src_path,"/pyGparse.py'",sep="")
  pygp_src <- paste("gp = imp.load_source('pyGparse',",pgp_path,")",sep="")

  # Start an embedded Jython interpreter and load both python modules into it.
  rJython <- rJython()
  rJython$exec("import imp")

  rJython$exec(pyg_src)
  rJython$exec(pygp_src)

  # Name and password for rGtrends. People may not want to send
  # their e-mail credentials unencrypted so I made a dummy email account
  # but I may have to change this at somepoint. Please don't send e-mails
  # from it :)

  # Logs in to Google inside the Jython interpreter; performs network I/O.
  rJython$exec("con = pg.pyGTrends('rgtrendsapi','ropensci')")

  # Build the python call "con.download_report(('kw1','kw2',...))".
  terms <- paste("(",paste(add_char(keywords),collapse=","),")",sep="")
  call <- paste("con.download_report(",terms,")",sep="")
  rJython$exec(call)
  #### Leaving this code in for when the API is fixed
  #if (language){
  #  data_call <- paste("data=gp.pyGparse(con.csv(section=",add_char("Language"),"))")
  #}

  #if (city){
  #  data_call <- paste("data=gp.pyGparse(con.csv(section=",add_char("Cities"),"))")
  #}

  #if (region){
  #  data_call <- paste("data=gp.pyGparse(con.csv(section=",add_char("Region"),"))")
  #}

  #if(sum(c(region,city,language)) == 0){
  data_call <- paste("data=gp.pyGparse(con.csv())")
  #}

  ###Extract data into a dataframe

  # Column 0 of the parsed python output is the date column; fetch it first
  # to size the result matrix and build the Date vector.  .jstrVal converts
  # the rJava string object into an R character string — presumably a single
  # comma-separated string, which strip_char then splits and cleans
  # (TODO confirm against pyGparse output).
  rJython$exec(paste(data_call,"[0]",sep=""))
  dat <- rJython$get("data")
  dat <- .jstrVal(dat)
  dat <- strip_char(dat)
  c_names <- vector()
  # One row per week (first element of dat is the column header, hence -1).
  output <- matrix(0,ncol=length(keywords),nrow=length(dat)-1)

  my_dates <- format_py_date(dat[2:length(dat)])

  # Columns 1..k of the python output hold the search-volume series, one per
  # keyword; fetch and clean each the same way as the date column.
  for(i in 1:(length(keywords))){
    rJython$exec(paste(data_call,"[",i,"]",sep=""))
    dat <- rJython$get("data")
    dat <- .jstrVal(dat)
    dat <- strip_char(dat)
    c_names[i] <- dat[1]
    output[,i] <- as.numeric(dat[2:length(dat)])

  }
  output <- data.frame(output)
  output <- cbind(output,my_dates)
  colnames(output) <- c(gsub(" ","",keywords),"Date")

  # Date-range filtering: "all"/"all" returns everything; otherwise grep the
  # requested YYYY-MM strings against the formatted dates to find the slice
  # boundaries.  NOTE(review): if a requested month is absent from my_dates,
  # grep returns integer(0) and min/max will warn and produce Inf/-Inf.
  if(date[1]=="all" && date[2]=="all"){
    return(output)
  }

  if(date[1]=="all" && date[2] != "all"){
    return(output[1:max(grep(date[2],my_dates)),])
  }

  if(date[1] !="all" && date[2] == "all"){
    return(output[min(grep(date[1],my_dates)):dim(output)[1],])
  }

  if(date[1]!="all" && date[2] != "all"){
    return(output[min(grep(date[1],my_dates)):max(grep(date[2],my_dates)),])
  }

}

Binary file added R/src/pyGTrends$py.class
Binary file not shown.
106 changes: 106 additions & 0 deletions R/src/pyGTrends.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import httplib
import urllib
import urllib2
import re
import csv

from cookielib import CookieJar

class pyGTrends(object):
    """
    Google Trends API (Python 2).

    Recommended usage:
        from csv import DictReader
        r = pyGTrends(username, password)
        r.download_report(('pants', 'skirt'))
        d = DictReader(r.csv().split('\n'))
    """
    def __init__(self, username, password):
        """
        provide login and password to be used to connect to Google Analytics
        all immutable system variables are also defined here
        website_id is the ID of the specific site on google analytics

        Note: performs network I/O immediately (logs in via _connect).
        """
        self.login_params = {
            "continue": 'http://www.google.com/trends',
            "PersistentCookie": "yes",
            "Email": username,
            "Passwd": password,
        }
        # Spoof a browser user agent; Google's login form rejects obvious bots.
        self.headers = [("Referrer", "https://www.google.com/accounts/ServiceLoginBoxAuth"),
                        ("Content-type", "application/x-www-form-urlencoded"),
                        ('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21'),
                        ("Accept", "text/plain")]
        self.url_ServiceLoginBoxAuth = 'https://accounts.google.com/ServiceLoginBoxAuth'
        self.url_Export = 'http://www.google.com/trends/viz'
        self.url_CookieCheck = 'https://www.google.com/accounts/CheckCookie?chtml=LoginDoneHtml'
        self.header_dictionary = {}
        self._connect()

    def _connect(self):
        """
        connect to Google Trends: fetch the login page, scrape the GALX
        anti-forgery token out of it, then POST the credentials and verify
        the session cookie was set.
        """
        self.cj = CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
        self.opener.addheaders = self.headers

        galx = re.compile('<input type="hidden" name="GALX" value="(?P<galx>[a-zA-Z0-9_-]+)">')

        resp = self.opener.open(self.url_ServiceLoginBoxAuth).read()
        # Collapse runs of whitespace so the GALX regex matches regardless of
        # how the login page is formatted.
        resp = re.sub(r'\s\s+', ' ', resp)

        m = galx.search(resp)
        if not m:
            raise Exception("Cannot parse GALX out of login page")
        self.login_params['GALX'] = m.group('galx')
        params = urllib.urlencode(self.login_params)
        self.opener.open(self.url_ServiceLoginBoxAuth, params)
        self.opener.open(self.url_CookieCheck)

    def download_report(self, keywords, date='all', geo='all', geor='all', graph='all_csv', sort=0, scale=0, sa='N'):
        """
        download a specific report
        date, geo, geor, graph, sort, scale and sa
        are all Google Trends specific ways to slice the data

        The raw CSV response is stored in self.raw_data for later parsing
        by csv().
        """
        # Accept a single keyword string as well as a list/tuple of them.
        if type(keywords) not in (type([]), type(('tuple',))):
            keywords = [keywords]

        params = urllib.urlencode({
            'q': ",".join(keywords),
            'date': date,
            'graph': graph,
            'geo': geo,
            'geor': geor,
            'sort': str(sort),
            'scale': str(scale),
            'sa': sa
        })
        self.raw_data = self.opener.open('http://www.google.com/trends/viz?' + params).read()
        # BUG FIX: the original tested `self.raw_data in [<message>]`, i.e.
        # exact equality against the sign-in sentence, so a full HTML error
        # page that merely *contains* the message was never detected.  Use
        # substring containment instead.
        if 'You must be signed in to export data from Google Trends' in self.raw_data:
            raise Exception(self.raw_data)

    def csv(self, section="main", as_list=False):
        """
        Returns a CSV of a specific segment of the data.
        Available segments include Main, Language, City and Region.

        The raw report consists of blank-line-separated segments; each
        segment's first CSV field identifies it.  The "main" segment is the
        one whose first field is a time unit (Week/Year/Day/Month).
        """
        if section == "main":
            section = ("Week", "Year", "Day", "Month")
        else:
            section = (section,)

        segments = self.raw_data.split('\n\n\n')
        for s in segments:
            if s.partition(',')[0] in section:
                if as_list:
                    return [line for line in csv.reader(s.split('\n'))]
                else:
                    return s

        raise Exception("Could not find requested section")

Binary file added R/src/pyGparse$py.class
Binary file not shown.
12 changes: 12 additions & 0 deletions R/src/pyGparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def pyGparse(pyGtrendobj):
    # Parse the raw CSV text returned by pyGTrends.csv() into a list of
    # columns: output[0] collects the first field of every line (the date /
    # header column) and output[1..p_num] collect the value columns.
    #
    # NOTE(review): Python 2 only -- relies on xrange, integer division
    # from "/", and eager map() evaluated purely for its append side effect
    # (under Python 3 that map would never execute).
    my_lines = pyGtrendobj.splitlines()
    # Each row apparently looks like "date,val1,,val2,,...": only every
    # second field (odd index) carries a value, so the number of value
    # columns is (fields - 1) / 2 -- TODO confirm against the Trends CSV.
    p_num = ((len(my_lines[0].split(",")) - 1)/2)
    # p_num + 1 independent empty lists ("[0 for x in xrange(0)]" is just []).
    output = [[0 for x in xrange(0)] for x in xrange(p_num + 1)]

    for x in my_lines:
        t_string = x.split(",")
        # Pick the odd-indexed fields (the values), then prepend field 0
        # (the date) so my_dat lines up with the output columns.
        my_dat = map(lambda i: t_string[i],filter(lambda i: i%2 == 1,range(len(t_string))))
        my_dat.insert(0,t_string[0])
        # Side-effecting map: appends my_dat[i] onto column i.
        map(lambda i: output[i].append(my_dat[i]),range(len(my_dat)))

    return output
19 changes: 19 additions & 0 deletions R/strip_char.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' Strips rJava S4 class characters from text vector returned from python
#'
#' Splits a python-list-style string (e.g. \code{"['a', 'b', 'c']"}) on
#' commas and removes the brackets, single quotes and surrounding
#' whitespace from each element.
#'
#' @param Js4text a single python-list-style string obtained from rJava
#'
#' @return a character vector stripped of quotes and brackets
#' @author Edmund Hart \email{edmund.m.hart@@gmail.com}
#'
strip_char <- function(Js4text){
  # Split the comma-separated python list representation into elements.
  parts <- strsplit(Js4text, ",", fixed = TRUE)[[1]]
  # Remove ALL brackets and single quotes in one pass.  The original used
  # six chained sub() calls, each of which removed only the FIRST occurrence
  # of its pattern and depended on exact call order; gsub is robust to any
  # number of quotes, brackets and spaces.
  parts <- gsub("\\[|\\]|'", "", parts)
  # Trim leading/trailing whitespace left over from ", " separators.
  gsub("^[[:space:]]+|[[:space:]]+$", "", parts)
}
3 changes: 3 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
README
=========
rGtrends is an interface for a python [Google trends](http://www.google.com/trends) API created by [Sal Uryasev](http://github.com/suryasev/unofficial-google-trends-api). It works by downloading a temporary CSV within a python environment, parsing it, sending it to R, and parsing it a bit more into a data frame. As of 9/27/2012 Google changed its interface for Trends and merged it with Insights. At the moment only raw search volume data can be downloaded. Once the python code is updated I will restore the functionality of parsing search results by region and language.
Loading

0 comments on commit 1294def

Please sign in to comment.