# File-Name: GScholarScraper_2.R
# Date: 2011-11-12
# Author: Kay Cichini
# Email: kay.cichini@gmail.com
# Purpose: Extract and examine Google Scholar search results (publication titles)
# Packages used: RCurl, Rcpp, stringr, tm, wordcloud
# Licence: CC BY-SA-NC
#
# Arguments:
#
# (1) search.str:
# A search string as used in Google Scholar search dialog
# (!) Important: use "+" between the elements of the search string.
#
# (2) write.table:
# Logical, defining whether a table holding category (book, article, PDF),
# full titles, and links to the publications should be saved to the user's home directory.
#
# Output: a data frame with word frequencies (publication titles), optionally a
# CSV-file of the results, a word cloud
#
# Error reported: Error in substring(string, start, end) :
# invalid multibyte string at ' * Wi<6c>dlife
# may be resolved by: Sys.setlocale(locale="C")
#
# Recent edits: 6-12-2011, resolved a bug with the number of search results.

GScholarScraper <- function(search.str, write.table = FALSE){

require(Rcpp)
require(RCurl)
require(stringr)
require(tm)
require(wordcloud)

# Some explanations regarding the search string parameterization:
# "&lr=lang_en" restricts the search to publications in English.
# "&num=100" returns 100 results per page; strangely, one gets different
# numbers of results when changing this parameter, so I will use num = 100,
# which gives the largest number of results.
# "&as_vis=1" excludes citations; in this version of the function I
# exclude them because they may bias the final word frequencies,
# since citations often occur multiple times.
# "&hl=en" defines the language used on the site.
# "&as_sdt=1" returns only articles, excluding patents.

# Get the number of search results by making a first request to Google
# Scholar, retrieving results 1 to 100 from the first result page, which
# contains the total number of results somewhere:
url <- paste("http://scholar.google.com/scholar?start=0&q=",
      search.str, "&hl=en&lr=lang_en&num=100&as_sdt=1&as_vis=1",
      sep = "")

# ...I'm using URLs like: http://scholar.google.com/scholar?start=0&q=allintitle:+amphibians+richness+OR+diversity&hl=en&lr=lang_en&num=100&as_sdt=1&as_vis=1

webpage <- getURL(url)
html_str <- paste(webpage, collapse="\n")

# Find the HTML placeholders (2 alternatives!) for the number of results,
# and pull the number.
# (!) Strangely, Google Scholar gives different numbers of results
# depending on the start value, e.g., a change from 900 to 980 results
# when changing start = 0 to start = 800.

match_no.res <- str_match(html_str, "Results <b>1</b> - <b>(.*?)</b> of <b>(.*?)</b>")
no.res <- match_no.res[1, max(dim(match_no.res))]

# If the first pattern failed or matched something other than a number,
# try the alternative phrasing ("of about"):
if(is.na(no.res) || nchar(no.res) == 0 || nchar(gsub("[[:digit:][:punct:]]", "", no.res)) > 0) {
match_no.res <- str_match(html_str, "Results <b>1</b> - <b>(.*?)</b> of about <b>(.*?)</b>")
no.res <- match_no.res[1, max(dim(match_no.res))]
}
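
# As an illustration of the extraction above (the snippet below is a
# hypothetical example, not actual Google Scholar markup):
# str_match("Results <b>1</b> - <b>100</b> of <b>1,230</b>",
#           "Results <b>1</b> - <b>(.*?)</b> of <b>(.*?)</b>")
# returns a one-row matrix: full match, "100", "1,230" - and
# match_no.res[1, max(dim(match_no.res))] picks the last column, "1,230".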

# Remove punctuation (Google uses commas as thousands separators):
no.res <- as.integer(gsub("[[:punct:]]", "", no.res))
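
# Quick sanity check of the cleanup above, e.g.:
# as.integer(gsub("[[:punct:]]", "", "1,230"))  # -> 1230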

# If there are no results, stop and throw an error message:
if(is.na(no.res) || no.res == 0) {
stop("\n\n...There is no result for the submitted search string!")}

# Define the number of result pages to be requested subsequently.
# pages.max = maximum number of pages (chunks of 100 results each)
# to be requested.
# As noted above, no.res varies depending on the start value.
# However, since we use ceiling and the change is very unlikely to be
# greater than 100, we add one extra page to be safe:
pages.max <- ceiling(no.res/100) + 1

# "start" as used in url:
start <- 100*(1:pages.max) - 100
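
# Worked example of the paging arithmetic (illustrative numbers):
# with no.res = 980: pages.max = ceiling(980/100) + 1 = 11, and
# start = 100 * (1:11) - 100 = 0, 100, 200, ..., 1000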

# Collect webpages as list:
urls <- paste("http://scholar.google.com/scholar?start=", start,
          "&q=", search.str,
          "&hl=en&lr=lang_en&num=100&as_sdt=1&as_vis=1",
          sep = "")

webpages <- lapply(urls, getURL)

# Paste all content:
html_str <- paste(unlist(webpages), collapse="\n")

# Pull titles between h3 tags:
match_h3 <- str_match_all(html_str, "<h3>(.*?)</h3>")
match_h3 <- match_h3[[1]][, 2]

# Strip bold tags (<b>, </b>):
match_h3 <- gsub("<b>", "", match_h3)
match_h3 <- gsub("</b>", "", match_h3)

# Get ids for the different entry types, which have different HTML schemes
# and hence have to be treated differently when cleaning up:
id_books <- grep("BOOK", match_h3)
id_pdfs <- grep("PDF", match_h3)

# The rest is articles (setdiff also handles empty id_books/id_pdfs,
# where negative indexing would drop everything):
id_articles <- setdiff(seq_along(match_h3), c(id_books, id_pdfs))

# Check correctness of ids:
# should be as many as number of titles
# sort(c(id_books, id_pdfs, id_articles)) == 1:length(match_h3)
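
# Toy example of the partitioning (hypothetical ids): with 5 titles,
# id_books = 2 and id_pdfs = 5, the articles are
# setdiff(1:5, c(2, 5))  # -> 1 3 4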

# Get html code for different types of publications
books_raw <- match_h3[id_books]
articles_raw <- match_h3[id_articles]
pdfs_raw <- match_h3[id_pdfs]

# Clean up & pull titles:
if(length(id_books) > 0){
book <- TRUE
b.title_str <- strsplit(books_raw, ">")
b.titles <- rep(NA, length(b.title_str))
for(i in 1:length(b.title_str)){
    b.titles[i] <- substring(b.title_str[[i]][4],
                        1, nchar(b.title_str[[i]][4])-3)}
} else {
    book <- FALSE
}

if(length(id_articles) > 0){
art <- TRUE
a.title_str <- strsplit(articles_raw, ">")
a.titles <- rep(NA, length(a.title_str))
for(i in 1:length(a.title_str)){
    a.titles[i] <- substring(a.title_str[[i]][2],
                        1, nchar(a.title_str[[i]][2])-3)}
} else {
    art <- FALSE
}

if(length(id_pdfs) > 0){
pdf <- TRUE
pdf.title_str <- strsplit(pdfs_raw, ">")
pdf.titles <- rep(NA, length(pdf.title_str))
for(i in 1:length(pdf.title_str)){
    pdf.titles[i] <- substring(pdf.title_str[[i]][4],
                          1, nchar(pdf.title_str[[i]][4])-3)}
} else {
    pdf <- FALSE
}
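
# Sketch of the split-and-trim logic above, on hypothetical raw strings
# (the exact markup Google Scholar delivers may differ):
# For an article entry like "<a href=\"u\">Some title</a>", strsplit(., ">")
# gives c("<a href=\"u\"", "Some title</a"); element [2] minus its last
# 3 characters ("</a") is the title.
# For books/PDFs a "[BOOK]"/"[PDF]" span precedes the link, which shifts
# the title to element [4] of the split.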

# Get links:
match_aref <- str_match_all(match_h3, "<a href=\"(.*?)\"")

links <- rep(NA, length(match_aref))
for(i in 1:length(match_aref)){
    if (length(match_aref[[i]][, 2]) == 0) {
        links[i] <- ""
    } else {
        # take the first link if several match within one title
        links[i] <- match_aref[[i]][1, 2]
    }
}
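
# Example of the link extraction (again with made-up markup):
# str_match_all("<a href=\"http://example.org/x\">T</a>",
#               "<a href=\"(.*?)\"")[[1]][, 2]  # -> "http://example.org/x"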

# Dataframe with titles and links:
result <- data.frame(
    ARTICLES = NA, BOOKS = NA,
    PDFs = NA, LINKS = links)

if(art == TRUE){
result[id_articles, "ARTICLES"] <- a.titles
}
if(book == TRUE){
result[id_books, "BOOKS"] <- b.titles
}
if(pdf == TRUE){
result[id_pdfs, "PDFs"] <- pdf.titles
}

# Optionally write a table with the results to the user's home directory:
if(write.table){
write.table(result, path.expand("~/GScholarScraper-Result.CSV"),
     row.names = FALSE, sep = ";")
}
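
# The file can later be read back in with, e.g.:
# read.csv2(path.expand("~/GScholarScraper-Result.CSV"))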

# Make a dataframe with word frequencies and a wordcloud:
# if there are too few results stop and throw an error message:
if(no.res < 5){stop("\n\nThere are fewer than 5 results; a word cloud may be useless!")}

corpus <- Corpus(DataframeSource(result[, 1:3]))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, function(x)removeWords(x, stopwords()))
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
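
# The term-document matrix step above boils down to summing word counts
# across titles, e.g. for a toy matrix
# m <- rbind(richness = c(2, 4), species = c(3, 1))
# sort(rowSums(m), decreasing = TRUE)  # -> richness 6, species 4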

# Remove words containing numbers (grepl also handles the case of no matches):
d <- d[!grepl("[0-9]", d$word), ]

# Remove unwanted rubbish (to be extended?):
rubbish <- c("htmls", "hellip", "amp", "quot")
d <- d[!(d$word %in% rubbish), ]

# Show only frequencies larger than 5:
print(d[d$freq > 5, ])
cat(paste("\n\nNumber of titles submitted =", length(match_h3)))

# Compare retrieved titles and no. of results pulled from first webpage:
cat(paste("\n\nNumber of results as retrieved from first webpage =", no.res))

cat("\n\nBe aware that sometimes titles in Google Scholar outputs
are truncated - that is why, i.e., some mandatory intitle-search
strings may not be contained in all titles\n")

# Print wordcloud:
wordcloud(d$word, d$freq, random.order = F)

return(d)
}


# Example:
# The below search string will search for titles with the words "amphibians"
# and "richness" or "diversity":

# search.str <- "allintitle:+amphibians+richness+OR+diversity"
# d <- GScholarScraper(search.str, write.table = FALSE)
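
# The returned data frame can be inspected or filtered further, e.g.:
# head(d)
# d[d$freq > 10, ]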