**Name:** Top words in corpus of PDFs.  
**Description:** The R code identifies the ten top words in a corpus of PDF files that are associated with the Capacity Building tool type. Capacity Building is one of the six tool types used in the study.  
**Author:** Garry Sotnik.  

### Install and load relevant libraries  

Install relevant packages.

In [None]:
# Packages this notebook loads. `rstudioapi` (working-directory setup) and
# `extrafont` (plot fonts) are loaded further down, so they are listed here
# as well.
required_packages <- c(
  "tm", "ggplot2", "dplyr", "lattice", "pdftools", "reshape2", "scales",
  "SnowballC", "stringr", "tidytext", "tidyverse", "topicmodels", "tidyr",
  "udpipe", "rstudioapi", "extrafont"
)

# Install only what is not already present, so re-running this cell does not
# download and rebuild every package again.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

Load the needed libraries.

In [None]:
library(tm)
library(ggplot2)
library(dplyr)
library(ggplot2)
library(lattice)
library(pdftools)
library(reshape2)
library(scales)
library(SnowballC)
library(stringr)
library(tidytext)
library(tidyverse)
library(tm)
library(topicmodels)
library(tidyr)
library(udpipe)

### Build a corpus from the PDF files  

Set the working directory to the folder with the source code and the PDF files.

In [None]:
# Point the working directory at the folder that holds this document, so the
# relative paths used later (PDF listing, TIFF output) resolve there.
# getActiveDocumentContext() is only usable inside RStudio, and the document
# path is empty when the document has not been saved; in either case the
# current working directory is kept instead of stopping with an error.
rstudio_ready <- requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()
if (rstudio_ready) {
  library(rstudioapi)
  doc_path <- getActiveDocumentContext()$path
  if (nzchar(doc_path)) {
    setwd(dirname(doc_path))
  }
}
# Show the directory that will be searched for PDF files.
getwd()

List the PDF files in the folder.

In [None]:
# Names of the PDF files in the working directory. The dot is escaped and the
# pattern is anchored at the end, so only a real ".pdf" extension matches
# (names that merely end in the letters "pdf" are skipped); case is ignored so
# that ".PDF" files are included as well.
pdfs <- list.files(pattern = "\\.pdf$", ignore.case = TRUE)

Build a corpus from the files in the folder.

In [None]:
# Wrap the listed files as a tm source, then read each one with tm's PDF
# reader to build the corpus.
pdf_source <- URISource(pdfs)
corp <- Corpus(pdf_source, readerControl = list(reader = readPDF))

### Clean the corpus.
Start by removing punctuation.

In [None]:
# Drop punctuation from every document; `ucp = TRUE` is forwarded to
# removePunctuation() so punctuation is identified via Unicode properties.
corp <- tm_map(x = corp, FUN = removePunctuation, ucp = TRUE)

Remove numbers.

In [None]:
# Drop digits from every document; `ucp = TRUE` is forwarded to
# removeNumbers().
corp <- tm_map(x = corp, FUN = removeNumbers, ucp = TRUE)

Remove white space.

In [None]:
# Collapse runs of whitespace left behind by the removals above.
corp <- tm_map(x = corp, FUN = stripWhitespace)


Remove English stop words.

In [None]:
# Remove the words of tm's English stop-word list from every document.
# NOTE(review): the text has not been lower-cased at this point (that happens
# in a later cell), so capitalised stop words may survive this step - confirm
# whether that is intended.
english_stopwords <- stopwords("english")
corp <- tm_map(corp, removeWords, english_stopwords)

Remove specific common words.

In [None]:
# Delete a fixed list of frequent fragments from every document, one pass over
# the corpus per fragment, in the order listed. gsub() matches the fragment
# anywhere in the text, so it is also removed from inside longer words.
# NOTE(review): "hohold" looks like what is left of "household" once "use"
# has been deleted - confirm whether whole-word matching was intended.
common_fragments <- c(
  "chang", "climat", "adapt", "ation", "use", "hohold", "also"
)
for (fragment in common_fragments) {
  removeCommonWords <- function(x) gsub(fragment, "", x)
  corp <- tm_map(corp, content_transformer(removeCommonWords))
}

Replace special characters with space.

In [None]:
# Transformer that replaces every match of `pattern` with a single space.
# Substituting a space (rather than an empty string) keeps the words on either
# side of the matched character separate, e.g. "and/or" becomes "and or".
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corp <- tm_map(corp, toSpace, "/")
corp <- tm_map(corp, toSpace, "@")
corp <- tm_map(corp, toSpace, "Â¢")
corp <- tm_map(corp, toSpace, "â")
corp <- tm_map(corp, toSpace, "Â¬")
corp <- tm_map(corp, toSpace, "â¬")
corp <- tm_map(corp, toSpace, "Â´")
corp <- tm_map(corp, toSpace, "Â¯")

Convert the text to lower case.

In [None]:
# Lower-case the text of every document; tolower() is wrapped with
# content_transformer() so it is applied to the document content.
corp <- tm_map(x = corp, FUN = content_transformer(tolower))

In [None]:
Reduce words to their root form.

In [None]:
# Stem the words of every document.
corp <- tm_map(x = corp, FUN = stemDocument)

Remove emojis "ï¿½"

In [None]:
# Replace every non-word character (regex class \W) with a space.
corp <- tm_map(
  corp,
  content_transformer(function(x) gsub("\\W", " ", x))
)

Remove URLs.

In [None]:
# Delete "http" together with every non-space character that follows it.
removeURL <- function(x) {
  gsub(pattern = "http[^[:space:]]*", replacement = "", x = x)
}
corp <- tm_map(x = corp, FUN = content_transformer(removeURL))

Remove anything other than letters or whitespace.

In [None]:
# Delete every character that is neither alphabetic nor whitespace.
removeNumPunct <- function(x) {
  gsub(pattern = "[^[:alpha:][:space:]]*", replacement = "", x = x)
}
corp <- tm_map(x = corp, FUN = content_transformer(removeNumPunct))

In [None]:
Remove extra whitespace.

In [None]:
# Collapse the runs of whitespace introduced by the cleaning steps.
corp <- tm_map(x = corp, FUN = stripWhitespace)

### Prepare the corpus for analysis  
Convert the corpus into a Term Document Matrix.

In [None]:
# Options handed to TermDocumentMatrix(). The entries are kept in their
# original order. `bounds$global = c(3, Inf)` keeps only terms that occur in
# at least three documents.
tdm_control <- list(
  stopwords = TRUE,
  tolower = TRUE,
  stemming = TRUE,
  removeNumbers = TRUE,
  bounds = list(global = c(3, Inf))
)

# Term-by-document count matrix of the cleaned corpus.
tools.tdm <- TermDocumentMatrix(corp, control = tdm_control)

In [None]:
Convert the Term Document Matrix into a regular matrix.

In [None]:
# Number of top words to report.
N <- 10

# Dense term-by-document count matrix; summing each row gives the frequency of
# a term across the whole corpus.
m <- as.matrix(tools.tdm)
v <- sort(rowSums(m), decreasing = TRUE)
head(v, N)

# Two-column table (word, frequency) holding the N most frequent terms.
# head() is used instead of v.df[1:N, ] so that a corpus with fewer than N
# terms yields fewer rows rather than rows filled with NA.
v.df <- data.frame(
  word = as.character(names(v)),
  frequency = as.numeric(v)
)
v.df <- head(v.df, N)
v.df

Convert the matrix into a dataframe.

In [None]:
# Two-column table (word, frequency) holding the N most frequent terms of the
# sorted frequency vector `v`. head() is used instead of v.df[1:N, ] so that
# fewer than N available terms yields fewer rows rather than rows of NA.
v.df <- data.frame(
  word = as.character(names(v)),
  frequency = as.numeric(v)
)
v.df <- head(v.df, N)
v.df

### Create the bar plot  
Create the bar plot of top words.

In [None]:
library(ggplot2)
library(extrafont)

# Upper end of the frequency axis breaks: 6000 as before, extended in steps of
# 1000 when a word occurs more often than that.
y_break_max <- ceiling(max(6000, v.df$frequency, na.rm = TRUE) / 1000) * 1000

# Horizontal bar chart of the top words, ordered by frequency.
top_words_plot <- ggplot(
  data = v.df,
  aes(x = reorder(word, frequency), y = frequency)
) +
  ggtitle("Capacity building") +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(0, y_break_max, 1000)) +
  coord_flip() +
  labs(x = "Top word", y = "Frequency") +
  theme(
    plot.title = element_text(
      family = "Times New Roman", vjust = -100, hjust = .9
    ),
    panel.background = element_blank(),
    panel.grid.major = element_line(color = "grey", size = 0.2),
    panel.grid.minor = element_line(size = 0.2, color = "grey"),
    axis.text = element_text(family = "Times New Roman"),
    axis.title.x = element_text(family = "Times New Roman"),
    axis.title.y = element_text(family = "Times New Roman"),
    axis.line = element_line(colour = "black")
  )

# Write the chart to a 5 x 5 inch, 300 dpi TIFF. The plot is printed
# explicitly: a ggplot object is only drawn when printed, and automatic
# printing does not happen when this code is run through source() or inside a
# function, which would leave the file empty.
tiff("Capacity building.tiff", units = "in", width = 5, height = 5, res = 300)
print(top_words_plot)
dev.off()