In [1]:
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
# Install stm (and its dependencies) only when it is not already available,
# so re-running the notebook does not trigger a full reinstall every time.
if (!requireNamespace("stm", quietly = TRUE)) {
  install.packages("stm")
}

also installing the dependencies ‘coda’, ‘extrafontdb’, ‘Rttf2pt1’, ‘RcppEigen’, ‘statnet.common’, ‘reticulate’, ‘ISOcodes’, ‘extrafont’, ‘fastmatch’, ‘ggrepel’, ‘network’, ‘RSpectra’, ‘RcppParallel’, ‘sna’, ‘SnowballC’, ‘spacyr’, ‘stopwords’, ‘proxyC’, ‘matrixStats’, ‘slam’, ‘lda’, ‘quanteda’, ‘glmnet’

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [3]:
# Switch the working directory to the data folder; the read.csv() call later
# in this notebook uses a bare file name and therefore depends on this.
setwd('../../nowdata/')

In [4]:
# Show the current working directory as a sanity check after setwd().
getwd()

### 1. Ingest: Reading and processing text data

In [5]:
# Load the charter-school text data set from the working directory.
data <- read.csv(file = "charters_textfull_2015.csv")

In [6]:
# Discard every row that has a missing value in any column.
# Alternative subsetting form: data[complete.cases(data), ]
data <- na.omit(data)

In [7]:
# Draw a random subsample of at most 1000 rows.
# Base-R equivalent: data[sample(nrow(data), 1000), ]
#
# The seed makes the subsample (and everything fitted on it) reproducible
# across runs; the min() guard keeps the call from failing when fewer than
# 1000 complete rows remain after the NA filtering above.
set.seed(843)
data <- sample_n(data, min(1000, nrow(data)))

In [8]:
# Flag rows that still contain at least one NA and count them.
# complete.cases() works column-wise on the data frame directly, whereas
# apply(data, 1, ...) first coerces the whole frame (including the long text
# column) to a character matrix.
row.has.na <- !complete.cases(data)
sum(row.has.na)

In [9]:
# Report the number of rows and columns left after filtering and sampling.
dim(data)

In [10]:
# Inspect the column types; note that master_string (the document text) is
# shown as a factor in the output below.
str(data)

'data.frame':	1000 obs. of  60 variables:
 $ master_string          : Factor w/ 5703 levels "          agenda board members agenda charter board agenda agenda agenda agenda agenda agenda regular charter b"| __truncated__,..: 4773 3664 2047 2260 5301 3485 4575 4676 5051 4685 ...
 $ inquiry_ideology       : num  0.00151 0.01127 0 0.00447 0.00549 ...
 $ readingblur_2013       : num  5 20 10 1 5 20 1 20 1 5 ...
 $ mathblur_2013          : num  5 20 10 1 5 10 1 20 1 5 ...
 $ readingscore_2013      : num  57 30 75 75 67 50 66 30 65 57 ...
 $ mathscore_2013         : num  67 10 65 51 82 25 52 10 42 27 ...
 $ readingblur_2014       : num  5 50 5 1 5 10 1 50 1 1 ...
 $ mathblur_2014          : num  5 10 5 1 5 20 1 50 1 5 ...
 $ readingscore_2014      : num  72 25 67 61 77 45 50 25 51 14 ...
 $ mathscore_2014         : num  67 5 72 42 77 10 38 25 33 12 ...
 $ readingblur_2015       : num  5 20 5 1 5 20 1 50 1 1 ...
 $ mathblur_2015          : num  5 20 5 1 5 10 1 50 1 1 ...
 $ readingscore_2015 

In [11]:
library(tibble)

In [12]:
# Display the data as a tibble for a more compact preview; the result is not
# assigned, so `data` itself stays unchanged.
as_tibble(data)

In [13]:
library('stm')

In [None]:
# Install tm only if it is missing, so re-running the notebook does not
# reinstall it on every execution.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}

In [None]:
# Pre-process the raw text column and carry the full data frame along as
# document-level metadata.
processed <- textProcessor(
  documents = data$master_string,
  metadata = data
)

In [None]:
# Print the textProcessor() result for a quick look.
processed

In [None]:
# Build the stm inputs (documents, vocabulary, metadata) from the processed
# text, using prepDocuments()'s default thresholds.
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

In [None]:
# Convenience copies of the three components of the prepared corpus.
docs <- out[["documents"]]
vocab <- out[["vocab"]]
meta <- out[["meta"]]

### 2. Prepare: Associating text with metadata

In [None]:
# Plot how many documents, words and tokens would be removed at each
# candidate lower threshold.
plotRemoved(
  documents = processed$documents,
  lower.thresh = seq(from = 1, to = 200, by = 100)
)

In [None]:
# Rebuild the corpus with a stricter lower threshold for rare terms.
out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta,
  lower.thresh = 15
)

# Refresh the convenience copies so they describe the re-filtered corpus.
# Without this, docs/vocab/meta keep pointing at the earlier prepDocuments()
# result and can disagree with out$documents (e.g. a different number of
# documents) when they are used later in the notebook.
docs <- out$documents
vocab <- out$vocab
meta <- out$meta


### 3. Estimate: Estimating the structural topic model

In [None]:
#charterPrevFit <- stm(documents = out$documents, vocab = out$vocab,K = 20, prevalence =['poor_students', 'students_ofcolor', 'poverty_district', 'people_ofcolor_district', 'readingscore_2015', 'mathscore_2015', 'primary', 'middle', 'high', 'lnage', 'lnstudents', 'urban', 'pctpdfs', 'numwords', 'readingblur_2015', 'mathblur_2015', 'lnteachers'], max.em.its = 25, data = out$meta, init.type = "Spectral")

In [None]:
# Estimate the structural topic model.
#
# K is 20 because the later cells in this notebook address topics up to 20
# (labelTopics(..., c(3, 7, 20)), findThoughts(topics = 20),
# estimateEffect(1:20 ~ ...), plot(..., topics = c(12, 20))). With K = 2
# those calls refer to topics that do not exist in the fitted model.
# NOTE(review): max.em.its = 2 is kept as-is and looks like a smoke-test
# setting; raise it for a real fit.
# NOTE(review): confirm that `level` and `urban` are columns of out$meta.
charterPrevFit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 20,
  prevalence = ~ level + urban,
  max.em.its = 2,
  data = out$meta,
  init.type = "Spectral"
)

In [None]:
# List the factor levels of the `rating` metadata column.
# NOTE(review): `rating` does not appear in the covariate list noted further
# down in this notebook -- confirm the column exists in this data set.
levels(meta$rating)

In [None]:
# List all column names to see which covariates are available.
colnames(data)

In [None]:
# Print the first rows as plain data-frame text.
print(head(data))

In [None]:
# covariate: ['poor_students', 'students_ofcolor', 
#'poverty_district', 'people_ofcolor_district', 'readingscore_2015', 
# 'mathscore_2015', 'primary', 'middle', 'high', 'lnage', 'lnstudents', 'urban', 
#'pctpdfs', 'numwords', 'readingblur_2015', 'mathblur_2015', 'lnteachers']

# start small
# ['poor_students', 'students_ofcolor', 'poverty_district', 'people_ofcolor_district']

In [None]:
# charterPrevFit2 <- stm(documents = out$documents, vocab = out$vocab,K = 2, prevalence =~ poor_students + s(students_ofcolor) + s(poverty_district), max.em.its = 25, data = out$meta, init.type = "Spectral")

In [None]:
# charterPrevFit3 <- stm(documents = out$documents, vocab = out$vocab,K = 20, prevalence =~ poor_students + s(students_ofcolor) + s(poverty_district) + s(people_ofcolor_district) + s(readingscore_2015) + s(mathscore_2015) + s(primary) + s(middle) + s(high) + s(lnage) + s(lnstudents) + s(urban) + s(pctpdfs) + s(numwords) + s(readingblur_2015) + s(mathblur_2015) + s(lnteachers), max.em.its = 25, data = out$meta, init.type = "Spectral")

### 4. Evaluate: Model selection and search

In [None]:
# Model selection for a fixed number of topics: run several initialisations
# and keep the candidates for comparison.
charterSelect <- selectModel(
  documents = out$documents,
  vocab = out$vocab,
  K = 2,
  prevalence = ~ poor_students + s(students_ofcolor),
  max.em.its = 2,
  data = out$meta,
  runs = 2,
  seed = 843
)

In [None]:
# Pick one of the models retained by selectModel().
# selectModel() above is called with runs = 2, so the retained list cannot
# have a third element and a hard-coded [[3]] fails with "subscript out of
# bounds". Use the third model when it exists, otherwise the last available.
n_kept <- length(charterSelect$runout)
stopifnot(n_kept > 0)
selectedmodel <- charterSelect$runout[[min(3, n_kept)]]

In [None]:
# Compare the candidate models returned by selectModel() in one plot.
plotModels(
  models = charterSelect,
  pch = c(1, 2, 3, 4),
  legend.position = "bottomright"
)

In [None]:
# Model search across numbers of topics.
# The metadata must come from the same prepDocuments() result as the
# documents: `meta` was copied from an earlier `out` (before the
# lower.thresh = 15 rebuild), so use out$meta to keep rows aligned with
# out$documents.
storage <- searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = c(7, 10),
  prevalence = ~ poor_students + s(students_ofcolor),
  data = out$meta
)

### 5. Understand: Interpreting the STM by plotting and inspecting results

In [None]:
# Understand topics through their words: show the words most associated
# with topics 3, 7 and 20.
labelTopics(charterPrevFit, topics = c(3, 7, 20))

In [None]:
# Examine documents that are highly associated with topic 3.
# `shortdoc` is not defined anywhere in the visible notebook, so the texts
# are taken from the metadata that travelled with the corpus: master_string
# was passed to textProcessor() as part of `metadata`, so out$meta stays
# row-aligned with out$documents. Each text is cut to its first 200
# characters to keep the quotes plottable.
thoughts3 <- findThoughts(
  charterPrevFit,
  texts = substr(as.character(out$meta$master_string), 1, 200),
  n = 2,
  topics = 3
)$docs[[1]]

In [None]:
# Example documents for topic 20.
# `shortdoc` is not defined anywhere in the visible notebook; use the first
# 200 characters of master_string from out$meta, which is row-aligned with
# the documents the model was fitted on.
thoughts20 <- findThoughts(
  charterPrevFit,
  texts = substr(as.character(out$meta$master_string), 1, 200),
  n = 2,
  topics = 20
)$docs[[1]]

In [None]:
# Set up a 1 x 2 plotting grid with tight margins for the quote plots.
par(
  mfrow = c(1, 2),
  mar = c(0.5, 0.5, 1, 0.5)
)

In [None]:
# Show the example documents for topic 3.
plotQuote(
  sentences = thoughts3,
  width = 30,
  main = "Topic 3"
)

In [None]:
# Show the example documents for topic 20.
plotQuote(
  sentences = thoughts20,
  width = 30,
  main = "Topic 20"
)

In [None]:
# Make sure `rating` is a factor before it is used as a covariate.
# NOTE(review): `rating` is not among the covariates listed elsewhere in this
# notebook -- confirm the column exists in out$meta.
out$meta$rating <- as.factor(out$meta$rating)

In [None]:
# Estimate metadata/topic relationships: regress topic proportions on the
# covariates, using global uncertainty estimates.
# The metadata argument is spelled out in full (`metadata =`) instead of
# relying on partial argument matching of `meta =`.
# NOTE(review): `rating` and `day` are not among the covariates listed
# elsewhere in this notebook -- confirm they exist in out$meta.
prep <- estimateEffect(
  1:20 ~ rating + s(day),
  charterPrevFit,
  metadata = out$meta,
  uncertainty = "Global"
)

In [None]:
# Regression table for the first topic.
summary(prep, topics = 1)

### 6. Visualize: Presenting STM results

In [None]:
# Summary visualization: graphical display of estimated topic proportions.
plot(
  charterPrevFit,
  type = "summary",
  xlim = c(0, 0.3)
)

In [None]:
# Topical content: difference in topic prevalence between two values of the
# `rating` covariate for topics 3, 7 and 20.
# NOTE(review): the covariate values, axis text and custom labels look like
# they were carried over from a political-blog example -- confirm they apply
# to this data set.
plot(
  prep,
  covariate = "rating",
  topics = c(3, 7, 20),
  model = charterPrevFit,
  method = "difference",
  cov.value1 = "Liberal",
  cov.value2 = "Conservative",
  xlab = "More Conservative ... More Liberal",
  main = "Effect of Liberal vs. Conservative",
  xlim = c(-0.1, 0.1),
  labeltype = "custom",
  custom.labels = c("Obama", "Sarah Palin", "Bush Presidency")
)

In [None]:
# Plot topic 7's prevalence as a continuous function of `day`; the x axis is
# suppressed so that custom labels can be drawn afterwards.
# `model` now refers to the fitted model object used by the other plots in
# this notebook; the previous value `z` is not defined anywhere in it.
plot(
  prep,
  covariate = "day",
  method = "continuous",
  topics = 7,
  model = charterPrevFit,
  printlegend = FALSE,
  xaxt = "n",
  xlab = "Time (2008)"
)

In [None]:
# First day of every month in 2008, used to place the x-axis ticks.
monthseq <- seq(
  from = as.Date("2008-01-01"),
  to = as.Date("2008-12-01"),
  by = "month"
)

In [None]:
# Label the x axis with month names, positioned as day offsets from the
# first month in the sequence.
monthnames <- months(monthseq)
axis(
  side = 1,
  at = as.numeric(monthseq) - min(as.numeric(monthseq)),
  labels = monthnames
)

# Graphical display of topic prevalence.

In [None]:
# Fit a model in which poor_students is both a prevalence covariate and the
# topical-content covariate.
charterContent <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 20,
  prevalence = ~ poor_students + s(students_ofcolor),
  content = ~ poor_students,
  max.em.its = 75,
  data = out$meta,
  init.type = "Spectral"
)

In [None]:
# Graphical display of topical perspectives for topic 11 of the content
# model.
plot(
  charterContent,
  type = "perspectives",
  topics = 11
)

In [None]:
# Graphical display of topical contrast between topics 12 and 20.
plot(
  charterPrevFit,
  type = "perspectives",
  topics = c(12, 20)
)

In [None]:
# Plotting covariate interactions