In [1]:
#NLTK is the NLP package we're using
import nltk
#OS is for navigating directories on the computer
import os

In [2]:
#Full path to the folder that holds all your texts
#On Mac it looks like what you see below
#On Windows it looks more like 'C:\\Users\\YOUR-USER-NAME\\Documents'
textdir = '/Users/qad/Documents/dsc/dsc_corpus_clean'
#Full path to the folder where the output files will be stored. This folder has to exist already.
outdir = '/Users/qad/Documents/dsc/dsc_nouns'
#Make the folder with your texts the current working directory
os.chdir(textdir)

In [3]:
#Part-of-speech tags to keep: NN and NNS are the tags NLTK gives to common nouns
#You can sub in other parts of speech, too
#Just comment out the noun line, and uncomment one of these
wanted_tags = ('NN', 'NNS')
#Adverbs
#wanted_tags = ('RB', 'RBR', 'RBS')
#Adjectives
#wanted_tags = ('JJ', 'JJR', 'JJS')
#Verbs
#wanted_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

#For each filename in the directory you listed (sorted, so every run goes in the same order)...
for filename in sorted(os.listdir(textdir)):
    #Skip anything that doesn't end with .txt
    if not filename.endswith('.txt'):
        continue
    #Create an output name that swaps the final '.txt' for '-nouns.txt'
    #(slicing off the ending, so a '.txt' in the middle of a filename is left alone)
    outname = filename[:-len('.txt')] + '-nouns.txt'
    outpath = os.path.join(outdir, outname)
    #Read the text using the full path, so this works no matter what the current directory is
    #The texts are assumed to be UTF-8; change the encoding here if yours are not
    with open(os.path.join(textdir, filename), 'r', encoding='utf-8') as f:
        text = f.read()
    #Open the output file only after the text was read successfully,
    #so a file that can't be read doesn't leave an empty output file behind
    with open(outpath, 'w', encoding='utf-8') as out:
        #For each sentence that NLTK finds in the text...
        for sentence in nltk.sent_tokenize(text):
            #For each word and each part-of-speech tag that you get
            #when NLTK tokenizes the sentence (splitting words from punctuation, etc.)
            for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence)):
                #If the part-of-speech is one of the tags you want to keep
                if pos in wanted_tags:
                    #Write the word, plus a space so the words don't smush together
                    out.write(word + ' ')

In [4]:
#Imports the sys module so the install command below can point at this notebook's own Python
import sys
#Installs rpy2 (you only need to do this the first time you run the notebook)
#Running pip through sys.executable installs the package for the same Python that runs this notebook
!{sys.executable} -m pip install rpy2



In [5]:
#Load the rpy2 extension so you can run R code
%load_ext rpy2.ipython

In [6]:
%%R
#A function that takes a text chunk as an input and returns its words, cleaned up
#Input: text.chunk, a character vector (one string or several)
#Output: a character vector of lower-case words made up only of the letters a-z
cleanChunk <- function(text.chunk){
  #Joins all the pieces of the input into one string, with a space between pieces
  text.chunk <- paste(text.chunk, collapse=" ")
  #Turns tabs, line breaks, and any other whitespace into plain spaces
  #Otherwise they would be thrown out below and the words on either side would get glued together
  text.chunk <- gsub("[[:space:]]+", " ", text.chunk)
  #Makes all text lower-case
  text.chunk <- tolower(text.chunk)
  #Splits the text into individual characters
  text.chars <- unlist(strsplit(text.chunk, ""))
  #Keeps only the letters a-z and spaces
  #Everything else (digits, punctuation, accented letters) is thrown out
  text.chars <- text.chars[text.chars %in% c(letters, " ")]
  #Pastes the vector of individual letters back together to form one giant string
  text.chunk <- paste(text.chars, collapse="")
  #Splits everything based on spaces so we have a vector of words instead of letters
  text.clean <- unlist(strsplit(text.chunk, " ", fixed=TRUE))
  #Drops the empty "words" that appear wherever there were several spaces in a row
  text.clean <- text.clean[nzchar(text.clean)]
  #Returns the clean text chunk
  return(text.clean)
}

In [7]:
%%R
#Runs the Typicality analysis: a PCA on how often each text uses the corpus's most frequent terms
#Inputs:
#  corpus.dir     - path to the folder that holds the text files
#  meta.data.file - data frame with a Filename column (one row per text in corpus.dir);
#                   all of its columns are copied into the output table
#  output.dir     - folder where the two CSV files get written
#  top.words      - rank of the last most-frequent term to use (defaults to the global evaluated.words)
#  corpus.name    - label used to build the names of the output files
#This function also reads the global variable starts.with (rank of the first term to use),
#so starts.with has to be defined before you call it
#Outputs:
#  writes top_terms_<corpus.name>.csv and <corpus.name>_pca.csv into output.dir
#  returns (invisibly) the same table that is written to <corpus.name>_pca.csv
corpusTypicality<-function(corpus.dir, meta.data.file, output.dir, top.words = evaluated.words, corpus.name){
  #Loads the text mining package
  library(tm)
  #Sets up a path for each file, with the corpus directory + filename
  all.files<-file.path(corpus.dir, meta.data.file$Filename)
  #Reads each file line by line, applies Mark's special cleaning function,
  #and pastes the cleaned words back together, giving one long string per text
  all.texts<-vapply(all.files, function(x) {
    text.lines<-scan(x, what="character", sep="\n", quiet=TRUE)
    paste(cleanChunk(paste(text.lines, collapse=" ")), collapse=" ")
  }, character(1), USE.NAMES=FALSE)
  #Defines text.corpus as a vectorization of all the texts
  text.corpus<-Corpus(VectorSource(all.texts))
  #Creates a document-term matrix from the vectorization and turns it into a regular matrix
  #NOTE(review): this uses tm's default settings; check whether those defaults leave out very short words
  text.dtm<-as.matrix(DocumentTermMatrix(text.corpus))
  #Defines word frequency by sorting the words in order of occurrence
  word.freqs<-sort(colSums(text.dtm), decreasing=TRUE)
  #Defines top terms as being the # of terms that you specify when you run the code
  #The "starts.with", which you also specify when you run the code, lets you exclude the most-frequent terms
  #This might be useful if you're using all words (e.g. not just nouns) and want to
  #exclude the highest-frequency words that are mostly indicative of author signal (e.g. 'the', 'a', 'and')
  #If the corpus has fewer distinct terms than top.words, it just uses all the terms there are
  last.term<-min(top.words, length(word.freqs))
  #Stops with a clear message instead of silently picking the wrong terms
  if (starts.with < 1 || starts.with > last.term) {
    stop("starts.with (", starts.with, ") has to be between 1 and ", last.term,
         " (the last available term rank)")
  }
  top.terms<-names(word.freqs)[starts.with:last.term]
  #Writes a CSV with the top terms that it's using for its analysis
  write.csv(x = top.terms, file = file.path(output.dir, paste0('top_terms_', corpus.name, '.csv')))
  #Converts word counts to frequencies, dividing by book length
  #Otherwise, books would look different from one another just because of different length
  dtm.scale<-text.dtm/rowSums(text.dtm)
  #Reshapes the document/term matrix to just columns with frequencies for the top words
  dtm.mfw<-dtm.scale[, colnames(dtm.scale) %in% top.terms, drop=FALSE]
  #Computes PCA coordinates using the words in the range you specified for analysis
  #and keeps just the first two components, PC1 and PC2
  pca.coords<-prcomp(dtm.mfw)$x[,1:2]
  #Names the columns in your output file
  colnames(pca.coords)<-c("PC1", "PC2")
  #Creates a dataframe (R spreadsheet) with the coordinates plus the info from the metadata file in the input
  final.table<-data.frame(pca.coords, meta.data.file)
  #Calculates the mean and the median X value
  x.add<-c(mean(pca.coords[,1]), median(pca.coords[,1]))
  #Calculates the mean and the median Y value
  y.add<-c(mean(pca.coords[,2]), median(pca.coords[,2]))
  #Creates 2 rows of empty (NA) values to sit under the metadata columns
  empty.rows<-matrix(NA, nrow=2, ncol=ncol(meta.data.file))
  #Adds the rows with the calculated mean and median values to the final dataframe
  add.table<-data.frame(x.add, y.add, empty.rows)
  colnames(add.table)<-colnames(final.table)
  final.table<-rbind(final.table, add.table)
  #Puts the value 'text' for all your original text files in the final dataframe
  type.vector<-rep("text", nrow(final.table))
  #Puts the values 'mean' and 'median' in the final dataframe for your calculated values
  type.vector[c(length(type.vector)-1, length(type.vector))]<-c("mean", "median")
  #Adds the Type column to the final table
  final.table$Type<-type.vector
  #Defines the path for the PCA output
  out.filename<-file.path(output.dir, paste0(corpus.name, "_pca.csv"))
  #Writes the final dataframe to your output file
  write.csv(final.table, out.filename, row.names=FALSE)
  #Hands the table back (without printing it) so it can be stored in a variable
  invisible(final.table)
}

In [8]:
%%R
#How many top words should it use?
evaluated.words <- 1000
#How many of the very most frequent words should it skip? To skip 0, starts.with = 1.
#To skip 50, starts.with = 51
starts.with <- 1
#Put the path to the folder containing the folder with your texts here
setwd('~/Documents/dsc')
#Put the name of the folder with your texts here
Filename<-list.files('dsc_pca_all_words')
#If you want to strip anything off the text filename for the title besides .txt you can change this
#sub() removes only a literal ".txt" at the very end of each name and always gives back
#exactly one title per filename (strsplit treated ".txt" as a pattern where "." matches any
#character, so it could cut a name into several pieces and break the metadata table)
Title<-sub("\\.txt$", "", Filename)
#Creates the metadata table
dsc.meta<-data.frame(Filename, Title, stringsAsFactors = FALSE)
#Runs the Typicality code
means.text<-corpusTypicality('dsc_pca_all_words', dsc.meta, ".", corpus.name="dsc_pca_all_words")

R[write to console]: Loading required package: NLP



In [9]:
#Defines a function that takes in a list of points and a number labeled K
#K is just the number of closest points to 0,0 that you want
#Because there's a pseudo-point at 0,0 you need to add 1 to the number of results you want
def pClosest(points, K):
    """Return the K points that are closest to 0,0, nearest first.

    points: a list of [x, y] pairs (anything you can index with [0] and [1]).
    K: how many points to return; if K is bigger than the list, you get every point.

    The list you pass in is left exactly as it was; the result is a new list.
    """
    #sorted() builds a new, sorted list instead of rearranging the caller's list
    #The key is each point's squared distance from 0,0 (x squared + y squared)
    #Squared distance puts the points in the same order as real distance, so no square root is needed
    by_distance = sorted(points, key=lambda point: point[0]**2 + point[1]**2)
    #Returns the first K points
    return by_distance[:K]
 
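#I created this list by copying the PC1 and PC2 columns
#from the CSV generated by Typicality and pasting them into a plain text file
#Then I used regular expressions to put all the values on a single line
#and to surround each pair of values with square brackets
#Finally, I pasted that line between the outer pair of brackets below
#NOTE: Typicality adds a "mean" row and a "median" row at the bottom of its CSV;
#if you copy those rows too, they will show up in the results as if they were texts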
points = [[0.00469161, -0.001991805], [0.009683978, -0.008615327], [0.008572655, -0.001824667], [0.003675937, -0.011829725], [0.011229949, -0.00992014], [0.005954947, -0.010017583], [-0.004476714, -0.01351926], [0.002374966, -0.00757018], [-0.008789731, -0.003048114], [0.01216702, -0.004200922], [0.003722121, -0.006907427], [0.00769834, -0.008655584], [0.002334449, -0.003363331], [0.004529415, -0.012335336], [0.003864004, -0.013008246], [0.00435057, -0.004221983], [-0.004951219, -0.008742653], [0.002067941, -0.001514505], [-0.004736377, -0.009282298], [0.004368298, -0.007480759], [-0.029222638, -0.005909872], [0.002068301, -0.008848089], [0.003693331, -0.006555199], [0.00411114, -0.003276592], [-0.011580047, -0.021221971], [0.008307237, -0.003467732], [0.005365445, -0.01388096], [-0.005972513, -0.005758532], [0.011382165, -0.011022268], [0.010832945, -0.011559693], [0.014017606, -0.011837172], [0.016100764, -0.014274526], [-0.00731022, -0.011255925], [0.003028445, -0.008666056], [-0.003230194, 0.002504059], [0.012346846, -0.005404243], [0.004239089, -0.007702936], [-0.00419579, 0.003379739], [-0.00741384, -0.002815382], [0.001717428, -0.016452399], [0.002336378, -0.000895452], [0.011563584, -0.006602793], [-0.001320436, 0.002626512], [0.012051506, -0.005411988], [-0.019932552, -0.01264996], [0.001227889, -0.004654948], [0.00266858, -0.007009829], [0.001563455, -0.006563461], [-0.01458836, -0.022989674], [-0.00080952, 0.002674865], [0.004850349, 0.000553782], [-0.001138218, -0.004883547], [0.00295566, -0.009086473], [-0.006439591, 0.007926304], [-0.006096543, 0.001602647], [-0.010709235, 0.0067662], [-0.01830948, -0.019530052], [-0.011294579, -0.006774717], [0.004138995, -0.011339406], [-0.004755756, -0.001694357], [0.002336382, 0.005766334], [-0.015953166, -0.005859408], [0.003469975, 0.004515667], [0.002782578, 0.003146166], [-0.005057231, -0.001876251], [-0.002433529, 0.005469585], [0.00479866, 0.002788323], [-0.005007354, 0.002511677], [0.001730607, 
-0.010251528], [-0.002083167, -0.004600916], [-0.003478521, 0.009355717], [-0.005567839, -0.001020918], [0.006126912, 0.000761974], [-0.005477061, -0.001228646], [-0.011217858, 0.015344681], [-0.012396735, 0.00269082], [0.003959916, 0.00701705], [0.004070298, 0.001824347], [0.001692785, 0.000870467], [0.000318332, 0.001547991], [-0.00697194, -0.00442188], [-0.001555259, -0.006890195], [0.007690274, 0.001026254], [0.000877743, 0.002933271], [-0.01740308, -0.012584651], [-0.007681429, 0.002597892], [-0.006255462, 0.002287488], [0.001601807, 0.00326431], [0.00099986, -0.00619557], [-0.011019061, 0.004453192], [0.000440062, 0.003726049], [-0.007006778, -0.002461165], [0.003062121, 0.000208577], [-0.004434383, 0.002612386], [0.005557097, -0.006909485], [-0.007126046, 0.004257314], [0.001582076, 0.001893182], [0.005853964, 0.007973258], [-0.002583916, 0.001821899], [-0.001925526, -0.001745704], [0.000528997, 0.000988793], [-0.00906454, -0.002320243], [0.002288717, 0.008179182], [-0.001072654, 0.001148261], [-0.006881697, 0.000643985], [-0.003972353, 0.011186226], [-0.003218817, 0.005512333], [-0.012851412, 0.008641159], [-0.014770899, 0.000375218], [-0.003848227, 0.009104553], [-0.01544207, 0.017326715], [-0.009265123, 0.00187548], [-0.005045982, 0.003727476], [-0.008747113, 0.01135271], [-0.004690856, -0.001217952], [0.002234854, 0.005470105], [-0.001935213, 0.000868962], [0.003565061, -0.005405885], [-0.00277459, 0.009971733], [-0.002803215, 0.004841541], [-0.004941038, -0.003273866], [-0.000189547, -0.000237985], [-0.030116569, -0.004371898], [0.011480428, 0.002074561], [0.003407392, -0.010078104], [-0.005161328, -0.008442468], [0.009710449, -0.011331549], [0.001861404, 0.008892], [0.003292164, 0.000176213], [-0.007856082, 0.012355752], [-0.010922237, -0.001119533], [0.010231453, -0.004426342], [0.00156204, 0.002406567], [0.005624661, 0.014677844], [0.001555394, 0.01631771], [0.001483046, 0.014346892], [0.009109574, 0.014295803], [0.011900749, 0.009787263], 
[0.019760064, 0.000107728], [0.008921484, 0.007658414], [-0.003170581, 0.016751233], [0.004796574, 0.012823444], [0.014521908, 0.005004447], [0.017448747, -0.002471576], [0.004690886, 0.013678724], [0.007671858, 0.015052376], [0.007775464, 0.012239997], [0.005974707, 0.008708258], [0.005910541, 0.00634788], [0.005887654, 0.004801132], [0.005234945, 0.005199604], [0.000384196, 0.002336877], [0.001403685, 0.006222739], [0.002063593, 0.010695518], [0.010148192, 0.011815938], [-0.000342095, 0.01598559], [0.00791066, 0.005526686], [0.014481919, 0.005500372], [0.003914503, 0.000788317], [0.007918963, 0.004980043], [-0.005013202, -0.00634705], [0.013641678, -0.0087342], [0.002236322, -0.007810783], [0.012818793, -0.002390494], [-0.007197174, -0.004887106], [0.010277143, -0.003656188], [0.007382163, -0.00559317], [-0.008926255, 0.003682763], [0.000317981, 0.004758479], [-0.002973747, -0.009079573], [0.002845861, 0.004316752], [5.12E-05, 0.001014361], [-0.007731514, -0.000235443], [-0.008898852, -0.012512811], [-0.01206524, -0.007011903], [0.00023655, 0.011240322], [0.001588484, 0.001445216], [-0.008266009, -0.004680667], [0.00307885, -0.003371266], [-0.000912399, 0.003776058], [-0.006292082, 0.006241512], [-0.009667708, 0.003702167], [-0.008344899, 0.002691967], [-0.003024284, -0.002433902], [0.001326754, 0.001980956], [-0.008992753, 0.008488592], [-0.007901606, -0.009318342], [0.003162828, 0.003753576], [0.012340701, -0.006548767], [-0.002462029, 0.002719182], [-0.005245633, 0.011166358], [0.001122942, -0.004325161], [0.007885139, 3.80E-05], [-0.005447609, 0.004278499], [0.002932201, 0.000646119], [0.000974077, 0.003803959], [0.005905315, 0.004991715], [0.008084161, -0.002333118], [0.00359464, -0.000344437], [0.012753753, -0.007773522], [0.009082452, -0.000682023], [0.003643154, -0.000731967], [0.00532418, -0.002176075], [-0.006365006, 0.005248339], [-0.004233163, 0.009463866], [0.002898313, -5.87E-05], [-0.003666059, -0.004650337], [0.006001644, 0.007078763], 
[0.00160294, 0.007807623], [0.006898097, 0.007918056], [-0.001876034, 0.007227341], [-0.004919251, 0.009637717], [-0.019849637, -0.002769239], [-0.003061415, 0.001588581], [0.005411868, 0.005390671], [-0.001663993, 0.002204526], [0.006599563, -0.003209579], [-0.000430911, 0.00798643], [-0.017295061, -0.003548918], [-0.008748826, 0.003822645], [0.010859657, -0.00694325], [0.000636576, 0.001555386], [-0.005728491, 0.008800884], [-0.001519371, 0.013963409], [-0.003264341, 0.009046577], [-1.12E-18, -3.38E-19], [0.001326754, 0.000643985]]

#Number of points to get back
#Add 1 to the number you actually want if you're using Typicality,
#because Typicality has a pseudo-point that's basically at 0,0
K = 21

#Finds the closest points, then prints them
nearest_points = pClosest(points, K)
print(nearest_points)

[[-1.12e-18, -3.38e-19], [-0.000189547, -0.000237985], [5.12e-05, 0.001014361], [0.000528997, 0.000988793], [0.001326754, 0.000643985], [-0.001072654, 0.001148261], [0.000318332, 0.001547991], [0.000636576, 0.001555386], [0.001692785, 0.000870467], [-0.001935213, 0.000868962], [0.001588484, 0.001445216], [0.000384196, 0.002336877], [0.001326754, 0.001980956], [0.001582076, 0.001893182], [0.002336378, -0.000895452], [0.002067941, -0.001514505], [-0.001925526, -0.001745704], [-0.001663993, 0.002204526], [-0.00080952, 0.002674865], [0.00156204, 0.002406567], [0.002898313, -5.87e-05]]


In [10]:
%%R
#Runs the same PCA as the Typicality code, but draws a biplot instead of writing coordinates
#Inputs:
#  corpus.dir     - path to the folder that holds the text files
#  meta.data.file - data frame with a Filename column (one row per text in corpus.dir)
#  output.dir     - folder where the top-terms CSV and the biplot PDF get written
#  top.words      - rank of the last most-frequent term to use (defaults to the global evaluated.words)
#  corpus.name    - label used to build the name of the top-terms CSV
#This function also reads the global variable starts.with (rank of the first term to use),
#so starts.with has to be defined before you call it
#Outputs:
#  writes top_terms_<corpus.name>.csv and bsc_topnouns_biplot.pdf into output.dir
corpusTypicalityBiplot <- function(corpus.dir, meta.data.file, output.dir, top.words = evaluated.words, corpus.name){
  #Loads the text mining package
  library(tm)
  #Sets up a path for each file, with the corpus directory + filename
  all.files <- file.path(corpus.dir, meta.data.file$Filename)
  #Reads each file line by line, applies Mark's special cleaning function,
  #and pastes the cleaned words back together, giving one long string per text
  all.texts <- vapply(all.files, function(x) {
    text.lines <- scan(x, what="character", sep="\n", quiet=TRUE)
    paste(cleanChunk(paste(text.lines, collapse=" ")), collapse=" ")
  }, character(1), USE.NAMES=FALSE)
  #Defines text.corpus as a vectorization of all the texts
  text.corpus <- Corpus(VectorSource(all.texts))
  #Creates a document-term matrix from the vectorization and turns it into a regular matrix
  #NOTE(review): this uses tm's default settings; check whether those defaults leave out very short words
  text.dtm <- as.matrix(DocumentTermMatrix(text.corpus))
  #Defines word frequency by sorting the words in order of occurrence
  word.freqs <- sort(colSums(text.dtm), decreasing=TRUE)
  #Defines top terms as being the # of terms that you specify when you run the code
  #The "starts.with", which you also specify when you run the code, lets you exclude the most-frequent terms
  #This might be useful if you're using all words (e.g. not just nouns) and want to
  #exclude the highest-frequency words that are mostly indicative of author signal (e.g. 'the', 'a', 'and')
  #If the corpus has fewer distinct terms than top.words, it just uses all the terms there are
  last.term <- min(top.words, length(word.freqs))
  #Stops with a clear message instead of silently picking the wrong terms
  if (starts.with < 1 || starts.with > last.term) {
    stop("starts.with (", starts.with, ") has to be between 1 and ", last.term,
         " (the last available term rank)")
  }
  top.terms <- names(word.freqs)[starts.with:last.term]
  #Writes a CSV with the top terms that it's using for its analysis
  write.csv(x = top.terms, file = file.path(output.dir, paste0('top_terms_', corpus.name, '.csv')))
  #Converts word counts to frequencies, dividing by book length
  #Otherwise, books would look different from one another just because of different length
  dtm.scale <- text.dtm/rowSums(text.dtm)
  #Reshapes the document/term matrix to just columns with frequencies for the top words
  dtm.mfw <- dtm.scale[, colnames(dtm.scale) %in% top.terms, drop=FALSE]
  #Computes PCA coordinates using the words in the range you specified for analysis
  pca.coords <- prcomp(dtm.mfw)
  #THIS IS THE NEW PART!
  #Creates a PDF output for the biplot, inside the output folder you specified
  pdf(file.path(output.dir, 'bsc_topnouns_biplot.pdf'), height=100, width=100)
  #Creates the biplot
  #If drawing fails, the PDF is closed before the error is passed along, so it isn't left open
  tryCatch(biplot(pca.coords), error = function(e) { dev.off(); stop(e) })
  #Closes the PDF so it gets saved
  dev.off()
}

In [11]:
%%R
#How many top words should it use?
evaluated.words <- 1000
#How many of the very most frequent words should it skip? To skip 0, starts.with = 1.
#To skip 50, starts.with = 51
starts.with <- 1
#Put the path to the folder containing the folder with your texts here
setwd('~/Documents/dsc')
#Put the name of the folder with your texts here
Filename<-list.files('dsc_nouns')
#If you want to strip anything off the text filename for the title besides .txt you can change this
#sub() removes only a literal ".txt" at the very end of each name and always gives back
#exactly one title per filename (strsplit treated ".txt" as a pattern where "." matches any
#character, so it could cut a name into several pieces and break the metadata table)
Title<-sub("\\.txt$", "", Filename)
#Creates the metadata table
dsc.meta<-data.frame(Filename, Title, stringsAsFactors = FALSE)
#Runs the Typicality biplot code
means.text<-corpusTypicalityBiplot('dsc_nouns', dsc.meta, ".", corpus.name="dsc_nouns")

In [12]:
%%R
#Runs the same PCA biplot as before, but scales every term's frequencies using z-scores first
#and labels the points with the titles from the metadata table
#Inputs:
#  corpus.dir     - path to the folder that holds the text files
#  meta.data.file - data frame with Filename and Title columns (one row per text in corpus.dir)
#  output.dir     - folder where the top-terms CSV and the biplot PDF get written
#  top.words      - rank of the last most-frequent term to use (defaults to the global evaluated.words)
#  corpus.name    - label used to build the name of the top-terms CSV
#This function also reads the global variable starts.with (rank of the first term to use),
#so starts.with has to be defined before you call it
#Outputs:
#  writes top_terms_<corpus.name>.csv and bsc_topnouns_biplot_scaled.pdf into output.dir
corpusTypicalityBiplotZscore <- function(corpus.dir, meta.data.file, output.dir, top.words = evaluated.words, corpus.name){
  #Loads the text mining package
  library(tm)
  #Sets up a path for each file, with the corpus directory + filename
  all.files <- file.path(corpus.dir, meta.data.file$Filename)
  #Reads each file line by line, applies Mark's special cleaning function,
  #and pastes the cleaned words back together, giving one long string per text
  all.texts <- vapply(all.files, function(x) {
    text.lines <- scan(x, what="character", sep="\n", quiet=TRUE)
    paste(cleanChunk(paste(text.lines, collapse=" ")), collapse=" ")
  }, character(1), USE.NAMES=FALSE)
  #Defines text.corpus as a vectorization of all the texts
  text.corpus <- Corpus(VectorSource(all.texts))
  #Creates a document-term matrix from the vectorization and turns it into a regular matrix
  #NOTE(review): this uses tm's default settings; check whether those defaults leave out very short words
  text.dtm <- as.matrix(DocumentTermMatrix(text.corpus))
  #Defines word frequency by sorting the words in order of occurrence
  word.freqs <- sort(colSums(text.dtm), decreasing=TRUE)
  #Defines top terms as being the # of terms that you specify when you run the code
  #The "starts.with", which you also specify when you run the code, lets you exclude the most-frequent terms
  #This might be useful if you're using all words (e.g. not just nouns) and want to
  #exclude the highest-frequency words that are mostly indicative of author signal (e.g. 'the', 'a', 'and')
  #If the corpus has fewer distinct terms than top.words, it just uses all the terms there are
  last.term <- min(top.words, length(word.freqs))
  #Stops with a clear message instead of silently picking the wrong terms
  if (starts.with < 1 || starts.with > last.term) {
    stop("starts.with (", starts.with, ") has to be between 1 and ", last.term,
         " (the last available term rank)")
  }
  top.terms <- names(word.freqs)[starts.with:last.term]
  #Writes a CSV with the top terms that it's using for its analysis
  write.csv(x = top.terms, file = file.path(output.dir, paste0('top_terms_', corpus.name, '.csv')))
  #Converts word counts to frequencies, dividing by book length
  #Otherwise, books would look different from one another just because of different length
  dtm.scale <- text.dtm/rowSums(text.dtm)
  #Reshapes the document/term matrix to just columns with frequencies for the top words
  dtm.mfw <- dtm.scale[, colnames(dtm.scale) %in% top.terms, drop=FALSE]
  #Computes PCA coordinates using the words in the range you specified for analysis
  #LITERALLY JUST ADD 'scale.=TRUE' TO THIS LINE TO SCALE IT USING Z-SCORES
  pca.coords <- prcomp(dtm.mfw, scale.=TRUE)
  #Creates a PDF output for the biplot, inside the output folder you specified
  pdf(file.path(output.dir, 'bsc_topnouns_biplot_scaled.pdf'), height=100, width=100)
  #Creates the biplot, labeling each text with its title from the metadata table
  #If drawing fails, the PDF is closed before the error is passed along, so it isn't left open
  tryCatch(biplot(pca.coords, xlabs = meta.data.file$Title),
           error = function(e) { dev.off(); stop(e) })
  #Closes the PDF so it gets saved
  dev.off()
}

In [13]:
%%R
#How many top words should it use?
evaluated.words <- 1000
#How many of the very most frequent words should it skip? To skip 0, starts.with = 1.
#To skip 50, starts.with = 51
starts.with <- 1
#Put the path to the folder containing the folder with your texts here
setwd('~/Documents/dsc')
#Put the name of the folder with your texts here
Filename<-list.files('dsc_nouns')
#If you want to strip anything off the text filename for the title besides .txt you can change this
#sub() removes only a literal ".txt" at the very end of each name and always gives back
#exactly one title per filename (strsplit treated ".txt" as a pattern where "." matches any
#character, so it could cut a name into several pieces and break the metadata table)
Title<-sub("\\.txt$", "", Filename)
#Creates the metadata table
dsc.meta<-data.frame(Filename, Title, stringsAsFactors = FALSE)
#Runs the z-score version of the Typicality biplot code
means.text<-corpusTypicalityBiplotZscore('dsc_nouns', dsc.meta, ".", corpus.name="dsc_nouns")