In [1]:
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from collections import Counter
from spellchecker import SpellChecker

In [3]:
def readData(infilePathAndName):
    """
    Takes in the preprocessed/precleaned wordset that I wrote and returns a list of lists
    containing the words tokenized into sentences (lists) and words (elements of lists)
    
    
    """
    
    with open(infilePathAndName, "r") as f:
        string = f.read()
        ls1 = string.split("\n")
        finalLs = []
        for index, string in enumerate(ls1):
            subLs = string.split(" ")
            finalLs.append(subLs)
            
    return(finalLs)
    

In [4]:
# create the ls of ls as dataset
salientWords = readData("salientWords.txt")

# check to see if ls is same len after reading in data from doc
len(salientWords) == 3101

False

In [5]:
# document is not same length. extra item was thrown in. pop this item to remove it from list
salientWords.pop(3101)

['']

In [7]:
# check len again
len(salientWords) == 3101

True

In [8]:
# the following code creates a dictionary called documentFrequency which determines the total use of 
# individual/unique words across *all* documents
documentIndex = {} 
for sent_index, sent in enumerate(salientWords):
    for word_index, word in enumerate(sent):
        try:
            # checks to see if there is an existing index and, if there is & the value of the 
            # sentence doesn't already exist, it adds the value
            documentIndex[word].add(sent_index)
        except:
            # if there is not an existing index, it creates one and *then* adds the value
            documentIndex[word] = {sent_index}

# documentIndex dictionary is indexed by the unique word. each entry is the sentence (document) #

documentFrequency = {}

for dictIndex, dictElement in enumerate(documentIndex):
    documentFrequency[dictElement] = len(documentIndex[dictElement]) # add the number
    # of documents that contain this index (word) to the new dict.
    
# documentFrequency is a dictionary containing 'unique_word': howManySentencesItOccursIn

In [13]:
# calculate total documents/sent
totalDocuments = len(salientWords)
print("There are %d total documents/sentences in the book"%(totalDocuments))

# calculate total words
wordLen = 0
for i,e in enumerate(salientWords):
    wordLen += len(e)
print("After the data was cleaned and processed there were %d total words remaining"%(wordLen))

There are 3101 total documents/sentences in the book
After the data was cleaned and processed there were 30970 total words remaining


In [20]:
termFrequencyLs = [] # create ls for term frequency (TF)
inverseDocFrequencyLs = [] # create ls for inverse document frequency (IDF)
tfidfLs = [] # create ls for term frequency-inverse document frequency (TF-IDF)
for sent_index, sent in enumerate(salientWords): # loop through each document
    # create a sep list for each document's stats
    tempTermFrequency = [] 
    tempInverseDocFrequency = []
    tempTfidf = []
    docLength = len(sent) # length of the current document (how many words in this sentence)
    counts = Counter(sent) # create a counter for unique values and their frequency per document
    for word_index, word in enumerate(sent): # iterate through each word in the current document
        frequencyInDoc = counts[word] # frequency of current word
        termFrequency = frequencyInDoc / docLength # term frequency for this word in this doc
        tempTermFrequency.append(termFrequency) 
        
        
        inverseDocFrequency = math.log((totalDocuments/(documentFrequency[word]))) # inverse doc
        # frequency for this word in this doc
        tempInverseDocFrequency.append(inverseDocFrequency)
        
        tfidf = (termFrequency * inverseDocFrequency) #tfidf for this word in this doc
        tempTfidf.append(tfidf)
    
    # append the list of the current document to the big-boy list
    termFrequencyLs.append(tempTermFrequency)
    inverseDocFrequencyLs.append(tempInverseDocFrequency)
    tfidfLs.append(tempTfidf)

In [42]:
## interesting. The following prints the top three TFIDF value for each document (i.e. sentence)
## SUPER INTERESTING. THE CODE BELOW GIVES A LOT OF MEANING IN THREE WORDS PER SENTENCE.
maximumIndexLs = []

for index, ls in enumerate(tfidfLs):
    #maximumIndex = np.argmax(ls)
    #maximumIndexLs.append(maximumIndex)
    #topThree = []
    n = 3
    topThreeIndeces = sorted(range(len(ls)), key = lambda sub: ls[sub])[-n:] # some crazy ass lambda function I found that grabs the indecies of the
    # three largest items. if someone can explain to me how this is working that would be lovely. all i know is that it works!
    # print(topThreeIndeces)
    maximumIndexLs.append(topThreeIndeces)
        
sentenceNum = []
wordOne = []
wordTwo = []
wordThree = []
if len(maximumIndexLs) == len(salientWords): # check to make sure we can iterate over these together
    for index, doc in enumerate(salientWords):
        #print("Sentence # %d: "%(index))
        indexLs = maximumIndexLs[index]
        if len(indexLs) ==3: 
            sentenceNum.append((index + 1))
            for j in range(3):
                tempWord = doc[(indexLs[j])]
                #print("Word %d: %s"%((j+1), tempWord))
                if j == 0:
                    wordOne.append(tempWord)
                elif j == 1:
                    wordTwo.append(tempWord)
                elif j == 2:
                    wordThree.append(tempWord)
        else:
            #print("Wrong number of maximums")
            someValue = 0
else:
    print("Not same length.")

In [49]:
topThreeWords = pd.DataFrame(zip(sentenceNum,wordOne,wordTwo,wordThree), columns = ["sentenceNum", "wordOne", "wordTwo", "wordThree"])

In [48]:
df

Unnamed: 0,sentenceNum,wordOne,wordTwo,wordThree
0,1,denver,denver,denver
1,2,starte,day,day
2,3,guess,catching,beings
3,4,please,kurt,vonnegut
4,5,pneumonia,story,story
...,...,...,...,...
2936,3096,could,could,compromise
2937,3097,ostracized,majority,compassion
2938,3098,receive,unshaved,brushing
2939,3100,lonesome,city,city
