In [3]:
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import random

In [4]:
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from collections import Counter
from spellchecker import SpellChecker

In [5]:
import functions as van

In [6]:
# Load the pre-processed corpus. The rest of the notebook iterates it as a
# list of sentences (documents), each of which is a list of word tokens.
# NOTE(review): van.readData comes from the local `functions` module; its exact
# return type is not visible here -- confirm against that module.
salientWords = van.readData("salientWords.txt")

# Sanity check: the corpus is expected to hold exactly 3101 sentences.
len(salientWords) == 3101

False

In [7]:
# The loaded list is one item longer than the 3101 expected sentences: a stray
# [''] sits at the very end. Remove it only when it is actually there, so that
# re-running this cell is harmless (a fixed pop(3101) raises IndexError on the
# second run and would silently drop a real sentence if the data ever changed).
strayItem = salientWords.pop() if salientWords and salientWords[-1] == [''] else None
strayItem  # shows the removed item; displays nothing when there was nothing to remove

['']

In [8]:
# Re-check the length now that the stray trailing item has been removed;
# the expected value is 3101 sentences.
len(salientWords) == 3101

True

# TFIDF

In [9]:
# Build an inverted index over the corpus, then derive document frequencies.
#
# documentIndex:     unique word -> set of indices of the sentences (documents)
#                    that contain it. Because the values are sets, a word that
#                    is repeated inside one sentence is still recorded once.
# documentFrequency: unique word -> number of sentences that contain it
#                    (NOT the total number of times the word is used).
documentIndex = {}
for sentIndex, sent in enumerate(salientWords):
    for word in sent:
        # setdefault creates the empty set the first time a word is seen and
        # returns the existing one afterwards -- no exception handling needed,
        # so genuine errors are no longer swallowed by a bare `except`.
        documentIndex.setdefault(word, set()).add(sentIndex)

documentFrequency = {
    word: len(sentIndices) for word, sentIndices in documentIndex.items()
}

In [10]:
# Corpus size: every sentence counts as one document.
totalDocuments = len(salientWords)
print("There are %d total documents/sentences in the book"%(totalDocuments))

# Corpus size in tokens: add up the length of every sentence.
wordLen = sum(len(sent) for sent in salientWords)
print("After the data was cleaned and processed there were %d total words remaining"%(wordLen))

There are 3101 total documents/sentences in the book
After the data was cleaned and processed there were 30970 total words remaining


In [11]:
# Per-document statistics. Each of the three lists below ends up parallel to
# salientWords: one inner list per sentence, one value per token occurrence.
termFrequencyLs = []        # TF:     occurrences of the word in this sentence / sentence length
inverseDocFrequencyLs = []  # IDF:    log(total sentences / sentences containing the word)
tfidfLs = []                # TF-IDF: TF * IDF

for sent in salientWords:
    docLength = len(sent)   # number of tokens in this sentence
    counts = Counter(sent)  # word -> occurrences within this sentence

    docTermFrequency = [counts[word] / docLength for word in sent]
    docInverseDocFrequency = [
        math.log((totalDocuments/(documentFrequency[word]))) for word in sent
    ]
    docTfidf = [
        tf * idf for tf, idf in zip(docTermFrequency, docInverseDocFrequency)
    ]

    # Store this sentence's values in the corpus-wide lists.
    termFrequencyLs.append(docTermFrequency)
    inverseDocFrequencyLs.append(docInverseDocFrequency)
    tfidfLs.append(docTfidf)

In [12]:
# For every sentence (document), pick the three tokens with the highest TF-IDF
# score -- three words turn out to carry a lot of a sentence's meaning.
# tfidfLs holds one score per token *occurrence*, so a word repeated within a
# sentence can fill more than one of the three slots.
wordOne = []
wordTwo = []
wordThree = []
wordColumns = (wordOne, wordTwo, wordThree)
n = len(wordColumns)  # number of top-scoring tokens kept per sentence (3)

# How the ranking works: range(len(scores)) is the list of token positions
# 0..len-1; sorted(..., key=...) orders those positions by the score found at
# each position, lowest first; [-n:] keeps the last n of them, i.e. the
# positions of the n highest scores in ascending order (best score last).
# A sentence with fewer than n tokens simply yields fewer positions.
maximumIndexLs = [
    sorted(range(len(scores)), key=lambda position: scores[position])[-n:]
    for scores in tfidfLs
]

sentenceNum = []
if len(maximumIndexLs) == len(salientWords): # check to make sure we can iterate over these together
    for index, (doc, indexLs) in enumerate(zip(salientWords, maximumIndexLs)):
        if len(indexLs) != n:
            continue  # too few tokens for a full top-n; leave this sentence out
        sentenceNum.append(index + 1)  # 1-based sentence number
        # wordOne gets the lowest of the three scores, wordThree the highest.
        for column, position in zip(wordColumns, indexLs):
            column.append(doc[position])
else:
    print("Not same length.")

In [13]:
# Assemble one row per kept sentence: its 1-based number and its three
# top-scoring words, then show the resulting table.
topThreeColumns = ["sentenceNum", "wordOne", "wordTwo", "wordThree"]
topThreeRows = zip(sentenceNum, wordOne, wordTwo, wordThree)
topThreeWords = pd.DataFrame(topThreeRows, columns = topThreeColumns)
topThreeWords



Unnamed: 0,sentenceNum,wordOne,wordTwo,wordThree
0,1,denver,denver,denver
1,2,starte,day,day
2,3,guess,catching,beings
3,4,please,kurt,vonnegut
4,5,pneumonia,story,story
...,...,...,...,...
2936,3096,could,could,compromise
2937,3097,ostracized,majority,compassion
2938,3098,receive,unshaved,brushing
2939,3100,lonesome,city,city


In [14]:
# Draw 10 reproducible random row positions into the top-three-words table
# (sentenceNum has one entry per row of that table).
# random.randrange(k) returns 0..k-1, so every pointer is a valid row position;
# random.randint(0, k) would also allow k itself, which is one past the last
# row and selects an empty sample.
random.seed(120)
pointers = [random.randrange(len(sentenceNum)) for _ in range(10)]

print(pointers)

[2109, 1015, 772, 2602, 2415, 2748, 1894, 373, 1290, 876]


In [15]:
# Take a window of `length` consecutive rows starting at each pointer, keep
# every window in `sets`, and export each one to its own CSV file.
length = 3
sets = [topThreeWords[ptr:ptr + length] for ptr in pointers]

for index, tempDf in enumerate(sets):
    tempDf.to_csv(f"randomSamples/randomSet{index}.csv")

sets[8] # this one is interesting

Unnamed: 0,sentenceNum,wordOne,wordTwo,wordThree
1290,1367,outdoorsey,carharrts,khuls
1291,1368,instead,bozeman,vibe
1292,1369,david,office,inform


In [16]:
def getOriginalSentences(lsOfSent, lsOfPointers, lengthOfSet):
    """Print the original sentences behind each sampled window.

    Parameters
    ----------
    lsOfSent : list of str
        The sentences, in document order.
    lsOfPointers : iterable of int
        Start position of each window within ``lsOfSent``.
    lengthOfSet : int
        Number of consecutive sentences per window.

    Returns
    -------
    None
        Each window is printed with one sentence per line, followed by a
        blank line.
    """
    for ptr in lsOfPointers:
        cap = ptr + lengthOfSet # cap is the (exclusive) end of lsOfSent[ptr:cap]
        print("\n".join(lsOfSent[ptr:cap]))
        print("")
    

# Word Frequency

In [17]:
# Number of sentences that contain "van" at least once. This is a document
# frequency, so it can be lower than the total number of occurrences.
documentFrequency["van"]

156

In [18]:
def uniqueDf(uniqueArr):
    """Turn the unique-token array into a DataFrame sorted by frequency.

    Parameters
    ----------
    uniqueArr : array-like with three columns
        One row per unique token: token, index of its first occurrence, count.

    Returns
    -------
    pandas.DataFrame
        Columns ``token``, ``indexFirstTokenOccur`` and ``frequency``, ordered
        from most to least frequent. The original row labels are kept.
    """
    columnNames = ['token', 'indexFirstTokenOccur', 'frequency']
    frame = pd.DataFrame(uniqueArr, columns=columnNames)
    # The counts can arrive as strings; cast them so the sort is numeric rather
    # than lexicographic (as text, "1203" would sort before "99").
    frame['frequency'] = frame['frequency'].astype(int)
    return frame.sort_values(by='frequency', ascending=False)

In [19]:
def uniqueArr(lsOfAllWords):
    """Summarise a flat list of words as one row per unique token.

    Parameters
    ----------
    lsOfAllWords : sequence of str
        Every word of the corpus, in reading order.

    Returns
    -------
    numpy.ndarray
        Shape ``(number of unique tokens, 3)``; the columns are the token, the
        index of its first occurrence in ``lsOfAllWords``, and its count. Rows
        follow the sorted order produced by ``np.unique``. All three columns
        share one array, so NumPy stores them under a single common dtype.
    """
    tokens, firstPositions, tokenCounts = np.unique(
        lsOfAllWords, return_index=True, return_counts=True
    )
    # Stack the three result vectors as rows, then transpose so that each
    # unique token gets its own row.
    stacked = np.asarray((tokens, firstPositions, tokenCounts))
    return stacked.T

In [20]:
# The dictionaries built earlier already cover every word, but a single flat
# list of all tokens (in reading order) is easier to work with from here on.
allSalientWords = [word for sent in salientWords for word in sent]

# "van" is present in the flat list, so if it goes missing, that happens
# somewhere in the next two cells.
"van" in allSalientWords

True

In [21]:
# Total number of times "van" occurs in the flat word list.
jackeroo = allSalientWords.count("van")

print(jackeroo)
# The count matches the one seen in the other document, so whatever loses the
# word must happen AFTER this cell -- the POS-tagging step is the prime suspect.

163


In [29]:
# Summarise the flat word list as one row per unique token (token, index of
# first occurrence, frequency), then wrap it in a DataFrame sorted by
# frequency, most frequent first.
allSalientWordsArr = uniqueArr(allSalientWords)
allSalientWordsDf = uniqueDf(allSalientWordsArr)
allSalientWordsDf.head() # left untouched like this, the counts come out right ("van" included)

Unnamed: 0,token,indexFirstTokenOccur,frequency
3211,like,376,211
5866,van,280,163
429,back,262,159
4668,said,29,158
5592,time,64,157


In [75]:
# Transposing puts each column of the unique-token array into its own row,
# so altAxis[0] is the vector of unique tokens.
altAxis = allSalientWordsArr.transpose()

# Tag every unique token with its part of speech; pos_tag yields
# (token, tag) pairs, of which only the tag is kept.
posTuples = nltk.pos_tag(altAxis[0])
posLs = [pos for _token, pos in posTuples]

In [83]:
# Confirm the layout: (number of unique tokens, 3 columns).
allSalientWordsArr.shape

(6190, 3)

In [89]:
# Wrapping posLs in an outer list makes the result 2-D with a single row,
# shape (1, len(posLs)), so that its transpose can serve as a column.
posArr = np.array([posLs])
posArr

array([['CD', 'CD', 'CD', ..., 'VBD', 'CD', 'NN']], dtype='<U4')

In [94]:
# Attach the POS tags as a fourth column, but only when there is exactly one
# tag per unique-token row; otherwise report the mismatch.
if len(allSalientWordsArr) != len(posLs):
    print("no")
else:
    final = np.hstack((allSalientWordsArr, posArr.T))

In [96]:
# Inspect rows 60-199 of the combined array; each row is
# [token, index of first occurrence, frequency, POS tag].
final[60:200]

array([['3am', '17796', '1', 'CD'],
       ['4', '580', '12', 'CD'],
       ['40', '19730', '4', 'CD'],
       ['400lb', '20118', '1', 'CD'],
       ['45', '2741', '1', 'CD'],
       ['45am', '30804', '1', 'CD'],
       ['4am', '4270', '3', 'CD'],
       ['4runner', '2414', '3', 'CD'],
       ['4wd', '2386', '1', 'CD'],
       ['4x4s', '22880', '1', 'CD'],
       ['5', '2454', '9', 'CD'],
       ['50', '212', '9', 'CD'],
       ['500', '15703', '4', 'CD'],
       ['50th', '7088', '2', 'JJ'],
       ['5200', '215', '2', 'CD'],
       ['55', '9680', '3', 'CD'],
       ['5am', '28287', '1', 'CD'],
       ['5ers', '18782', '2', 'NNS'],
       ['5th', '2840', '1', 'CD'],
       ['6', '4971', '5', 'CD'],
       ['60', '6031', '3', 'CD'],
       ['60lbs', '13880', '1', 'CD'],
       ['625', '26415', '1', 'CD'],
       ['65', '4867', '1', 'CD'],
       ['6am', '17421', '1', 'CD'],
       ['6inches', '30103', '1', 'CD'],
       ['7', '20058', '2', 'CD'],
       ['70', '654', '8', 'CD'],
       

In [191]:
for

array([1, 2, 1, ..., 3, 1, 1])

In [39]:
# Number of rows in the unique-token array (one row per unique token).
len(allSalientWordsArr)

6190

In [40]:
# Number of POS tags collected; this must equal len(allSalientWordsArr) for the
# tags to be attached as a column.
# NOTE(review): the saved output of this cell disagrees with the row count, and
# its execution number is lower than that of the tagging cell -- presumably it
# was run against an earlier version of posLs. Re-run top to bottom to confirm.
len(posLs)

3