In [1]:
import nltk
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from collections import Counter
from spellchecker import SpellChecker

In [3]:
def openAndRead(infilePathAndName):
    """
    opens a file and returns a long long long string
    
    :infilePathAndName: string of the path/path/txtFileName.txt from the current directory
    
    :returns
    
    
    """
    with open(infilePathAndName, 'r') as file:
        string = file.read()
        
    return(string)

In [4]:
def fixDashes(string):
    """
    Replaces the dumb & incorrect way I used to write em dashes (followed by a space) and replaces it with the 'correct' way.
    
    """
    
    newString = string.replace('-- ', '--')
    
    return(newString)

In [5]:
def fixDashesTwo(string):
    """
    Replaces the dumb & incorrect way I used to write em dashes (followed by a space) and replaces it with the 'correct' way.
    
    """
    
    newString = string.replace('—', '')
    
    return(newString)

In [35]:
def tokenizeToSent(newString):
    """
    Uses nltk sent_tokenize to break up the string into a list of strings, one sentence per string.
    
    """
    lsOfSent = sent_tokenize(newString)
    
    return(lsOfSent)

In [None]:
def tokenizeToWord(lsOfSent):
    """
    Uses re and nltk to tokenize each string (sentence) in a list into a list of tokens (words).
    
    """
    
    pattern = r"\w+|\d+" # grabs words and numbers; removes punctuation
    lsOfLsOfToken = [regexp_tokenize(token, pattern) for token in lsOfSent]
    
    return(lsOfLsOfToken)

In [6]:
def cleanAndPreProcessWord(lsOfToken):
    """
    (1) turns all of the tokens in a ls lowercase
    (2) removes all 'stopwords' in a ls leaving the more semantically valuble words
    
    """
    lsOfTokensLow = [token.lower() for token in lsOfLsOfToken]
    lsOfTokensNoStops = [token for token in lsOfTokensLow if token not in stopwords.words('english')]
    
    return(lsOfTokensNoStops)

In [36]:
def cleanAndPreProcessSent(lsOfLsOfToken):
    
    newLsofLs = [cleanAndPreprocessWord(token) for token in lsOfLsOfToken]
    return(newLsofLs)

In [34]:
def removeEmptyData(lsOfTokensNoStops):
    """
    Removes empty lists from a set of lists and returns the new, cleaned out list.
    
    """
    newLs = [x for x in lsOfLs if x != []]
    
    return(newLs)

In [15]:
def readData(infilePathAndName):
    """
    Takes in the preprocessed/precleaned wordset that I wrote and returns a list of lists
    containing the words tokenized into sentences (lists) and words (elements of lists)
    
    
    """
    
    with open(infilePathAndName, "r") as f:
        string = f.read()
        ls1 = string.split("\n")
        finalLs = []
        for index, string in enumerate(ls1):
            subLs = string.split(" ")
            finalLs.append(subLs)
            
    return(finalLs)
    

In [28]:
salientWords = readData("salientWords.txt")

# check to see if doc is same after reading in data
len(salientWords)

3102

In [29]:
# document is not same length. extra item was thrown in. pop this item to remove it from list
salientWords.pop(3101)

['']

In [33]:
# check len again
len(salientWords)

3101