# Convert PDF to Tokens
This notebook contains the functions to get raw text from PDFs, process the text and tokenize the text.
***

### Functions

1.  Extract Raw Text
    * getText_PyPDF2(pathToFile)
    * getText(pathToFile)  
    * savePDFAsTxt(text, filename, pathToTxtFilesFolder)
    * preprocessText(text)

2.  Process Text
    * lemmatizeText(text, nlp)
    * get_nlp( )  
    * getStopWords(pathToCommonWordsCsv, moreWords)
    * processText(sentence, stopWords, nlp)

3.  Tokenize Text
    * getSentences(text, nlp)
    * getTokens(sentences, nlp, pathToCommonWordsCsv)


***

#### 1. Get Raw Text

Extract text from PDF using pdfplumber or PyPDF2.

In [None]:
"""
Get raw text from pdf file as string using PyPDF2
"""
def getText_PyPDF2(pathToFile: str) -> str:
    """
    Get raw text from a pdf file as one string using PyPDF2.

    Input:
        pathToFile: str, path to the pdf file
    Output:
        str, the extracted text of every page, concatenated in page order
    """
    # Open through a context manager so the file handle is released even
    # if PyPDF2 raises while parsing. Pages are extracted inside the block
    # because the reader pulls its data from the still-open stream.
    with open(pathToFile, 'rb') as pdfFile:
        PDF_Reader = PyPDF2.PdfFileReader(pdfFile)

        # Get total number of pages in document
        numPages = PDF_Reader.getNumPages()

        # Collect the text of each page in order
        pagesText = [PDF_Reader.getPage(pg_number).extractText()
                     for pg_number in range(numPages)]

    # Combine text from all pages into one string
    return "".join(pagesText)

In [None]:
"""
Extract raw text from pdf using pdfplumber

Problems with PyPDF2:
  - only extracted 2 of 13 pages for a file
  - added many newline chars where they don't exist in the original 

Improvements with pdfplumber:
  - able to extract ALL text, including header, footer, image captions
  - keeps general format of original pdf, just makes it all left-aligned
  - all words extracted are as they appear in pdf 
    (significantly reduced the "fake" words)

Initially found here: 
https://towardsdatascience.com/how-to-extract-text-from-pdf-245482a96de7
"""
def getText(pathToFile: str) -> str:
    """
    Extract raw text from a pdf file as one string using pdfplumber.

    Input:
        pathToFile: str, path to the pdf file
    Output:
        str, the extracted text of every page, concatenated in page order
        (no separator is inserted between pages)
    """
    # The context manager closes the pdf (and its underlying file handle)
    # once all pages have been read, even if extraction raises.
    with pdfplumber.open(pathToFile) as pdfFile:
        # extract_text() may return None for a page without extractable
        # text (e.g. a scanned image); treat that as an empty page instead
        # of failing on str + None.
        pagesText = [pageObject.extract_text() or ""
                     for pageObject in pdfFile.pages]

    return "".join(pagesText)

In [None]:
"""
Save extracted text into .txt file,
using the same name (e.g. SanJose1.pdf => SanJose1.txt)
"""
def savePDFAsTxt(text, filename, pathToTxtFilesFolder):
    """
    Save extracted text into a .txt file named after the pdf
    (e.g. SanJose1.pdf => SanJose1.txt).

    Input:
        text: str, text to save (leading/trailing whitespace is stripped)
        filename: str, name of the source pdf file
        pathToTxtFilesFolder: str, folder that receives the .txt file,
            with or without a trailing path separator
    Output:
        str, path to the .txt file that was written
    """
    import os  # local import so this cell does not depend on another cell

    # Create txt file name; only drop the extension when it really is .pdf
    # so other names are not truncated by four characters
    if filename.lower().endswith(".pdf"):
        name = filename[:-4]
    else:
        name = filename
    txtFilename = name + ".txt"

    # os.path.join inserts the separator only when the folder lacks one
    pathToCurrentTxtFile = os.path.join(pathToTxtFilesFolder, txtFilename)

    # Write the pdf's text; the context manager flushes and closes the file
    with open(pathToCurrentTxtFile, 'w') as txtFile:
        txtFile.write(text.strip())

    print(f"Saved as {txtFilename}")
    return pathToCurrentTxtFile

In [None]:
"""
Preprocess text by replacing newline with a space
"""
def preprocessText(text: str) -> str:
    """
    Flatten text onto a single line.

    Every newline character becomes one space; nothing else is changed
    (in particular, letter case is left as it is).
    """
    # Cut at each newline, then glue the pieces back together with spaces
    pieces = text.split("\n")
    return " ".join(pieces)

***
#### 2. Process Text

Remove stop words and lemmatize text.

In [None]:
"""
Lemmatize string with text
"""
def lemmatizeText(text, nlp) -> list:
    """
    Lemmatize a string of text.

    Input:
        text: str, text to lemmatize
        nlp: callable that parses text into tokens exposing `lemma_`
             (e.g. the spaCy Language object from get_nlp)
    Output:
        list of str, the lemma of every token, in token order
    """
    return [token.lemma_ for token in nlp(text)]

In [None]:
"""
Create spacy Language object to parse English text
"""
def get_nlp():
    """
    Create spacy Language object to parse English text.

    Output:
        the loaded spaCy pipeline
    Raises:
        OSError if neither the 'en' shortcut nor 'en_core_web_sm'
        can be loaded
    """
    try:
        # 'en' is a spaCy v2 shortcut link to an installed English model
        return spacy.load('en')
    except OSError:
        # spaCy v3 removed shortcut links, so loading 'en' fails there;
        # fall back to the full package name of the small English model
        return spacy.load('en_core_web_sm')

In [None]:
""" 
Get list of stop words from spacy, 
optionally can add extra words
"""
def getStopWords(pathToCommonWordsCsv, moreWords=None) -> list:
    """
    Get list of stop words from spacy, optionally extended with extra words.

    Input:
        pathToCommonWordsCsv: str, path handed to getCommonWords
        moreWords: optional iterable of str, extra stop words to include
    Output:
        list of str: spaCy's English stop words, then moreWords, then up
        to 15 entries from getCommonWords, then a fixed list of
        domain-specific words
    """
    stopWords = spacy.lang.en.stop_words.STOP_WORDS  # set
    stopWordsList = list(stopWords)

    # None (not a shared mutable []) is the "no extra words" default
    if moreWords:
        stopWordsList += list(moreWords)

    # This uses CommonWords.csv
    # NOTE(review): presumably getCommonWords returns the words ordered
    # most common first, so the slice is the "top 15" - confirm there.
    commonWords = getCommonWords(pathToCommonWordsCsv)
    if len(commonWords) > 0:
        stopWordsList += commonWords[:15]  # top 15 common words

    # Words that are filtered out regardless of the inputs above
    stopWordsList += ['san', 'jose', 'josé', 'city', 'council',
                      'meeting', 'resolution', 'memorandum',
                      'event', 'file', 'document', 'agenda',
                      'draft', 'contact', 'office', 'resource',
                      'clerk', 'final', 'california',]

    return stopWordsList

In [None]:
"""
Processing text: lemmatize, remove stop words, make lowercase
Input:
    sentence: str, 
    stopWords: set, 
    nlp: spacy.lang.en.English

Note: when the same word appeared in a sentence (e.g. Fees and fees),
spaCy only lemmatized fees to 'fee', but did not lemmatize Fees.
Some other times, it did lemmatize (e.g. Authorizes and authorizes).
"""
def processText(sentence, stopWords, nlp) -> list:
    """
    Process a sentence: lemmatize, make lowercase, remove stop words.

    Input:
        sentence: str
        stopWords: collection of str supporting `in` (a set is fastest)
        nlp: spaCy Language object (or any callable accepted by
             lemmatizeText)
    Output:
        list of str: lowercase lemmas that are not stop words, consist of
        letters only, and are longer than one character
    """
    # Lemmatize 2 times.
    # First pass: lemmatize the entire sentence, then lowercase each lemma.
    tokens_lowercase = [word.lower() for word in lemmatizeText(sentence, nlp)]

    list_tokens = []
    for word in tokens_lowercase:
        # Second pass: lemmatize each lowercased word on its own, because
        # the sentence-level pass can miss capitalized forms.
        relemmatized = lemmatizeText(word, nlp)
        # Re-parsing can yield no tokens (e.g. for an empty lemma); keep
        # the word itself instead of raising IndexError. Only the first
        # lemma is used when the word re-parses into several tokens.
        lemma = relemmatized[0] if relemmatized else word

        # Drop stop words, punctuation / numeric strings and single letters
        if lemma not in stopWords and lemma.isalpha() and len(lemma) > 1:
            list_tokens.append(lemma)

    # Return preprocessed list of tokens
    return list_tokens

***
#### 3. Tokenize Text

Break text into sentences and process further by breaking it into words.

In [None]:
"""
Get sentences from pdf's preprocessed text using spacy's trained pipeline
"""
def getSentences(text: str, nlp) -> list:
    """
    Get sentences from pdf's preprocessed text using spacy's trained pipeline.

    Input:
        text: str, text to split
        nlp: spaCy Language object; the parsed result must expose `sents`
    Output:
        list of str, one entry per detected sentence with surrounding
        whitespace stripped
    """
    doc = nlp(text)
    # Use .text rather than .string: Span.string no longer exists in
    # spaCy v3, and after strip() both give the same result.
    return [sent.text.strip() for sent in doc.sents]

In [None]:
"""
Break sentences into words by processing them
"""
def getTokens(sentences, nlp, pathToCommonWordsCsv="") -> list:
    """
    Break sentences into words by processing them.

    Input:
        sentences: iterable of str
        nlp: spaCy Language object passed on to processText
        pathToCommonWordsCsv: str, path passed on to getStopWords
    Output:
        list of str, the tokens of all sentences in order, as returned
        by processText
    """
    # Build the stop words once, as a set: processText tests membership
    # for every token, which is a linear scan on a list.
    stopWords = set(getStopWords(pathToCommonWordsCsv))

    # Process sentences to get words
    tokens = []
    for sentence in sentences:
        tokens.extend(processText(sentence, stopWords, nlp))

    return tokens