# Keyword Extraction with TD-IDF 
# Processing with spaCy
***

## Setup
  * Install packages
  * Mount google drive

### Install Packages
* PyPDF2
* spaCyPDFReader

In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
[?25l[K     |████▎                           | 10 kB 16.5 MB/s eta 0:00:01[K     |████████▌                       | 20 kB 10.5 MB/s eta 0:00:01[K     |████████████▊                   | 30 kB 8.4 MB/s eta 0:00:01[K     |█████████████████               | 40 kB 7.5 MB/s eta 0:00:01[K     |█████████████████████▏          | 51 kB 5.8 MB/s eta 0:00:01[K     |█████████████████████████▍      | 61 kB 5.7 MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████████| 77 kB 2.6 MB/s 
[?25hBuilding wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py) ... [?25l[?25hdone
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61101 sha256=da0633add06732c8b5452fd6a00593db758c3eb3fda8ac0881aa7aaef56188f9
  Stored in directory: /root/.cache/pip/wheels/80/1a/24/648467ade3a77ed20f35cfd2badd32134e96dd25ca811e64b3
Successful

In [2]:
!pip install spacypdfreader

[31mERROR: Could not find a version that satisfies the requirement spacypdfreader (from versions: none)[0m
[31mERROR: No matching distribution found for spacypdfreader[0m



### Mount Google Drive
* Files from two folders: Memorandums, Resolutions


In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


*** 
# Functions

1.  Processing Text: load pdf text and create tokens
    * getText(pathToFile) 
    * preprocessText(text)
    * getStopWords( )
    * getParser( )
    * processText(sentence, stopWords, parser)
    * getSentences(text)
    * getTokens(sentences)

3.  TF-IDF: compute scores and get keywords
    * get_tf_idf(tokens)
    * getKeywords(pathToFile)
  
4.  Test: compare code's keywords with correct keywords
    * getCorrectKeywords(pathToFile)
    * testFilename(pathToFile)



### 1. Processing Text

Load text from PDF

In [1]:
"""
Get raw text from pdf file as string using PyPDF2
"""
def getText(pathToFile: str) -> str:
    # Load pdf file
    pdfFile = open(pathToFile, 'rb')
    PDF_Reader = PyPDF2.PdfFileReader(pdfFile)

    # Get total number of pages in document
    numPages = PDF_Reader.getNumPages()
    #print(f"There are {numPages} pages in the file.\n")

    # Combine text from all pages into one string
    text = ""
    for pg_number in range(numPages):
      page = PDF_Reader.getPage(pg_number)
      page_text = page.extractText()
      text += page_text
    
    return text

In [2]:
"""
Get text from pdf as a spacy doc object using spacypdfreader
(spacypdfreader was not able to get installed)
"""

'\nGet text from pdf as a spacy doc object using spacypdfreader\n(spacypdfreader was not able to get installed)\n'

Process Text

In [3]:
"""
Preprocess text by replacing newline with a space
"""
def preprocessText(text: str) -> str:
    # lower case, and remove newline
    preprocessedText = text.replace("\n", " ")
    return preprocessedText

In [4]:
""" 
Get list of stop words from spacy
"""
def getStopWords():
  stopWords = spacy.lang.en.stop_words.STOP_WORDS
  return stopWords

In [5]:
"""
Create spacy Language object to parse English text
"""
def getParser():
    parser = English()
    return parser

In [6]:
"""
Processing text: lemmatize, remove stop words, make lowercase
Input:
    sentence: str, 
    stopWords: set, 
    parser: spacy.lang.en.English
"""

def processText(sentence, stopWords, parser) -> list:
    
    # Create token object 
    tokens = parser(sentence)
    
    # lemmatize each token and make them lower case
    tokens_lemmatized = [word.lemma_ for word in tokens]
    tokens_lowercase = [(word.lower().strip()) for word in tokens_lemmatized]
    
    # Removing stop words and any punctuation or numeric strings
    list_tokens = []
    for word in tokens_lowercase:
        if word not in stopWords and word.isalpha():
            list_tokens.append(word)
    
    # Remove single letters
    list_tokens = [word for word in list_tokens if len(word)>1]
    
    # Return preprocessed list of tokens
    return list_tokens  

In [7]:
"""
Get sentences from pdf's preprocessed text using spacy's trained pipeline
"""
def getSentences(text: str) -> list:
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = list(doc.sents)
    sentences = [sent.string.strip() for sent in sentences]
    return sentences

In [8]:
"""
Break sentences into words by processing them
"""
def getTokens(sentences: list) -> list:
    stopWords = getStopWords()
    parser = getParser()
    # process sentences to get words
    tokens = []
    for sentence in sentences:
        current_tokens = processText(sentence, stopWords, parser)
        tokens += current_tokens
    # return words with alphabet only
    return tokens

### 2. TF-IDF

In [9]:
"""
Run tf-idf algorithm on the list of tokens, and
return a dataframe with tokens and scores
"""
def get_TF_IDF(tokens: list):
    
    # compute TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(tokens)

    tf_idf = list(tfidf_vectorizer.idf_) #scores
    features = list(tfidf_vectorizer.get_feature_names()) #words/tokens

    # store results in dataframe
    scores_df = pd.DataFrame(list(zip(features, tf_idf)), 
                             columns=['Keywords', 'TF-IDF'])
    scores_df = scores_df.sort_values('TF-IDF').reset_index(drop=True)
    
    return scores_df

In [10]:
"""
Combine all functions to get keywords dataframe from just path to file
"""
def getKeywords(pathToFile: str): 
  rawText = getText(pathToFile=pathToFile)
  preprocessedText = preprocessText(text = rawText)
  sentences = getSentences(text = preprocessedText)
  tokens = getTokens(sentences = sentences)

  # get dataframe with keywords and scores
  keywords_df = get_TF_IDF(tokens)

  return keywords_df

### 3. Test 

In [11]:
"""
Extract words from filename, 
which are separated by an underscore _
"""
def getCorrectKeywords(pathToFile: str) -> list:
    filename = pathToFile.split('/')[-1]
    correctKeywords = filename.replace(".pdf", "").split('_')
    correctKeywords = [word.lower() for word in correctKeywords]
    return correctKeywords

In [12]:
"""
Test one file
Compare correct keywords with the top 10 keywords computed with tf-idf
"""
def testFilename(pathToFile: str) -> float:
    # get top 10 keywords using tf-idf
    tf_idf_keywords = getKeywords(pathToFile)['Keywords'].to_list()[:10]

    # get actual keywords from file name
    correctKeywords = getCorrectKeywords(pathToFile)

    numCorrectWordsFound = 0
    for keyword in correctKeywords:
        if keyword in tf_idf_keywords:
            #print(f"{keyword} was found.")
            numCorrectWordsFound += 1
            
        #else:
            #print(f"The word '{keyword}' was not found in tf-idf keywords.")
    
    correctPercentage = round((numCorrectWordsFound/len(correctKeywords))*100, 2)
    print(f"{correctPercentage}% of keywords were found.")

    return correctPercentage

*** 
# Run Tests
  1. Load libraries
  2. Test 1 File
  3. Test All Files

In [13]:
import os
import string
import pandas as pd

import PyPDF2
import spacy
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Test One File

Memorandum

In [14]:
# Run test for one file
pathBeforeFile = "/content/gdrive/My Drive/CFSJ/Memorandums/"
filename = "Historic_Landmarks_Designation_Property.pdf"
pathToFile = pathBeforeFile + filename

# Test
fileResult = testFilename(pathToFile=pathToFile)

# Top 10 keywords
getKeywords(pathToFile).head(10)

75.0% of keywords were found.


Unnamed: 0,Keywords,TF-IDF
0,commission,4.258097
1,city,4.258097
2,august,4.258097
3,director,4.545779
4,resolution,4.545779
5,property,4.545779
6,chris,4.545779
7,burton,4.545779
8,landmarks,4.545779
9,historic,4.545779


Resolution

In [15]:
# Run test for one file
pathBeforeFile = "/content/gdrive/My Drive/CFSJ/Resolutions/"
filename = "Fire_Department_Exam_Free_Use_Hall.pdf"
pathToFile = pathBeforeFile + filename

# Test
fileResult = testFilename(pathToFile=pathToFile)

# Top 10 keywords
getKeywords(pathToFile).head(10)

50.0% of keywords were found.


Unnamed: 0,Keywords,TF-IDF
0,city,3.429477
1,fire,4.183249
2,council,4.31678
3,ﬁfree,4.470931
4,useﬂ,4.470931
5,saturday,4.470931
6,department,4.470931
7,resolution,4.653252
8,august,4.653252
9,hall,4.653252


### Test All Files

Memorandums

In [16]:
pathBeforeFile = "/content/gdrive/My Drive/CFSJ/Memorandums/"

print("Starting Tests.")
print("-"*50)

for i,filename in enumerate(os.listdir(pathBeforeFile)):
    pathToFile = pathBeforeFile + filename
    print(f"Test {i+1}: {filename}")
    result = testFilename(pathToFile=pathToFile)
    print("\n" + "-"*50)

print("Tests completed.")

Starting Tests.
--------------------------------------------------
Test 1: Downtown_Rezone_Addendum_Environmental.pdf
25.0% of keywords were found.

--------------------------------------------------
Test 2: Chief_Police_Questions_Policy_Selection.pdf
80.0% of keywords were found.

--------------------------------------------------
Test 3: Dumpster_Day_Brooktree_Vinci_Flickinger.pdf
40.0% of keywords were found.

--------------------------------------------------
Test 4: Juneteenth_Holiday.pdf
100.0% of keywords were found.

--------------------------------------------------
Test 5: Marriott_Townplace_Suites_Hotel_Vesting_Development_Permit.pdf
14.29% of keywords were found.

--------------------------------------------------
Test 6: Demolition_Permit_Site_Development_Construction_Building.pdf
33.33% of keywords were found.

--------------------------------------------------
Test 7: Audit_Peer_Review.pdf
100.0% of keywords were found.

--------------------------------------------------

Resolutions

* Was not able to get text from "San_Carlos_Environmental_Mixed_Use.pdf" (or SanJose16.pdf) with PyPDF2
* just shows many newline chars

In [17]:
pathBeforeFile = "/content/gdrive/My Drive/CFSJ/Resolutions/"

print("Starting Tests.")
print("-"*50)

for i,filename in enumerate(os.listdir(pathBeforeFile)):
    pathToFile = pathBeforeFile + filename
    print(f"Test {i+1}: {filename}")
    result = testFilename(pathToFile=pathToFile)
    print("\n" + "-"*50)

print("Tests completed.")

Starting Tests.
--------------------------------------------------
Test 1: Financing_Commercial.pdf
50.0% of keywords were found.

--------------------------------------------------
Test 2: Fire_Department_Exam_Free_Use_Hall.pdf
50.0% of keywords were found.

--------------------------------------------------
Test 3: Vacate_Almaden_Property_Surplus.pdf
25.0% of keywords were found.

--------------------------------------------------
Tests completed.
