# Entities with spaCy

***
# Setup

### Install Packages
  * pdfplumber

In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.5.28.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 1.7 MB/s 
[?25hCollecting pdfminer.six==20200517
  Downloading pdfminer.six-20200517-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 10.0 MB/s 
Collecting Wand
  Downloading Wand-0.6.7-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 33.7 MB/s 
Collecting pycryptodome
  Downloading pycryptodome-3.10.1-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 47.7 MB/s 
[?25hBuilding wheels for collected packages: pdfplumber
  Building wheel for pdfplumber (setup.py) ... [?25l[?25hdone
  Created wheel for pdfplumber: filename=pdfplumber-0.5.28-py3-none-any.whl size=32240 sha256=fa1bfa221e68dec602a3c302cdbaaa32953e8d65ff635307f611b0554277ec84
  Stored in directory: /root/.cache/pip/wheels/f2/b1/a0/c0a77b756d580f53b3806ae0e0b3ec945a8d05fca1d6e10cc1
Successfully built pdfplumb

### Mount Google Drive

In [None]:
from google.colab import drive 

drive.mount('/content/gdrive')

Mounted at /content/gdrive


***
# Functions

### Get Raw Text

In [None]:
"""
Extract text from PDF file as a string
"""
def getText(pathToFile: str) -> str:
    # Open pdf file
    pdfFile = pdfplumber.open(pathToFile)

    # Get list of all pages' objects
    allPages = pdfFile.pages

    # Extract text from each page and store into one string
    allText = ""
    for pageObject in allPages:
        pageText = pageObject.extract_text()
        allText += pageText
    
    return allText

In [None]:
"""
Preprocess text by replacing newline with a space

Notes:
   - After removing \n, it tags additional phrases
   - But, sometimes it adds previous line's words into new line's phrase
"""
def preprocessText(text: str) -> str:
    # Make lower case, and remove newline
    preprocessedText = text.replace("\n", " ")
    return preprocessedText

### Label Entities

In [72]:
"""
Tag entities as GPE and LOC and return 2 lists
"""
def getLocations(text: str):
    nlp = spacy.load('en')
    doc = nlp(text)

    GPEList, LOCList = [], []
    for i,ent in enumerate(doc.ents):
        if ent.label_ == "GPE":
            GPEList.append(ent.text)
        elif ent.label_ == "LOC":
            LOCList.append(ent.text)

    return GPEList, LOCList

In [117]:
"""
Assign labels to locations 
"""
def getLocationLabels(lst: list) -> dict:
    # Define labels
    countries = ['us', 'u.s.', 'united states', 'usa', 'united states of america']
    states = ['ca', 'c.a.', 'california']
    cities = ['san jose', 'san josé', 'san carlos', 'south bay', 'silicon valley']
    directions = ['north', 'south', 'west', 'east']

    # Check for locations in list
    locations = {}
    for word in lst:
        word = word.lower()

        wordLst = [word for country in countries if country in word]
        if len(wordLst) > 0:
            locations[word] = "Country"
            print("Country: " + word)

        wordLst = [word for state in states if state in word]
        if len(wordLst) > 0:
            locations[word] = "State"
            print("State: " + word)

        wordLst = [word for city in cities if city in word]
        if len(wordLst) > 0:
            locations[word] = "City"
            print("City: " + word)

        wordLst = [word for direction in directions if direction in word]
        if len(wordLst) > 0:
            locations[word] = "Direction"
            print("Direction: " + word)

    return locations

View tagged entities in HTML

In [None]:
"""
Get HTML code for the document with all entity tags
"""
def getHTMLString(text: str):
   nlp = spacy.load('en')
   doc = nlp(text)
   # Get HTML with tagged document
   HTMLString = spacy.displacy.render(doc, style='ent')
   return HTMLString

In [None]:
"""
View HTML page in notebook
"""
def displayHTML(HTMLString):
    HTMLOutput = HTML(HTMLString)
    display(HTMLOutput)

***
# Run Entity Recognition

In [None]:
# Load libraries
import spacy
import pdfplumber
from IPython.display import HTML

### View Locations in One File

In [140]:
# Load one file
pathToFolder = "/content/gdrive/My Drive/#proj-city-agenda-scraper/Agenda_Scraper_Files/Legistar/"
filename = "SanJose5.pdf"
pathToFile = pathToFolder + filename

# Get raw text
text = getText(pathToFile)
text = preprocessText(text)

# Get GPE and LOC tagged phrases
GPE, LOC = getLocations(text)

In [141]:
# View original tagged phrases
print(GPE)

['SQUARE', 'California', 'East Santa Clara Street', 'the City of San José', 'County', 'Santa Clara', 'California', 'Santa Clara County', 'Santa Clara County Records', 'the POINT OF BEGINNING', 'Blvd', 'Cal', 'San José', 'City', 'California', 'North Almaden Boulevard', 'Mayor']


In [143]:
# View new location labels
GPE_Dict = getLocationLabels(GPE)

State: california
Direction: east santa clara street
City: the city of san josé
State: california
State: cal
City: san josé
State: california
Direction: north almaden boulevard


In [144]:
# View original tagged phrases
print(LOC)

['EAST', 'North Almaden Boulevard', 'the Grand Deed', 'East', 'West', 'East', 'East', 'West', 'BOULEVARD EAST']


In [145]:
# View new location labels
LOC_Dict = getLocationLabels(LOC)

Direction: east
Direction: north almaden boulevard
Direction: east
Direction: west
Direction: east
Direction: east
Direction: west
Direction: boulevard east


### View Tagged File with HTML

In [None]:
# Load one file
pathToFolder = "/content/gdrive/My Drive/#proj-city-agenda-scraper/Agenda_Scraper_Files/Legistar/"
filename = "SanJose14.pdf"
pathToFile = pathToFolder + filename
# Get raw text
text = getText(pathToFile)

HTMLString = getHTMLString(text)

displayHTML(HTMLString)

In [None]:
# Load one file
pathToFolder = "/content/gdrive/My Drive/#proj-city-agenda-scraper/Agenda_Scraper_Files/Legistar/"
filename = "SanJose14.pdf"

pathToFile = pathToFolder + filename
text = getText(pathToFile)
text = preprocessText(text) #remove newline chars

HTMLString = getHTMLString(text)

displayHTML(HTMLString)