# Entities with spaCy

***
# Setup

### Install Packages
  * pdfplumber
  * us

In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.5.28.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 2.4 MB/s 
[?25hCollecting pdfminer.six==20200517
  Downloading pdfminer.six-20200517-py3-none-any.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 27.1 MB/s 
Collecting Wand
  Downloading Wand-0.6.7-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 51.8 MB/s 
[?25hCollecting pycryptodome
  Downloading pycryptodome-3.10.4-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 28.3 MB/s 
Building wheels for collected packages: pdfplumber
  Building wheel for pdfplumber (setup.py) ... [?25l[?25hdone
  Created wheel for pdfplumber: filename=pdfplumber-0.5.28-py3-none-any.whl size=32240 sha256=52249695d95b9fd2ea239df5b9dc1295d911b7e20e6064a6e77949b9788520a9
  Stored in directory: /root/.cache/pip/wheels/f2/b1/a0/c0a77b756d580f53b3806ae0e0b3ec945a8d05fca1d6e10cc1
Successfully built pdfplumb

In [4]:
!pip install us

Collecting us
  Downloading us-2.0.2.tar.gz (14 kB)
Collecting jellyfish==0.6.1
  Downloading jellyfish-0.6.1.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 37.8 MB/s 
[?25hBuilding wheels for collected packages: us, jellyfish
  Building wheel for us (setup.py) ... [?25l[?25hdone
  Created wheel for us: filename=us-2.0.2-py3-none-any.whl size=11942 sha256=8399846c3e19f7fdc438f8d2d24d491d3ed7513017786d74a1da0b99586f9d63
  Stored in directory: /root/.cache/pip/wheels/ca/6b/11/cda9ea2438f721330a35c9a2c8e34b4aedcd34c89af48a4d00
  Building wheel for jellyfish (setup.py) ... [?25l[?25hdone
  Created wheel for jellyfish: filename=jellyfish-0.6.1-cp37-cp37m-linux_x86_64.whl size=72185 sha256=ee4f29568893d769da34a2d266807d4501d4dda2f2f5c73717c3869d5f2ca4ce
  Stored in directory: /root/.cache/pip/wheels/a1/99/51/7de469e37cd1b3c763c24394e1ebf1baa2d79e094bf346cf80
Successfully built us jellyfish
Installing collected packages: jellyfish, us
Successfully installed jellyfish-0

### Mount Google Drive

In [2]:
from google.colab import drive 

drive.mount('/content/gdrive')

Mounted at /content/gdrive


***
# Functions

1. Get Raw Text
  * getText(pathToFile)
  * preprocessText(text)

2. Label Entities
  * get_US_States( )
  * getLocations(text)
  * getLocationLabels(lst)

3. Display Tagged Document
  * getHTMLString(text)
  * displayHTML(HTMLString)







### Get Raw Text

In [1]:
"""
Extract text from PDF file as a string
"""
def getText(pathToFile: str) -> str:
    # Open pdf file
    pdfFile = pdfplumber.open(pathToFile)

    # Get list of all pages' objects
    allPages = pdfFile.pages

    # Extract text from each page and store into one string
    allText = ""
    for pageObject in allPages:
        pageText = pageObject.extract_text()
        allText += pageText
    
    return allText

In [2]:
"""
Preprocess text by replacing newline with a space

Notes:
   - After removing \n, it tags additional phrases
   - But, sometimes it adds previous line's words into new line's phrase
"""
def preprocessText(text: str) -> str:
    # Make lower case, and remove newline
    preprocessedText = text.replace("\n", " ")
    return preprocessedText

### Label Entities

In [3]:
"""
Get list of U.S. states
(not including Distric of Columbia, currently)
"""
def get_US_States():
    # Get <State: Name> elements in list
    US_States = us.states.STATES
    # Get state names only
    US_States = [state.name for state in US_States]
    
    # Get list of state abbreviations
    US_States_Abbreviations = []
    for state in US_States:
        stateObj = us.states.lookup(state)
        US_States_Abbreviations.append(stateObj.abbr)

    # Return both lists
    return US_States, US_States_Abbreviations

In [4]:
"""
Tag entities as GPE and LOC 
    Input: a string of all text in pdf
    Output: 2 lists with locations from text
"""
def getLocations(text: str):
    nlp = spacy.load('en')
    doc = nlp(text)

    GPEList, LOCList = [], []
    for i,ent in enumerate(doc.ents):
        if ent.label_ == "GPE":
            GPEList.append(ent.text)
        elif ent.label_ == "LOC":
            LOCList.append(ent.text)

    return GPEList, LOCList

In [5]:
"""
Assign labels to locations 
    Input: List of words 
    Output: Dictionary of word:label

Issues:
    Abbreviations and short forms of words
"""
def getLocationLabels(lst: list) -> dict:
    # Define labels
    countries = ['us', 'u.s.', 'united states', 'usa', 'united states of america']
    
    states, stateAbbreviations = get_US_States()
    #states = states + stateAbbreviations
    states = [state.lower() for state in states]

    cities = ['san jose', 'san josé', 'san carlos', 'south bay', 
              'silicon valley', 'santa clara',]
    streets = ['street', 'st', 'boulevard', 'road', 'rd',]
    directions = ['north', 'south', 'west', 'east']

    # Check for locations in list
    locations = {}
    for word in lst:
        originalWord = word
        word = word.lower()

        # 
        for country in countries:
          if country in word:
            wordLst.append(word)
        wordLst = [word for country in countries if country in word]
        if len(wordLst) > 0:
            locations[originalWord] = "Country"
            print("Country: " + originalWord)

        wordLst = [word for state in states if state in word]
        if len(wordLst) > 0:
            locations[originalWord] = "State"
            print("State: " + originalWord)

        wordLst = [word for city in cities if city in word]
        if len(wordLst) > 0:
            locations[originalWord] = "City"
            print("City: " + originalWord)

        wordLst = [word for streetType in streets if streetType in word]
        if len(wordLst) > 0:
            locations[originalWord] = "Street Type"
            print("Street Type: " + originalWord)

        wordLst = [word for direction in directions if direction in word]
        if len(wordLst) > 0:
            locations[originalWord] = "Direction"
            print("Direction: " + originalWord)
    # dictionary of word:label pairs
    return locations

### Display Tagged Document

In [6]:
"""
Get HTML code for the document with all entity tags
"""
def getHTMLString(text: str):
   nlp = spacy.load('en')
   doc = nlp(text)
   # Get HTML with tagged document
   HTMLString = spacy.displacy.render(doc, style='ent')
   return HTMLString

In [7]:
"""
View HTML page in notebook
"""
def displayHTML(HTMLString):
    HTMLOutput = HTML(HTMLString)
    display(HTMLOutput)

***
# Run Entity Recognition

In [8]:
# Load libraries
import spacy
import pdfplumber
import us
from IPython.display import HTML

### View Locations in One File

In [9]:
# Load one file
pathToFolder = "/content/gdrive/My Drive/#proj-city-agenda-scraper/Agenda_Scraper_Files/Legistar/"
filename = "SanJose5.pdf"
pathToFile = pathToFolder + filename

# Get raw text
text = getText(pathToFile)
text = preprocessText(text)

# Get GPE and LOC tagged phrases
GPE, LOC = getLocations(text)
# Add specified labels (output dictionaries)
GPE_Labels = getLocationLabels(GPE)
print("-"*30)
LOC_Labels = getLocationLabels(LOC)

State: California
City: East Santa Clara Street
Street Type: East Santa Clara Street
Direction: East Santa Clara Street
City: the City of San José
City: Santa Clara
State: California
City: Santa Clara County
City: Santa Clara County Records
Street Type: Santa Clara County Records
City: San José
State: California
Street Type: North Almaden Boulevard
Direction: North Almaden Boulevard
------------------------------
Street Type: EAST
Direction: EAST
Street Type: North Almaden Boulevard
Direction: North Almaden Boulevard
Street Type: East
Direction: East
Street Type: West
Direction: West
Street Type: East
Direction: East
Street Type: East
Direction: East
Street Type: West
Direction: West
Street Type: BOULEVARD EAST
Direction: BOULEVARD EAST


In [10]:
# View original tagged phrases
print(GPE)

['SQUARE', 'California', 'East Santa Clara Street', 'the City of San José', 'County', 'Santa Clara', 'California', 'Santa Clara County', 'Santa Clara County Records', 'the POINT OF BEGINNING', 'Blvd', 'Cal', 'San José', 'City', 'California', 'North Almaden Boulevard', 'Mayor']


In [11]:
# View original tagged phrases
print(LOC)

['EAST', 'North Almaden Boulevard', 'the Grand Deed', 'East', 'West', 'East', 'East', 'West', 'BOULEVARD EAST']


### View Tagged File with HTML

In [12]:
# Load one file
pathToFolder = "/content/gdrive/My Drive/#proj-city-agenda-scraper/Agenda_Scraper_Files/Legistar/"
filename = "SanJose14.pdf"
pathToFile = pathToFolder + filename
# Get raw text
text = getText(pathToFile)

HTMLString = getHTMLString(text)

displayHTML(HTMLString)

In [None]:
# Load one file
pathToFolder = "/content/gdrive/My Drive/#proj-city-agenda-scraper/Agenda_Scraper_Files/Legistar/"
filename = "SanJose14.pdf"

pathToFile = pathToFolder + filename
text = getText(pathToFile)
text = preprocessText(text) #remove newline chars

HTMLString = getHTMLString(text)

displayHTML(HTMLString)