# Exploring the Press Data
## 3) Applying the NER

In [1]:
import numpy as np
import pandas as pd

import spacy
from spacy import displacy

import pickle

import sortedcontainers
import pprint
import json
import bson

Set pandas to display 150 rows of a table by default, if it can.

In [2]:
# show up to 150 rows whenever pandas renders a table
pd.options.display.max_rows = 150

### a) Enhance spaCy's NER using the EntityRuler

Define two functions: one that creates a single pattern (a label plus the text to match) for the EntityRuler, and one that generates all the patterns used by this program.

In [3]:
# builds the complete list of patterns that is handed to the EntityRuler
# the patterns are appended in this order: federal government, provincial governments,
# provinces/territories, federal-provincial collaborations, payment types,
# government departments, and program streams
# department and program names are read from the two sheets of the Excel workbook below
# returns: a list of dictionaries, each one produced by create_pattern
def generate_patterns():

    # names of the program streams and departments; empty cells become the marker "N/A"
    workbook = "./excel_sheets/Departments and Programs.xlsx"
    programs_list = pd.read_excel(workbook, sheet_name='Programs').fillna("N/A")
    dept_list = pd.read_excel(workbook, sheet_name='Departments').fillna("N/A")

    payment_types = ['grant', 'repayable contribution', 'non-repayable contribution', 'intermediary']
    provinces_and_territories = [
        'Nova Scotia', 'New Brunswick', 'Prince Edward Island', 'PEI',
        'Newfoundland and Labrador', 'Quebec', 'Ontario', 'Manitoba',
        'Saskatchewan', 'Alberta', 'British Columbia', 'Yukon',
        'Northwest Territories', 'Nunavut',
    ]

    # federal government (case-insensitive patterns)
    patterns = [
        create_pattern('FED_GOV', 'Government of Canada', False),
        create_pattern('FED_GOV', 'Parliament', False),
    ]

    # provincial governments, in both word orders (case-insensitive patterns)
    for prov_terr in provinces_and_territories:
        patterns += [
            create_pattern('PROV_GOV', "Government of " + prov_terr, False),
            create_pattern('PROV_GOV', prov_terr + " government", False),
        ]

    # the provinces/territories themselves, all labelled PROVINCE (exact patterns)
    patterns.extend(
        create_pattern('PROVINCE', prov_terr, True)
        for prov_terr in provinces_and_territories
    )

    # collaborations between the federal and a provincial government (case-insensitive patterns)
    for prov_terr in provinces_and_territories:
        patterns += [
            create_pattern('GOV_COLLAB', "Governments of Canada and " + prov_terr, False),
            create_pattern('GOV_COLLAB', "Governments of " + prov_terr + " and Canada", False),
        ]

    # payment types (exact patterns)
    patterns.extend(
        create_pattern('PAYMENT_TYPE', payment_type, True)
        for payment_type in payment_types
    )

    # every non-empty cell of every column of the Departments sheet (exact patterns)
    for _, dept_names in dept_list.items():
        patterns.extend(
            create_pattern('GOV_DEPT', dept, True)
            for dept in dept_names
            if dept != 'N/A'
        )

    # every non-empty cell of the Programs sheet, skipping columns whose header
    # contains "Unnamed" (exact patterns)
    for column_name, program_names in programs_list.items():
        if "Unnamed" in column_name:
            continue
        patterns.extend(
            create_pattern('PROGRAM_STREAM', program, True)
            for program in program_names
            if program != 'N/A'
        )

    return patterns

# builds one pattern entry in the dictionary format expected by the EntityRuler
# the entry always carries the given label; the form of its 'pattern' value depends on exact_match:
#   - exact_match True:  the pattern text is stored unchanged, so only text identical to it
#                        is meant to be re-labelled (case sensitive)
#   - exact_match False: the text is lower-cased and split on whitespace, and each word becomes
#                        a {'lower': word} token rule, so the match ignores case
# input:
#   label - string
#   pattern - string
#   exact_match - boolean
# returns:
#   dictionary with the keys 'label' and 'pattern'
def create_pattern(label, pattern, exact_match):

    if exact_match:
        rule = pattern
    else:
        # one token rule per whitespace-separated word of the lower-cased text
        rule = [{'lower': word} for word in pattern.lower().split()]

    return {'label': label, 'pattern': rule}

Create the patterns for the EntityRuler

In [4]:
# build the full pattern list once; it is registered with the EntityRuler via add_patterns
patterns = generate_patterns()

Add the EntityRuler to spaCy's pipeline to extend (and correct) the NER

In [5]:
# EntityRuler is the pipeline component that applies the custom patterns
from spacy.pipeline import EntityRuler

# load the spaCy pipeline named "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")

# create the ruler for this pipeline and register the generated patterns with it
# NOTE(review): overwrite_ents=True presumably lets ruler matches replace overlapping
# entities already assigned by the model's NER -- confirm against the spaCy docs
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(patterns)
# NOTE(review): passing the component object to add_pipe looks like the spaCy v2 API;
# spaCy v3 expects the registered component name ("entity_ruler") instead -- confirm
# which spaCy version this notebook is run with
nlp.add_pipe(ruler)

### b) Import the Articles

In [6]:
# bson.json_util.loads decodes a JSON string into Python objects
# (it is used here instead of json.loads for the individually stored articles)
from bson.json_util import loads

# maps each program stream name to the list of its decoded article objects
results = dict()

# read the dictionary of the cleaned articles
# the encoding is given explicitly so that decoding the JSON file does not depend on
# the platform's default locale encoding
with open('./data/cleaned_articles_dictionary.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# the file is fully parsed at this point, so the conversion does not need the open handle
results_json = data['cleaned_articles_dictionary']

# every article is stored as a JSON string of its own, so each one is decoded individually
for program in results_json:
    results[program] = []
    for article in results_json[program]:
        results[program].append(loads(article))

### c) Process the Articles

Create a function to apply the customized NLP to a specific article

In [7]:
# runs the customized NLP pipeline (the module-level nlp) over the text of one article
# NOTE: the article is expected to have the same layout as the raw articles stored on MongoDB,
# i.e. article['details']['fulltext'] holds the full text as a list of strings and
# article['_id'] identifies the article
# returns: a list that starts with the string "Start of Article: <id>", continues with the
# spaCy Doc objects produced for the strings of the full text, and ends with the
# string "End of Article"
def process_article(article):

    # the full text of the article, one string per entry
    paragraphs = article['details']['fulltext']

    opening_marker = "Start of Article: " + str(article['_id'])

    # nlp.pipe yields the processed Doc objects; they are unpacked between the two markers
    return [opening_marker, *nlp.pipe(paragraphs), "End of Article"]

Apply the NLP to all the program stream articles

In [8]:
# flat list collecting, for every program stream, the program name (as a plain string)
# followed by the output of process_article for each of its articles
doc_list = []

# progress counters: number of the program being processed and the total number of programs
total_programs = len(results)
i = 1

# walk through the program streams of the dictionary and process all of their articles
for program, program_articles in results.items():

    # progress message; the line is completed by "done!" once the program is finished
    status = "[{}/{}]".format(i, total_programs)
    print("Working on", program, status, "...", end=" ")

    # the program name precedes the entries of its articles
    doc_list.append(program)

    # apply the NLP to each article and add everything it returns to the doc_list
    for article in program_articles:
        doc_list.extend(process_article(article))

    print("done!")
    i += 1

Working on Accelerated Growth Service [1/144] ... done!
Working on Advanced Manufacturing Fund [2/144] ... done!
Working on Aerospace Program [3/144] ... done!
Working on Agri-Science Clusters [4/144] ... done!
Working on AgriInnovate Program [5/144] ... done!
Working on AgriInnovation Program [6/144] ... done!
Working on AgriMarketing Program [7/144] ... done!
Working on AgriProcessing Initiative [8/144] ... done!
Working on AgriScience Program [9/144] ... done!
Working on Agricultural Clean Technology Program [10/144] ... done!
Working on Agricultural Greenhouse Gases Program [11/144] ... done!
Working on Agricultural Innovation Program [12/144] ... done!
Working on Applied Research and Development Grants [13/144] ... done!
Working on Aquaculture Collaborative Research and Development Program [14/144] ... done!
Working on Aquatic and Crop Resource Development [15/144] ... done!
Working on Atlantic Fisheries Fund [16/144] ... done!
Working on Atlantic Innovation Fund [17/144] ... done

Working on Strategic Partnership Grants for Projects [127/144] ... done!
Working on Support for Publishers [128/144] ... done!
Working on Sustainable Development Technology Canada [129/144] ... done!
Working on Technology Access Centres Grants [130/144] ... done!
Working on Technology Demonstration Program [131/144] ... done!
Working on Technology Partnerships Canada [132/144] ... done!
Working on Temporary Foreign Worker Program [133/144] ... done!
Working on Trade Commissioner Service [134/144] ... done!
Working on University Idea to Innovation Grants [135/144] ... done!
Working on Western Diversification Program [136/144] ... done!
Working on Western Innovation Initiative [137/144] ... done!
Working on Women Entrepreneurship Fund [138/144] ... done!
Working on Women Entrepreneurship Strategy [139/144] ... done!
Working on Women's Entreprise Initiative [140/144] ... done!
Working on eHealth Innovations Partnership Program [141/144] ... done!
Working on ecoENERGY for Renewable Heat [1

Pickle and save the Doc objects to a file for further processing

In [9]:
# destination file for the pickled doc_list
file_name = './data/labelled_text.bin'

# the with-statement closes the file when the block ends, so no explicit close() is needed
with open(file_name, mode='wb') as outfile:
    pickle.dump(doc_list, outfile)