# Exploring the Press Data
## 4) Extracting Useful Information

In [1]:
import json
import os
import pickle
import pprint

import bson
import numpy as np
import pandas as pd
import pymongo
import sortedcontainers
import spacy
from spacy import displacy

Connect to Mongo (used to gather more information about the articles)

In [2]:
# connect to MongoDB
# the connection string (which embeds the username and password) is read from the
# PRESSDATA_MONGO_URI environment variable so that real credentials never have to be
# typed into, or saved with, the notebook; when the variable is not set the original
# placeholder URI is used and must be filled in by hand
MONGO_URI = os.environ.get(
    "PRESSDATA_MONGO_URI",
    "mongodb+srv://<username>:<password>@cluster0-uwh3o.mongodb.net/pressdata_db?retryWrites=true&w=majority",
)
client = pymongo.MongoClient(MONGO_URI)
db = client.get_database('pressdata_db')

# all the records are part of the "clippings" collection
records = db.clippings

### a) Import and structure the labelled articles

Define a function to reorganize and unpack the Doc objects (labelled articles) from the *labelled_text.bin* file into a dictionary for easier access and processing

In [3]:
from bson import ObjectId

def unpack_doc_list(doc_list):
    """Group the flat list loaded from labelled_text.bin into articles per program.

    The list is read as a stream in which plain strings act as markers:
      * a string containing "Start of Article: " opens a new article; the rest of the
        string is converted to a Mongo ObjectId and used as the article id
      * the string "End of Article" files the open article under the current program
      * any other string is taken to be a program name and becomes the current program
    Every non-string item (a labelled Doc object) is appended to the 'text' list of the
    most recently opened article.

    input: doc_list - list of marker strings and Doc objects
    returns: dictionary mapping each program name to a list of articles, where each
             article is a dictionary of the form {'_id': ObjectId, 'text': [Doc, ...]}
    """
    doc_dictionary = dict()

    program = None
    for doc in doc_list:

        # anything that is not a marker string is a paragraph of the open article
        if not isinstance(doc, str):
            article['text'].append(doc)
            continue

        if doc.find("Start of Article: ") != -1:
            article = {'_id': ObjectId(doc.replace("Start of Article: ", "")), "text": []}
        elif doc == "End of Article":
            doc_dictionary[program].append(article)
        else:
            program = doc
            # setdefault instead of assigning a fresh list: if the same program marker
            # shows up more than once, the articles already collected for it are kept
            doc_dictionary.setdefault(program, [])

    return doc_dictionary

Import the labelled text

In [4]:
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load a
# labelled_text.bin that was produced by this project's own labelling step
# the with-statement closes the file on exit, so no explicit close() is needed
with open('./data/labelled_text.bin', 'rb') as infile:
    doc_list = pickle.load(infile)

Create the article dictionary

In [5]:
# group the flat list of marker strings and labelled paragraphs into a dictionary of
# the form {program name: [article, ...]}
doc_dictionary = unpack_doc_list(doc_list)

### b) Create a dictionary of equivalent names

To use the dictionary, you use the search word as the key, and the corresponding value will be the standardized word it is equivalent to.

For example: searching "ACOA" gives "Atlantic Canada Opportunities Agency" (and searching "Atlantic Canada Opportunities Agency" also gives "Atlantic Canada Opportunities Agency").

Define the function to create the dictionary of equivalencies

In [6]:
def _add_equivalent_names(equivalencies, table, standard_column):
    """Map every filled-in cell of each row of `table` to that row's `standard_column` value."""
    for _, row in table.iterrows():
        standard_name = row[standard_column]
        for name in row:
            # "N/A" is the filler the caller writes into empty cells; it is not a real name
            if name == "N/A":
                continue
            equivalencies[name] = standard_name


def create_equivalency_dictionary(workbook_path="./excel_sheets/Departments and Programs.xlsx"):
    """Create the dictionary of equivalent words for the purposes of the BIGS programs.

    This includes equivalent department names, equivalent program names, and the
    abbreviation 'PEI' for Prince Edward Island. Every name found in a row of the
    'Departments' sheet maps to that row's 'Department' value, and every name found in a
    row of the 'Programs' sheet maps to that row's 'Programs' value (the standard names
    therefore map to themselves).

    input: workbook_path - path of the "Departments and Programs" excel workbook
           (defaults to the location used by this notebook)
    returns: dictionary mapping a search word to its standardized word
    """
    # read the departments and programs; empty cells are filled with "N/A" so that they
    # can be recognized and skipped
    dept_list = pd.read_excel(workbook_path, sheet_name='Departments').fillna("N/A")
    programs_list = pd.read_excel(workbook_path, sheet_name='Programs').fillna("N/A")

    # create the empty dictionary
    equivalencies = dict()

    # add equivalent department names
    _add_equivalent_names(equivalencies, dept_list, 'Department')

    # only keep the named columns of the Programs sheet (the "Unnamed" ones are just comments)
    relevant_columns = [column for column in programs_list.columns if "Unnamed" not in column]
    programs_list = programs_list[relevant_columns]

    # add equivalent program names
    _add_equivalent_names(equivalencies, programs_list, 'Programs')

    # add equivalent province names
    equivalencies['PEI'] = 'Prince Edward Island'

    return equivalencies

Define a function to map a word to its equivalent or standard word using the equivalencies dictionary

In [7]:
def equivalency(word):
    """Return the standardized form of the given word.

    The word is looked up in the module-level equivalencies dictionary; words that are
    not in the dictionary are returned unchanged.
    """
    return equivalencies.get(word, word)

Create the equivalency dictionary

In [8]:
# build the lookup once; equivalency() reads this module-level dictionary
equivalencies = create_equivalency_dictionary()

### c) Building the structured data set

In [49]:
def find_key_sentences(article, list_of_patterns):
    """Find the key sentences of an article and hand each one to its callback.

    input:
      article - dictionary with the article id under '_id' and the list of labelled
                paragraphs (Doc objects) under 'text'
      list_of_patterns - list of patterns, each of the form
                [ list of entity labels, callback function ]
                A sentence matches a pattern when every listed label occurs among the
                labels of the sentence's entities. The callback takes 3 arguments: the
                sentence (Span object), the paragraph (Doc object) and the article id
                (Mongo Object ID).
    returns: void

    A sentence is only ever assigned to the first pattern it matches, and sentences
    without any entities are ignored. The callbacks are invoked after the whole article
    has been scanned: all matches of the first pattern first, then all matches of the
    second pattern, and so on.
    """
    # split the patterns into the label requirements and their callbacks
    required_labels = [pattern[0] for pattern in list_of_patterns]
    callbacks = [pattern[1] for pattern in list_of_patterns]

    # matches[k] collects the (sentence, paragraph) pairs found for the k-th pattern
    matches = [[] for _ in required_labels]

    article_id = article['_id']
    paragraphs = article['text']

    for paragraph in paragraphs:
        for sentence in paragraph.sents:

            labels_present = [entity.label_ for entity in sentence.ents]
            if not labels_present:
                continue

            for wanted, bucket in zip(required_labels, matches):

                # skip this pattern as soon as one of its labels is missing
                if any(label not in labels_present for label in wanted):
                    continue

                # record the sentence unless an equal one was already recorded
                if sentence not in [seen for seen, _ in bucket]:
                    bucket.append((sentence, paragraph))

                # the first matching pattern claims the sentence
                break

    # report every key sentence to the callback of its pattern
    for callback, bucket in zip(callbacks, matches):
        for sentence, paragraph in bucket:
            callback(sentence, paragraph, article_id)

    return

In [50]:
# variable to hold the program stream table data, which is used to create the table
program_stream_table = list()

def add_to_program_stream_table(sent, doc, article_id):
    """Add one row per program stream mentioned in the sentence to program_stream_table.

    Each row is a dictionary holding the program name, one column per entry of
    column_categories, the sentence and paragraph text, and the publication date, id and
    title of the article. The columns can be adjusted using the column_categories list.

    input:
      sent - Span object (the key sentence)
      doc - Doc object (the paragraph containing the sentence)
      article_id - Mongo Object ID
    returns: void (rows are appended to the global program_stream_table)

    The sentence must contain at least one PROGRAM_STREAM entity; otherwise the lookup of
    label_dict['PROGRAM_STREAM'] raises a KeyError. If the article id is not found in
    Mongo, the rows are still created with an empty publication date and title.
    """
    # adjust what columns are included in the table, and also whether duplicates are preserved
    column_categories = [
        ['GOV_DEPT', False],
        ['PROV_GOV', False],
        ['MONEY', True],
        ['PAYMENT_TYPE', True],
        ['ORG', False],
    ]

    # create a label dictionary for all entities in the sentence
    label_dict = create_label_dictionary(sent)

    # get all the program streams mentioned in the sentence
    programs = set(label_dict['PROGRAM_STREAM'])

    # the article is the same for every row, so it is looked up on Mongo only once
    mongo_article = records.find_one({'_id': article_id})
    if mongo_article is None:
        # unknown article id: keep the extracted data and leave the article details blank
        mongo_article = {'pub_date': None, 'title': None}

    # create a row in the table for each program stream
    for program in programs:

        row = dict()
        row['Program'] = program

        # one column per category; categories absent from the sentence stay empty
        for category, keep_duplicates in column_categories:
            if category in label_dict:
                add_to_row(row, category, label_dict[category], keep_duplicates)
            else:
                row[category] = None

        row['Sentence'] = sent.text
        row['Paragraph'] = doc.text

        row['Publication Date'] = mongo_article['pub_date']
        row['Article ID'] = article_id
        row['Article'] = mongo_article['title']

        # add the row to the global program_stream_table object
        program_stream_table.append(row)

    return
    
def add_to_row(row, category, new_items, duplicates):
    """Store new_items in the given row (dictionary) under the entity label "category".

    What gets stored depends on the items:
      * exactly one item              -> that item itself
      * duplicates is True            -> the new_items list as given
      * duplicates is False           -> the set of distinct items, or the single item if
                                         all of them turn out to be the same

    returns: the row that was passed in (it is modified in place)
    """
    if len(new_items) == 1:
        value = new_items[0]
    elif duplicates:
        value = new_items
    else:
        distinct_items = set(new_items)
        value = distinct_items.pop() if len(distinct_items) == 1 else distinct_items

    row[category] = value
    return row

# names looked for inside government collaboration (GOV_COLLAB) entities
PROVINCES_AND_TERRITORIES = [
    'Nova Scotia', 'New Brunswick', 'Prince Edward Island', 'Newfoundland and Labrador',
    'Quebec', 'Ontario', 'Manitoba', 'Saskatchewan', 'Alberta', 'British Columbia',
    'Yukon', 'Northwest Territories', 'Nunavut',
]


def _first_province_mentioned(text):
    """Return the province or territory that appears earliest in text, or None if there is none."""
    hits = [(text.find(name), name) for name in PROVINCES_AND_TERRITORIES if name in text]
    return min(hits)[1] if hits else None


def create_label_dictionary(sent):
    """Create a dictionary of the entity labels and the entities present in the sentence.

    Every entity text is standardized with equivalency() before it is stored, and the
    entities of each label are kept in sentence order (duplicates included).
    Government collaborations (GOV_COLLAB) that mention a province or territory are
    stored under the PROV_GOV label as 'Government of <province>' instead.

    input: sent - Span object
    returns: dictionary mapping entity label -> list of entity texts
    """
    label_dict = dict()
    for ent in sent.ents:

        text = ent.text
        label = ent.label_

        if label == 'GOV_COLLAB':
            # search the whole entity text rather than single tokens, so that names made
            # of several words (e.g. "Nova Scotia") are recognized as well
            province = _first_province_mentioned(text)
            if province is not None:
                text = 'Government of ' + province
                label = 'PROV_GOV'

        label_dict.setdefault(label, []).append(equivalency(text))

    return label_dict

In [51]:
def found_collab_data(sent, doc, article_id):
    """Callback for key sentences with government collaborations (not implemented yet).

    Takes the sentence, its paragraph and the article id like every key sentence
    callback, and currently does nothing with them.
    """
    return None
  
def found_payment_type(sent, doc, article_id):
    """Callback for key sentences with payment types (not implemented yet).

    Takes the sentence, its paragraph and the article id like every key sentence
    callback, and currently does nothing with them.
    """
    return None

In [52]:
# each pattern is [entity labels a sentence must contain, callback run on matching sentences]
# the order matters: find_key_sentences assigns a sentence only to the first pattern it
# matches, so a sentence with both PROGRAM_STREAM and MONEY never reaches the later callbacks
patterns = [
    [['PROGRAM_STREAM', 'MONEY'], add_to_program_stream_table],
    [['GOV_COLLAB'], found_collab_data],
    [['PAYMENT_TYPE'], found_payment_type]
]

In [53]:
# start from an empty table so that re-running this cell does not append a second copy
# of every row to program_stream_table
program_stream_table.clear()

# run the key sentence search over every article of every program; the matching
# callbacks fill program_stream_table as a side effect
for program, articles in doc_dictionary.items():

    print('Prcessing articles for', program)

    for article in articles:
        find_key_sentences(article, patterns)

Prcessing articles for Accelerated Growth Service
Prcessing articles for Advanced Manufacturing Fund
Prcessing articles for Aerospace Program
Prcessing articles for Agri-Science Clusters
Prcessing articles for AgriInnovate Program
Prcessing articles for AgriInnovation Program
Prcessing articles for AgriMarketing Program
Prcessing articles for AgriProcessing Initiative
Prcessing articles for AgriScience Program
Prcessing articles for Agricultural Clean Technology Program
Prcessing articles for Agricultural Greenhouse Gases Program
Prcessing articles for Agricultural Innovation Program
Prcessing articles for Applied Research and Development Grants
Prcessing articles for Aquaculture Collaborative Research and Development Program
Prcessing articles for Aquatic and Crop Resource Development
Prcessing articles for Atlantic Fisheries Fund
Prcessing articles for Atlantic Innovation Fund
Prcessing articles for Automotive Innovation Fund
Prcessing articles for Automotive Supplier Innovation Fund

Create the Data Frame of the structured data 

In [59]:
# build the table once, ordered by program; mergesort is a stable sort, so rows of the
# same program keep the order in which they were extracted (the default quicksort does
# not guarantee this)
df = pd.DataFrame(program_stream_table).sort_values(by=['Program'], kind='mergesort')

# df is already sorted by program, so the concise view only needs the new index
df_concise = df.set_index(['Program', 'Article'])

Export the Data Frame to Excel

In [64]:
# write both views to Excel: the full table is written without its row index, while the
# concise table is written together with its (Program, Article) index
df.to_excel("./excel_sheets/Program Dataset.xlsx", index=False)
df_concise.to_excel("./excel_sheets/Program Dataset (Concise).xlsx")

### d) Counting the number of entities in each article (Experimental)

In [None]:
def count_entities(article, list_of_entities):
    """Collect and count the entities of an article.

    input:
      article - list of Doc objects (the labelled paragraphs of the article)
      list_of_entities - list of strings naming the entity labels that will be counted
    returns: dictionary with one entry per requested label, each holding a dictionary
             that maps the standardized entity text (see equivalency) to the number of
             times it occurs; entities with any other label are ignored
    """
    tallies = {}
    for requested_label in list_of_entities:
        tallies[requested_label] = {}

    for paragraph in article:
        for entity in paragraph.ents:
            entity_label = entity.label_
            if entity_label not in list_of_entities:
                continue

            name = equivalency(entity.text)
            per_label = tallies[entity_label]
            per_label[name] = per_label.get(name, 0) + 1

    return tallies

In [None]:
# pick a single labelled article to experiment with (index 7 in this program's list)
article = doc_dictionary["Women Entrepreneurship Strategy"][7]
# optional: uncomment to render the labelled entities of each paragraph with displaCy
# for line in article['text']:
#     displacy.render(line, style='ent')

In [None]:
# entity labels to tally; entities with a label not listed here are ignored by count_entities
list_of_entities = ['FED_GOV', 'PROV_GOV', 'GOV_DEPT', 'PROGRAM_STREAM', 'MONEY', 'PAYMENT_TYPE', 'GPE', 'ORG', 'PROVINCE', 'GOV_COLLAB']
    
# per-label counts of the standardized entity texts found in the selected article
entities = count_entities(article['text'], list_of_entities)
entities

In [None]:
# show the counts of a single entity label as a table, most frequent first
label = 'ORG'
pd.DataFrame(entities[label].items(), columns = [label, "Count"]).sort_values(by=['Count'], ascending=[False])