# Exploring the Press Data
## 1) Cleaning up the Articles

In [3]:
import copy
import json
import os
import pprint

import bson
import numpy as np
import pandas as pd
import pymongo

Connect to MongoDB

In [4]:
# Connect to MongoDB.
# The connection string is taken from the PRESSDATA_MONGODB_URI environment variable so that real
# credentials never have to be typed into (or saved with) the notebook. When the variable is not
# set, the placeholder URI below is used as before; <username> and <password> must then be
# substituted by hand before running this cell.
MONGODB_URI = os.environ.get(
    "PRESSDATA_MONGODB_URI",
    "mongodb+srv://<username>:<password>@cluster.mongodb.net/pressdata_db?retryWrites=true&w=majority",
)
client = pymongo.MongoClient(MONGODB_URI)
db = client.get_database('pressdata_db')

# All the records are part of the "clippings" collection
records = db.clippings

Define the functions to clean up individual articles

In [20]:
def cleanup_text(text):
    """Clean the text of one article, where the text is stored as a list of strings.

    The list is modified in place by four steps, applied in this order:

    1. remove_formatting_characters: strips surrounding whitespace from every line
    2. change_apostrophes_and_xa0: re-formats apostrophes and turns embedded "xa0"
       characters into spaces
    3. combine_lines_to_sentences: merges separated sentences/paragraphs into coherent units
    4. remove_irrelevant_sections: drops sections that aren't important for the BIGS program

    Input: a list of strings
    Returns: the same list object that was passed in, after it has been cleaned
    """
    # The order matters: lines must be stripped before blank lines can be recognised and
    # merged, and the section markers are looked up on the merged lines.
    cleaning_steps = (
        remove_formatting_characters,
        change_apostrophes_and_xa0,
        combine_lines_to_sentences,
        remove_irrelevant_sections,
    )
    for apply_step in cleaning_steps:
        apply_step(text)

    return text

def remove_formatting_characters(text):
    """Strip leading and trailing whitespace from every line of the text, in place.

    Spaces, tabs, new line characters and non-breaking spaces ("xa0" characters) at either
    end of a line are removed, so a line made up only of such characters becomes an
    empty string.

    Input: a list of strings
    """
    for position, raw_line in enumerate(text):
        text[position] = raw_line.strip()
    return
    
def change_apostrophes_and_xa0(text):
    """Normalise apostrophes and non-breaking spaces in every line of the text, in place.

    The typographic apostrophe (U+2019) is replaced by a plain ASCII apostrophe. Both of its
    forms are handled: the correctly decoded character, and the three-character sequence
    that appears when its UTF-8 bytes were mis-decoded as Windows-1252. This is done mainly
    because spaCy doesn't always recognize the apostrophes in the raw text, so the
    re-formatting gives greater consistency. Any non-breaking space ("xa0" character)
    embedded in a line is changed to a regular space.

    Input: a list of strings
    """
    replacements = (
        ("\u00e2\u20ac\u2122", "'"),  # U+2019 after a UTF-8 -> Windows-1252 mis-decoding
        ("\u2019", "'"),              # U+2019 itself (right single quotation mark)
        ("\xa0", " "),                # non-breaking space
    )
    for position, line in enumerate(text):
        for old, new in replacements:
            line = line.replace(old, new)
        text[position] = line

    return

def combine_lines_to_sentences(text):
    """Combine the lines of the text into complete paragraphs, in place.

    Blank lines ('') act as separators and are removed. Every run of consecutive non-blank
    lines is merged into a single line: the lines are joined with one space, except that a
    line starting with "," or "'" is attached directly to the text before it. After applying
    this function, each line in the text list (should) correspond to one paragraph of the
    article.

    The list is rebuilt in a single pass and written back with a slice assignment, which
    avoids repeatedly popping from the middle of the list and re-copying the growing
    paragraph.

    Input: list of strings
    """
    paragraphs = []
    pieces = []  # fragments of the paragraph currently being assembled

    for line in text:
        if line == '':
            # a blank line closes the current paragraph (if any) and is dropped
            if pieces:
                paragraphs.append("".join(pieces))
                pieces = []
            continue

        # continuation lines are separated by a space unless they begin with , or '
        if pieces and line[0] not in (",", "'"):
            pieces.append(" ")
        pieces.append(line)

    if pieces:
        paragraphs.append("".join(pieces))

    # slice assignment keeps the caller's list object, as the function works in place
    text[:] = paragraphs

    return

def remove_irrelevant_sections(text):
    """Remove unnecessary trailing sections from the article text, in place.

    This covers the "Search for related information by keyword" and/or the
    "Report a problem or mistake on this page" sections. Removal starts at the first of
    those two markers that is present (checked in that order) and stops just before the
    "Date modified:" line that follows it, which is kept. A marker only counts when a line
    of the text is exactly equal to it.

    The text is left untouched when no start marker is present, or when there is no
    "Date modified:" line after the start marker (previously the latter case raised an
    uncaught ValueError).

    Input: list of strings
    """
    start_markers = (
        "Search for related information by keyword",
        "Report a problem or mistake on this page",
    )

    start = None
    for marker in start_markers:
        if marker in text:
            start = text.index(marker)
            break
    if start is None:
        return

    # look for the end marker only after the start marker, so the deleted range is well defined
    try:
        end = text.index("Date modified:", start)
    except ValueError:
        return

    del text[start:end]

    return

Define the generator function that cleans up all articles in a given department (or, more generally, in any list of articles)

In [4]:
def cleanup_articles(list_of_articles, list_name = None):
    """Clean up all the articles in the given list, yielding the cleaned copies one at a time.

    This function assumes the incoming article objects have the same format as the raw
    articles stored on MongoDB: the full text of each article is a list of strings that can
    be accessed with article['details']['fulltext'].

    A deep copy of each article is created and the copy is cleaned, so the original
    articles are not affected.

    If a list name is provided, a progress message is printed before the first article is
    cleaned and "Success!" is printed once all articles have been yielded (useful when
    cleaning the articles of several different departments). Without a list name nothing
    is printed. Because this is a generator, nothing happens until it is iterated.

    Input: list of articles, string (optional)
    Yields: the cleaned deep copy of each article, in the order of the input list
    """
    report_progress = list_name is not None

    if report_progress:
        print("Cleaning all", len(list_of_articles), "articles in", list_name, "...", end=" ")

    for original_article in list_of_articles:
        # create a deep copy so as to not affect the original list of articles
        article = copy.deepcopy(original_article)
        # cleanup_text works in place on the copy's list of lines
        cleanup_text(article['details']['fulltext'])
        yield article

    # only close the progress line if one was opened above
    if report_progress:
        print("Success!")

Import the department names from the Excel sheet (*Departments and Programs.xlsx*)

In [5]:
# Read the "Departments" sheet of the workbook and keep its 'Department' column as a NumPy array
dept_names = (
    pd.read_excel(
        "./excel_sheets/Departments and Programs.xlsx",
        sheet_name = "Departments",
    )['Department']
    .to_numpy()
)

Query the database for the articles for each department, and clean them

In [6]:
# Collect the cleaned articles of every department in one list
cleaned_articles = []

for dept in dept_names:

    # fetch all of this department's articles from the database
    dept_articles = list(records.find({'department':dept}))

    # cleanup_articles yields the cleaned copies one at a time; add all of them
    cleaned_articles.extend(cleanup_articles(dept_articles, dept))

Cleaning all 709 articles in Agriculture and Agri-Food Canada ... Success!
Cleaning all 1031 articles in Atlantic Canada Opportunities Agency ... Success!
Cleaning all 807 articles in Canada Economic Development for Quebec Regions ... Success!
Cleaning all 1045 articles in Canadian Heritage ... Success!
Cleaning all 177 articles in Canadian Institutes of Health Research ... Success!
Cleaning all 125 articles in Canadian Northern Economic Development Agency ... Success!
Cleaning all 73 articles in Canadian Space Agency ... Success!
Cleaning all 738 articles in Environment and Climate Change Canada ... Success!
Cleaning all 224 articles in Federal Economic Development Agency for Southern Ontario ... Success!
Cleaning all 620 articles in Fisheries and Oceans Canada ... Success!
Cleaning all 1509 articles in Global Affairs Canada ... Success!
Cleaning all 980 articles in Innovation, Science and Economic Development Canada ... Success!
Cleaning all 1374 articles in National Defence ... Succ

In [7]:
# Report how many cleaned articles were collected across all departments
total_cleaned = len(cleaned_articles)
print("Number of articles found:", total_cleaned)

Number of articles found: 10289


Save the cleaned articles to the *cleaned_articles.json* file in the **data** directory

In [8]:
# import function needed to convert BSON into JSON
from bson.json_util import dumps

# Build the dictionary that holds the list of articles. Each article is converted on its own
# with dumps (BSON -> JSON string), so 'cleaned_articles' is a list of JSON strings.
data = {'cleaned_articles': [dumps(article) for article in cleaned_articles]}

# save to the file (note, this re-writes the entire file)
with open('./data/cleaned_articles.json', 'w') as json_file:
    json.dump(data, json_file)