# Exploring the Press Data
## 2) Finding Articles with Program Streams

In [22]:
import numpy as np
import pandas as pd

import pprint
import json
import bson

Define the functions to search for words in the list of articles and create a dictionary to store matches

In [47]:
def is_word_in_string_list(list_of_strings, word):
    """Report whether `word` appears as a substring of any string in the list.

    The match is a plain, case-sensitive substring test, so partial words
    also count as a hit.

    input: iterable of strings to scan, the text to look for
    output: True as soon as one string contains `word`, otherwise False
    """
    return any(word in line for line in list_of_strings)

In [48]:
def create_dictionary_of_articles(list_of_words, list_of_articles):
    """Group the articles by the search words that occur in their full text.

    NOTE: each article object is expected to have the same layout as the raw
    articles stored on MongoDB, i.e. the full text is a list of strings
    reachable through article['details']['fulltext'].

    A search word that shows up more than once in `list_of_words` is only
    processed the first time; every repeat prints a warning naming the word
    and is skipped, after which processing carries on with the next word.

    A progress line with the number of matching articles is printed for
    every processed word.

    input: list of search words, list of article objects
    output: dictionary mapping each search word to the list of articles
            whose full text contains it
    """
    matches_by_word = {}

    for word in list_of_words:

        # a word that already has an entry has been searched for before
        if word in matches_by_word:
            print("Warning: found a duplicate!", word)
            continue

        # keep every article whose full text mentions the word
        matching_articles = [
            candidate
            for candidate in list_of_articles
            if is_word_in_string_list(candidate['details']['fulltext'], word)
        ]
        matches_by_word[word] = matching_articles

        print("Found", len(matching_articles), "articles for", word)

    return matches_by_word

Get the list of program streams from the *Programs* sheet of the Excel workbook `Departments and Programs.xlsx`

In [49]:
# load the 'Programs' column of the Programs sheet and keep the names in sorted order
programs_list = np.sort(
    pd.read_excel(
        "./excel_sheets/Departments and Programs.xlsx",
        sheet_name='Programs',
    )['Programs'].to_numpy()
)

In [50]:
# programs_list is a one-dimensional array, so its first axis holds the program count
print("Number of programs:", programs_list.shape[0])

Number of programs: 144


Import the cleaned articles from the cleaned_articles.json file in the "data" directory, and convert the JSON objects into BSON objects for possible integration with MongoDB

In [51]:
# import function needed to convert JSON into BSON
from bson.json_util import loads

# Read the exported articles. The encoding is stated explicitly because open()
# otherwise falls back to the platform's locale encoding, which can mis-decode
# non-ASCII characters in the article text on systems whose default is not UTF-8.
with open('./data/cleaned_articles.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# every entry is handed to loads() below, so each one is expected to be a JSON string
articles_json = data['cleaned_articles']

# convert each article from its JSON string into a BSON-aware Python object
articles = [loads(article_json) for article_json in articles_json]

Search for the program names in the list of articles, and store the results in a dictionary

In [52]:
# map every program name to the articles whose full text mentions it
results = create_dictionary_of_articles(
    list_of_words=programs_list,
    list_of_articles=articles,
)

Found 89 articles for Accelerated Growth Service
Found 16 articles for Advanced Manufacturing Fund
Found 2 articles for Aerospace Program
Found 0 articles for Agri-Science Clusters
Found 6 articles for AgriInnovate Program
Found 27 articles for AgriInnovation Program
Found 39 articles for AgriMarketing Program
Found 0 articles for AgriProcessing Initiative
Found 34 articles for AgriScience Program
Found 3 articles for Agricultural Clean Technology Program
Found 14 articles for Agricultural Greenhouse Gases Program
Found 0 articles for Agricultural Innovation Program
Found 0 articles for Applied Research and Development Grants
Found 2 articles for Aquaculture Collaborative Research and Development Program
Found 0 articles for Aquatic and Crop Resource Development
Found 37 articles for Atlantic Fisheries Fund
Found 41 articles for Atlantic Innovation Fund
Found 26 articles for Automotive Innovation Fund
Found 0 articles for Automotive Supplier Innovation Fund
Found 3 articles for Automot

Exporting the results to a JSON file called *cleaned_articles_dictionary.json* in the **data** directory

In [43]:
# dumps() turns a BSON-aware article object back into a JSON string
from bson.json_util import dumps

# same program -> articles mapping as `results`, but with every article serialised to a JSON string
results_json = {
    program: [dumps(article) for article in results[program]]
    for program in results
}

# wrap the mapping under a single top-level key for the output file
data = {'cleaned_articles_dictionary': results_json}

# opening in 'w' mode replaces any previous contents of the file
with open('./data/cleaned_articles_dictionary.json', 'w') as outfile:
    json.dump(data, outfile)