# Assign boolean category labels to news articles

### Datascapes Hack, June 2020
Hack documentation can be found [here](https://paper.dropbox.com/doc/HACK-Q1-2020--A2FzQJwlu4mWkTIUmB7gSH0RAg-zuTZhovLYbSAFzktgW3SN).

**Boolean tag categories:**
- "isBLM"
- "isBrexit"
- "isCovid"
- "isEducation"
- "isImmigration"
- "isEconomy"
- "isProtest"
- "isRacial"
- "isLawAndOrder"

**Methods of asserting category membership:**
1. Search existing article tags for category keywords
2. Search article headline / summary / text for category keywords
3. Use [Mango](http://api.mango-en.virt.ch.bbc.co.uk/) or [Starfruit](http://starfruit.virt.ch.bbc.co.uk/) to auto-generate tags, and search these for category keywords

**Note:** you can find available BBC content tags [here](https://www.bbc.co.uk/things/search?q=immigration)

In [2]:
# load content data
import json
import os


CONTENT_ROOT = '/Users/fitzma02/Documents/work/data/all_content'


# copied from garden_shed.data_utils.data_io
def get_filepaths_in_directory_and_subdirs(root, extension_filter=''):
    fpaths = []
    for dirpath, dirnames, filenames in os.walk(root):
        for fname in filenames:
            if not extension_filter or fname.lower().endswith(extension_filter.lower()):
                fpaths.append(os.path.join(dirpath, fname))
    return fpaths


def load_json_data_from_root(root, extension_filter=''):
    file_paths = get_filepaths_in_directory_and_subdirs(root, extension_filter)
    data = {}
    for fpath in file_paths:
        with open(fpath) as fin:
            data[fpath] = json.load(fin)
    return data


content = load_json_data_from_root(CONTENT_ROOT, extension_filter='')
print(len(content))

57819


In [34]:
import random

# pick random test article from content data
content_list = list(content.items())
test_article = random.choice(content_list)[1]
test_article['metadata']['tags']

{'about': [{'thingLabel': 'Republic of Ireland',
   'thingUri': 'http://www.bbc.co.uk/things/cd91afdb-d0b1-4736-b1d0-64bf674a8a74#id',
   'thingId': 'cd91afdb-d0b1-4736-b1d0-64bf674a8a74',
   'thingType': ['Thing', 'Place', 'geoname:GeoTagConcept'],
   'thingSameAs': ['http://sws.geonames.org/2963597/'],
   'topicName': 'Republic of Ireland',
   'topicId': 'c207p54mdq7t',
   'curationList': [{'curationId': 'cd91afdb-d0b1-4736-b1d0-64bf674a8a74',
     'curationType': 'vivo-stream'}]}]}

In [4]:
# associate boolean categories with content tags

category_tags = {
    'isBLM': {'should': ['Black Lives Matter', 'BLM'], 'should_not': []},
    'isBrexit': {'should':['Brexit', 'Operation Yellowhammer'], 'should_not': []},
    'isCovid': {'should': ['Covid', 'Coronavirus', 'Self-isolation', 'Lockdown', 'Contact tracing', 
                           'Mers virus', 'Joint Biosecurity Centre (JBC)'],
                'should_not': []},
    'isEducation': {'should': ['Education'], 'should_not': []},
    'isImmigation': {'should': ['Immigration'], 'should_not': []},
    "isEconomy": {'should': ['Economy'], 'should_not': []},
    "isProtest": {'should': [], 'should_not': []},
    "isRacial": {'should': [], 'should_not': []},
    "isLawAndOrder": {'should': [], 'should_not': []},
}

In [22]:
# extract relevant text elements from article


def get_tags(article):
    """
    Extract tags from within article json.
    """
    all_tags = article.get('metadata', {}).get('tags', {}).get('about', [])
    tag_names = [t.get('thingLabel') for t in all_tags]
    return tag_names


def get_headline(article):
    """
    Extract headline from within article json.
    """
    return article.get('promo', {}).get('headlines', {}).get('headline', '')


def get_summary(article):
    """
    Extract summary from within article json.
    """
    return article.get('promo', {}).get('summary', '')


def get_uri(article):
    """
    Extract uri from within article json.
    """
    return article.get('metadata', {}).get('locators', {}).get('assetUri', '')

In [23]:
test_tags = get_tags(test_article)
test_headline = get_headline(test_article)
test_summary = get_summary(test_article)
test_uri = get_uri(test_article)

In [12]:
def check_text_for_keywords(text, category_tags):
    """
    Function checks if text includes category tags, excluding invalid tags.
    Args:
        text (list): list of text segments to be searched (these may be tags, headlines, or body text)
        category_tags (dict):
    Returns:
        True if text contains at least one valid category tag, and no invalid tags.
    """
    all_text = (' ').join([t.lower() for t in text])
    should = [t.lower() for t in category_tags.get('should', [])]
    should_not = [t.lower() for t in category_tags.get('should_not', [])]
    
    for s in should_not:
        if all_text.find(s) >= 0:
            return False
    for s in should:
        if all_text.find(s) >= 0:
            return True
    return False

In [20]:
all_text = test_tags + [test_headline, test_summary]
check_text_for_keywords(all_text, {'should': ['Entertainment', 'Scotland'], 'should_not': ['Brexit']})

True

In [32]:
#!pip install requests
import requests
import json


starfruit_api = 'http://starfruit.virt.ch.bbc.co.uk'
mango_api = 'http://api.mango-en.virt.ch.bbc.co.uk'


def get_results_from_autotagger(asset_uri, api=starfruit_api):
    """
    Query starfruit or mango api with article URI to return auto-generated content tags.
    """
    response = requests.get(f'{api}/topics?uri=https://www.bbc.co.uk{asset_uri}')
    body = response.content
    return json.loads(body.decode("utf-8"))


def parse_labels_from_starfruit_response(response):
    """
    Extract list of auto-generated labels from starfruit api response.
    """
    all_labels = response.get('results', [])
    return [l.get('label', {}).get('en-gb', '') for l in all_labels]


def parse_labels_from_mango_response(response):
    """
    Extract list of auto-generated labels from mango api response.
    """
    all_labels = response.get('results', [])
    print(all_labels[0]['label'])
    return [l.get('label', {}) for l in all_labels]

In [33]:
response = get_results_from_autotagger(test_uri, api=mango_api)
parse_labels_from_mango_response(response)

Arrested


['Arrested',
 'Stabbed',
 'Bessbrook',
 'murder',
 'Police',
 'Coleraine',
 'BBC News',
 'Hospital',
 'Robbery',
 'Gang',
 'Anniversary',
 'County Armagh',
 'PSNI',
 'Prosecution',
 'Crimestoppers']

In [35]:
def end_to_end_labelling(article, category_tags, use_tags=True, use_headline=True, use_summary=True, use_starfish=True, use_mango=True):
    text = []
    if use_tags:
        text += get_tags(article)
    if use_headline:
        text += [get_headline(article)]
    if use_summary:
        text += [get_summary(article)]
    if use_starfish or use_mango:
        uri = get_uri(article)
        if use_starfish:
            response = get_results_from_autotagger(uri, api='http://starfruit.virt.ch.bbc.co.uk')
            text += parse_labels_from_starfruit_response(response)
        if use_mango:
            response = get_results_from_autotagger(uri, api='http://api.mango-en.virt.ch.bbc.co.uk')
            text += parse_labels_from_mango_response(response)
    return check_text_for_keywords(text, category_tags)

In [36]:
end_to_end_labelling(test_article, category_tags['isBLM'])

NameError: name 'article' is not defined