# Assign boolean category labels to news articles

### Datascapes Hack, June 2020
Hack documentation can be found [here](https://paper.dropbox.com/doc/HACK-Q1-2020--A2FzQJwlu4mWkTIUmB7gSH0RAg-zuTZhovLYbSAFzktgW3SN).

**Boolean tag categories:**
- "isBLM"
- "isBrexit"
- "isCovid"
- "isEducation"
- "isImmigration"
- "isEconomy"
- "isProtest"
- "isRacial"
- "isLawAndOrder"

**Methods of asserting category membership:**
1. Search existing article tags for category keywords
2. Search article headline / summary / text for category keywords
3. Use [Mango](http://api.mango-en.virt.ch.bbc.co.uk/) or [Starfruit](http://starfruit.virt.ch.bbc.co.uk/) to auto-generate tags, and search these for category keywords

**Note:** you can find available BBC content tags [here](https://www.bbc.co.uk/things/search?q=immigration)

In [27]:
# load content data
import json
import os


# Root directory that is walked recursively to load the article JSON files.
# NOTE(review): absolute path inside one user's home directory - change it before running elsewhere.
CONTENT_ROOT = '/Users/fitzma02/Documents/work/data/all_content'


# copied from garden_shed.data_utils.data_io
def get_filepaths_in_directory_and_subdirs(root, extension_filter=''):
    """
    Recursively collect the paths of all files below a directory.

    Args:
        root (str): directory to walk.
        extension_filter (str): if non-empty, keep only files whose name ends with
            this suffix (compared case-insensitively); if empty, keep every file.
    Returns:
        list of file paths, each joined onto the directory the file was found in.
    """
    def _is_wanted(name):
        # an empty filter accepts everything; otherwise compare lower-cased suffixes
        return not extension_filter or name.lower().endswith(extension_filter.lower())

    matches = []
    for folder, _subdirs, names in os.walk(root):
        matches.extend(os.path.join(folder, name) for name in names if _is_wanted(name))
    return matches


def load_json_data_from_root(root, extension_filter=''):
    """
    Load every JSON file found under a directory tree.

    Args:
        root (str): directory to search recursively.
        extension_filter (str): optional filename suffix passed through to
            get_filepaths_in_directory_and_subdirs; '' means every file is loaded.
    Returns:
        dict mapping each file path to the parsed JSON content of that file.
    Raises:
        json.JSONDecodeError: if a matched file does not contain valid JSON.
    """
    data = {}
    for fpath in get_filepaths_in_directory_and_subdirs(root, extension_filter):
        # JSON is UTF-8 by specification; do not depend on the platform's default encoding
        with open(fpath, encoding='utf-8') as fin:
            data[fpath] = json.load(fin)
    return data


# load every file under CONTENT_ROOT (empty filter = no extension filtering), keyed by file path
content = load_json_data_from_root(CONTENT_ROOT, extension_filter='')
# number of files that were loaded
print(len(content))

57819


In [33]:
import random

# pick random test article from content data
content_list = list(content.items())  # (file path, parsed article) pairs
test_article = random.choice(content_list)[1]  # [1] keeps the article and drops its path
test_article  # bare expression: shown as the cell output

{'metadata': {'id': 'urn:bbc:ares::asset:news/business-52279871',
  'locators': {'assetUri': '/news/business-52279871',
   'cpsUrn': 'urn:bbc:content:assetUri:news/business-52279871',
   'curie': 'http://www.bbc.co.uk/asset/6b7db5d9-cb7d-214e-a5f2-1143fb1ba02a',
   'assetId': '52279871'},
  'type': 'STY',
  'createdBy': 'news',
  'language': 'en-gb',
  'lastUpdated': 1586887211244,
  'firstPublished': 1586863085000,
  'lastPublished': 1586884552000,
  'timestamp': 1586884552000,
  'options': {'isIgorSeoTagsEnabled': False,
   'includeComments': True,
   'allowRightHandSide': True,
   'isFactCheck': False,
   'allowDateStamp': True,
   'suitableForSyndication': True,
   'hasNewsTracker': False,
   'allowRelatedStoriesBox': True,
   'isKeyContent': False,
   'allowHeadline': True,
   'allowAdvertising': True,
   'isBreakingNews': False,
   'allowPrintingSharingLinks': True},
  'analyticsLabels': {'cps_asset_type': 'sty',
   'counterName': 'news.business.story.52279871.page',
   'cps_asse

In [35]:
# associate boolean categories with content tags

# Keyword specification for each boolean category:
#   'should'     - keywords of which at least one must be found for the category to apply
#   'should_not' - keywords that rule the category out when found
# Categories with an empty 'should' list have no keywords yet.
category_tags = {
    'isBLM': {'should': ['Black Lives Matter', 'BLM'], 'should_not': []},
    'isBrexit': {'should': ['Brexit', 'Operation Yellowhammer'], 'should_not': []},
    'isCovid': {
        'should': [
            'Covid', 'Coronavirus', 'Self-isolation', 'Lockdown', 'Contact tracing',
            'Mers virus', 'Joint Biosecurity Centre (JBC)',
        ],
        'should_not': [],
    },
    'isEducation': {'should': ['Education'], 'should_not': []},
    # key corrected from the misspelled 'isImmigation' to match the documented category name
    'isImmigration': {'should': ['Immigration'], 'should_not': []},
    'isEconomy': {'should': ['Economy'], 'should_not': []},
    'isProtest': {'should': [], 'should_not': []},
    'isRacial': {'should': [], 'should_not': []},
    'isLawAndOrder': {'should': [], 'should_not': []},
}

In [17]:
# extract relevant text elements from article


def get_tags(article):
    """
    Return the 'thingLabel' of every tag listed under metadata -> tags -> about.

    Missing levels of the article json yield an empty list; a tag without a
    'thingLabel' contributes None.
    """
    metadata = article.get('metadata', {})
    about_tags = metadata.get('tags', {}).get('about', [])
    labels = []
    for tag in about_tags:
        labels.append(tag.get('thingLabel'))
    return labels


def get_headline(article):
    """
    Return the headline stored under promo -> headlines -> headline.

    Falls back to an empty string when any level is missing.
    """
    promo = article.get('promo', {})
    headlines = promo.get('headlines', {})
    return headlines.get('headline', '')


def get_summary(article):
    """
    Return the summary stored under promo -> summary.

    Falls back to an empty string when the promo or its summary is missing.
    """
    promo = article.get('promo', {})
    return promo.get('summary', '')

In [18]:
# pull the tag labels, headline and summary out of the randomly chosen test article
test_tags = get_tags(test_article)
test_headline = get_headline(test_article)
test_summary = get_summary(test_article)

In [19]:
def check_text_for_keywords(text, category_tags):
    """
    Check whether text mentions a category's keywords and none of its excluded keywords.

    Matching is a case-insensitive substring search of each keyword in each text segment.

    Args:
        text (list or str): text segments to be searched (these may be tags, headlines,
            or body text). A single string is treated as one segment; None entries in a
            list are skipped; None or an empty value never matches.
        category_tags (dict): keyword spec with optional 'should' and 'should_not' lists.
    Returns:
        True if at least one 'should' keyword is found and no 'should_not' keyword is
        found; False otherwise (including when 'should' is empty).
    """
    if not text:
        return False
    if isinstance(text, str):
        # a bare string would otherwise be iterated character by character and never match
        text = [text]

    # tags without a label arrive as None and have nothing to search
    segments = [t.lower() for t in text if t is not None]
    should = [k.lower() for k in category_tags.get('should', [])]
    should_not = [k.lower() for k in category_tags.get('should_not', [])]

    def _any_keyword_found(keywords):
        return any(keyword in segment for keyword in keywords for segment in segments)

    # exclusions take precedence over inclusions
    if _any_keyword_found(should_not):
        return False
    return _any_keyword_found(should)

In [20]:
# sanity check: search the test article's tags for one hand-picked label, with no exclusions
check_text_for_keywords(test_tags, {'should': ['HM Revenue & Customs'], 'should_not': []})

True

In [21]:
#!pip install requests
import requests
import json


# base URLs of the two auto-tagging services that can be queried for topics
starfruit_api = 'http://starfruit.virt.ch.bbc.co.uk'
mango_api = 'http://api.mango-en.virt.ch.bbc.co.uk'


def get_results_from_autotagger(asset_uri, api=starfruit_api, timeout=10):
    """
    Query starfruit or mango api with article URI to return auto-generated content tags.

    Args:
        asset_uri (str): article path that is appended to https://www.bbc.co.uk,
            e.g. '/news/uk-england-merseyside-48888207'.
        api (str): base URL of the autotagger service to query.
        timeout (float): seconds to wait for the service before giving up.
    Returns:
        The JSON body of the response, decoded as UTF-8 and parsed.
    Raises:
        requests.exceptions.Timeout: if the service does not answer within `timeout`.
        requests.exceptions.HTTPError: if the service answers with an error status.
    """
    # without a timeout requests waits indefinitely on an unresponsive host
    response = requests.get(
        f'{api}/topics?uri=https://www.bbc.co.uk{asset_uri}',
        timeout=timeout,
    )
    # fail here on 4xx/5xx instead of trying to parse an error page as tag results
    response.raise_for_status()
    return json.loads(response.content.decode("utf-8"))


def parse_labels_from_autotagger_response(response):
    """
    Extract list of auto-generated labels from starfruit / mango api response.

    Args:
        response (dict): parsed api response; labels are read from
            response['results'][i]['label']['en-gb'].
    Returns:
        list of 'en-gb' label strings, one per result; a result without such a label
        contributes ''. A response with missing or null 'results' gives an empty list.
    """
    # 'results' may be absent or null; treat both as "no labels" rather than crashing
    results = response.get('results') or []
    return [(result.get('label') or {}).get('en-gb', '') for result in results]

In [22]:
# query the default autotagger (starfruit) for one example article and list its labels
response = get_results_from_autotagger('/news/uk-england-merseyside-48888207')
parse_labels_from_autotagger_response(response)

['HM Revenue & Customs', 'Merseyside', 'Liverpool']