# Assign boolean category labels to news articles

### Datascapes Hack, June 2020
Hack documentation can be found [here](https://paper.dropbox.com/doc/HACK-Q1-2020--A2FzQJwlu4mWkTIUmB7gSH0RAg-zuTZhovLYbSAFzktgW3SN).

**Boolean tag categories:**
- "isBLM"
- "isBrexit"
- "isCovid"
- "isEducation"
- "isImmigration"
- "isEconomy"
- "isProtest"
- "isRacial"
- "isLawAndOrder"

**Methods of asserting category membership:**
1. Search existing article tags for category keywords
2. Search article headline / summary / text for category keywords
3. Use [Mango](http://api.mango-en.virt.ch.bbc.co.uk/) or [Starfruit](http://starfruit.virt.ch.bbc.co.uk/) to auto-generate tags, and search these for category keywords
4. Use NLI to infer category membership (useful for more vague categories where keyword searches don't suffice)

**Note:** you can find available BBC content tags [here](https://www.bbc.co.uk/things/search?q=immigration)

### Category labelling using keyword searches

Membership of categories with distinctive associated keywords can be determined by searching for these keywords within the article headline, summary, body, or tags. The dictionary below outlines the categories for which this approach might be possible, and their associated keywords. We may want to exclude certain articles if a keyword appears in conjunction with another term (e.g. an article containing the word "sand" might be excluded from the category "beach" if it also contains the word "sandpaper"). For this reason, each category is associated with a list of "should_not" words as well as the "should" keywords.

In [1]:
# associate boolean categories with content tags
#
# Each category maps to two keyword lists:
#   'should'     - keywords that indicate membership of the category
#   'should_not' - keywords that exclude an article from the category (all empty at present)

category_tags = {
    'isBLM': {'should': ['Black Lives Matter', 'BLM'], 
              'should_not': []},
    'isBrexit': {'should':['Brexit', 'Operation Yellowhammer'], 
                 'should_not': []},
    'isCovid': {'should': ['Covid', 'Coronavirus', 'Self-isolation', 'Lockdown', 'Contact tracing', 'Mers virus', 'Joint Biosecurity Centre (JBC)'],
                'should_not': []},
    'isEducation': {'should': ['Education'], 
                    'should_not': []},
    'isImmigration': {'should': ['Immigration'], 
                      'should_not': []},
    "isEconomy": {'should': ['Economy'], 
                  'should_not': []},
}

In [63]:
# load content data
import json
import os
import numpy as np


# root directory of the article JSON files
# NOTE(review): absolute, user-specific path - adjust before running on another machine
CONTENT_ROOT = '/Users/fitzma02/Documents/work/data/all_content'


# copied from garden_shed.data_utils.data_io
def get_filepaths_in_directory_and_subdirs(root, extension_filter=''):
    """
    Recursively collect the paths of all files below a directory.
    Args:
        root (str): directory to walk
        extension_filter (str): if non-empty, keep only files whose name ends with this suffix (case-insensitive)
    Returns:
        list of file paths, in the order os.walk yields them
    """
    # lower-case the suffix once; an empty/None filter means "keep everything"
    suffix = extension_filter.lower() if extension_filter else ''
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root)
        for name in names
        if not suffix or name.lower().endswith(suffix)
    ]


def load_json_data_from_root(root, extension_filter='', limit=None):
    """
    Load the JSON files found below a directory into a dictionary keyed by file path.
    Args:
        root (str): directory searched recursively for files
        extension_filter (str): if non-empty, only files whose name ends with this suffix are loaded
        limit (int): optional cap on the number of files loaded; a falsy value (None or 0) loads every file
    Returns:
        dict mapping file path -> parsed JSON content
    Raises:
        json.JSONDecodeError: if a selected file does not contain valid JSON
    """
    file_paths = get_filepaths_in_directory_and_subdirs(root, extension_filter)
    if limit:
        # slicing already clamps to the length of the list
        file_paths = file_paths[:limit]
    data = {}
    for fpath in file_paths:
        # decode as UTF-8 explicitly instead of relying on the platform default encoding
        with open(fpath, encoding='utf-8') as fin:
            data[fpath] = json.load(fin)
    return data


# load at most 100 content files (no extension filter) and report how many were read
content = load_json_data_from_root(CONTENT_ROOT, extension_filter='', limit=100)
print(len(content))

100


In [13]:
import random

# pick random test article from content data
# content maps file path -> article json, so element [1] of the chosen item is the article itself
content_list = list(content.items())
test_article = random.choice(content_list)[1]
# display the article metadata as the cell output
test_article['metadata']

{'id': 'urn:bbc:ares::asset:news/uk-scotland-north-east-orkney-shetland-49125777',
 'locators': {'assetUri': '/news/uk-scotland-north-east-orkney-shetland-49125777',
  'cpsUrn': 'urn:bbc:content:assetUri:news/uk-scotland-north-east-orkney-shetland-49125777',
  'curie': 'http://www.bbc.co.uk/asset/a20a180d-8f69-6e4a-8d49-1a3c29c46955'},
 'type': 'STY',
 'createdBy': 'news',
 'language': 'en-gb',
 'lastUpdated': 1564134858330,
 'firstPublished': 1564134694,
 'lastPublished': 1564134849,
 'options': {'isIgorSeoTagsEnabled': False,
  'includeComments': False,
  'allowRightHandSide': True,
  'isFactCheck': False,
  'allowDateStamp': True,
  'suitableForSyndication': True,
  'hasNewsTracker': False,
  'allowRelatedStoriesBox': True,
  'isKeyContent': False,
  'allowHeadline': True,
  'allowAdvertising': True,
  'isBreakingNews': False,
  'allowPrintingSharingLinks': True},
 'analyticsLabels': {'cps_asset_type': 'sty',
  'counterName': 'news.scotland.north_east_orkney_and_shetland.story.49125

In [3]:
# extract relevant text elements from article


def get_tags(article):
    """
    Extract tags from within article json.
    Args:
        article (dict): article json; tags are read from metadata -> tags -> about
    Returns:
        list of the 'thingLabel' values of the article's "about" tags. Tags without a
        'thingLabel' are skipped so that the result can safely be used as search text.
    """
    about_tags = article.get('metadata', {}).get('tags', {}).get('about', [])
    labels = (tag.get('thingLabel') for tag in about_tags)
    # a missing label would otherwise surface as None and break later string handling
    return [label for label in labels if label is not None]


def get_headline(article):
    """
    Return the promo headline of an article json, or '' when it is absent.
    """
    promo = article.get('promo', {})
    headlines = promo.get('headlines', {})
    return headlines.get('headline', '')


def get_summary(article):
    """
    Return the promo summary of an article json, or '' when it is absent.
    """
    promo = article.get('promo', {})
    return promo.get('summary', '')


def get_body(article):
    """
    Extract body text from within article json.
    Args:
        article (dict): article json; text is read from content -> blocks
    Returns:
        The 'text' of every content block concatenated in order, with no separator
        between blocks. Blocks without a 'text' entry contribute nothing.
    """
    blocks = article.get('content', {}).get('blocks', [])
    # join once instead of growing a string block by block
    return ''.join(block.get('text', '') for block in blocks)


def get_uri(article):
    """
    Return the asset URI stored in the article's metadata locators, or '' when it is absent.
    """
    metadata = article.get('metadata', {})
    locators = metadata.get('locators', {})
    return locators.get('assetUri', '')

In [23]:
# pull the tags, headline, summary and asset URI out of the chosen test article
test_tags = get_tags(test_article)
test_headline = get_headline(test_article)
test_summary = get_summary(test_article)
test_uri = get_uri(test_article)

In [4]:
def check_text_for_keywords(text, category_tags):
    """
    Function checks if text includes category tags, excluding invalid tags.
    Matching is a case-insensitive substring search over all segments joined by spaces.
    Args:
        text (list): list of text segments to be searched (these may be tags, headlines, or body text);
            segments that are not strings (e.g. a missing label) are ignored
        category_tags (dict): dictionary containing a list of "should" tags, and optional "should_not" tags
    Returns:
        True if text contains at least one valid category tag, and no invalid tags.
    """
    # skip non-string segments rather than failing on .lower()
    all_text = ' '.join(segment.lower() for segment in text if isinstance(segment, str))
    should = [t.lower() for t in category_tags.get('should', [])]
    should_not = [t.lower() for t in category_tags.get('should_not', [])]

    # an excluding keyword always wins over a matching keyword
    if any(keyword in all_text for keyword in should_not):
        return False
    return any(keyword in all_text for keyword in should)

In [20]:
# ad-hoc check on the test article's tags, headline and summary:
# match on 'Entertainment' or 'Scotland', unless 'Brexit' also appears
all_text = test_tags + [test_headline, test_summary]
check_text_for_keywords(all_text, {'should': ['Entertainment', 'Scotland'], 'should_not': ['Brexit']})

True

In [5]:
#!pip install requests
import requests
import json


# base URLs of the two auto-tagging services queried by get_results_from_autotagger
starfruit_api = 'http://starfruit.virt.ch.bbc.co.uk'
mango_api = 'http://api.mango-en.virt.ch.bbc.co.uk'


def get_results_from_autotagger(asset_uri, api=starfruit_api, timeout=30):
    """
    Query starfruit or mango api with article URI to return auto-generated content tags.
    Args:
        asset_uri (str): article asset URI (path part), appended to https://www.bbc.co.uk
        api (str): base URL of the auto-tagger to query
        timeout (float): seconds to wait for the service before giving up; None waits indefinitely
    Returns:
        The JSON response body, parsed into Python objects.
    Raises:
        requests.exceptions.RequestException: if the request fails or times out
        json.JSONDecodeError: if the response body is not valid JSON
    """
    # requests applies no timeout by default, so an unresponsive service would block forever
    response = requests.get(f'{api}/topics?uri=https://www.bbc.co.uk{asset_uri}', timeout=timeout)
    body = response.content
    return json.loads(body.decode("utf-8"))


def parse_labels_from_starfruit_response(response):
    """
    Collect the 'en-gb' label of every result in a starfruit api response.
    A result without a label, or without an 'en-gb' entry, contributes ''.
    """
    labels = []
    for result in response.get('results', []):
        localised = result.get('label', {})
        labels.append(localised.get('en-gb', ''))
    return labels


def parse_labels_from_mango_response(response):
    """
    Extract list of auto-generated labels from mango api response.
    Args:
        response (dict): parsed mango api response; labels are read from each entry of 'results'
    Returns:
        list with the 'label' of every result; a result without a label contributes ''
        (the same placeholder the starfruit parser uses), so every element can be
        treated as text by the keyword search.
    """
    return [result.get('label', '') for result in response.get('results', [])]

In [33]:
# query the Mango auto-tagger for the test article and list the labels it returns
response = get_results_from_autotagger(test_uri, api=mango_api)
parse_labels_from_mango_response(response)

Arrested


['Arrested',
 'Stabbed',
 'Bessbrook',
 'murder',
 'Police',
 'Coleraine',
 'BBC News',
 'Hospital',
 'Robbery',
 'Gang',
 'Anniversary',
 'County Armagh',
 'PSNI',
 'Prosecution',
 'Crimestoppers']

In [6]:
def end_to_end_labelling(article, category_tags, use_tags=True, use_headline=True, use_summary=True, use_body=True, use_starfish=True, use_mango=True):
    """
    Orchestration of boolean labelling for a single article and category.
    args:
        article (dict): article object to be labelled
        category_tags (dict): dictionary containing a list of "should" tags, and optional "should_not" tags
        use_tags (bool): search the article's existing tags
        use_headline (bool): search the article headline
        use_summary (bool): search the article summary
        use_body (bool): search the article body text
        use_starfish (bool): search labels generated by the Starfruit auto-tagger (one network request)
        use_mango (bool): search labels generated by the Mango auto-tagger (one network request)
    Returns:
        True if text returned from various optional locations (tags, headline, summary, body, auto-taggers) contains at least one valid category tag, 
        and no invalid tags.
    """
    text = []
    if use_tags:
        text.extend(get_tags(article))
    if use_headline:
        text.append(get_headline(article))
    if use_summary:
        text.append(get_summary(article))
    if use_body:
        text.append(get_body(article))
    if use_starfish or use_mango:
        # both auto-taggers are queried with the same asset URI
        uri = get_uri(article)
        if use_starfish:
            # use the shared endpoint constants rather than repeating the URLs here
            response = get_results_from_autotagger(uri, api=starfruit_api)
            text.extend(parse_labels_from_starfruit_response(response))
        if use_mango:
            response = get_results_from_autotagger(uri, api=mango_api)
            text.extend(parse_labels_from_mango_response(response))
    return check_text_for_keywords(text, category_tags)

In [53]:
# keyword-label the test article for the immigration category using every text source
# (all use_* flags default to True, so both auto-taggers are queried)
end_to_end_labelling(test_article, category_tags['isImmigration'])

True

### Category labelling for vague categories

For the "fuzzier" categories such as "isRacial", "isProtest", and "isLawAndOrder", keyword searches won't suffice for determining the label. Instead, we will attempt to use NLI to determine membership of these classes.

In [7]:
# Try out RobertaForQuestionAnswering

#!pip install transformers
#!pip install transformers[tf-cpu]
#!pip install transformers[torch]
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering

In [64]:
# only the first 500 characters (not tokens) of the body are passed to the model
max_text_length = 500
question = "Is race discussed?"
text = get_body(test_article)[:max_text_length]

# NOTE(review): 'roberta-base' is the generic pretrained checkpoint; presumably its
# question-answering head has not been fine-tuned, so the selected span may be
# arbitrary - confirm, and consider a checkpoint fine-tuned for question answering
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
# answer = tokens from the highest-scoring start position to the highest-scoring end position (inclusive);
# the raw tokens are joined with spaces, so tokenizer markers such as 'Ġ' remain in the string
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])

answer

"Ġto Ġdescribe Ġhim . ĠReally . ĠHe 's Ġthe Ġbest Ġperson ĠI Ġever Ġknew . & qu ot ; Tom ĠSinn ott Ġis Ġplanning Ġhis Ġyounger Ġbrother 's Ġfuneral . The Ġlast Ġthing Ġhe Ġexpected Ġwas Ġjust Ġabout Ġevery Ġprofessional Ġteam Ġin ĠEngland Ġsending Ġhim Ġa Ġshirt Ġwith ĠJordan 's Ġname Ġon Ġthe Ġback Ġof Ġit . But , Ġas Ġhe 's Ġtold ĠRadio Ġ1 ĠNews beat , Ġit 's Ġpart Ġof Ġa Ġtribute Ġto Ġthe Ġ25 - year - old Ġfootballer . Jordan Ġ- Ġa Ġnon - league Ġplayer Ġfor ĠMat lock ĠTown Ġ- Ġdied Ġon ĠSaturday Ġafter Ġbeing Ġattacked Ġduring Ġa Ġnight Ġout . ĠThree Ġ21 - year -"

In [8]:
# Try out HuggingFace facebook/bart-large-mnli

import os
import sys
# add ../src (resolved to an absolute path) to the import path, once, so that
# packages located there can be imported below
module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)

from nlp.text_entailment import *

Probability that the label is true: 0.66%


In [9]:
# NOTE(review): model_name, load_bart_model_tokenizer and get_premise_hypothesis_entailment
# are not defined in the cells shown here; presumably they come from the star import
# of nlp.text_entailment - confirm
model, tokenizer = load_bart_model_tokenizer(model_name)
premise = "Oriel College's governors vote to take down the statue of the Victorian colonialist Cecil Rhodes."
hypothesis = 'References the diamond trade'

# check the hypothesis against the premise with the loaded model
get_premise_hypothesis_entailment(premise, hypothesis, tokenizer, model)

Probability that the label is true: 0.66%


In [10]:
# modify text entailment function to return raw probability

def get_premise_hypothesis_entailment_probability(premise, hypothesis, tokenizer, model):
    """
    Return the probability that the premise entails the hypothesis.
    Args:
        premise (str): text the hypothesis is tested against
        hypothesis (str): statement whose truth is being inferred
        tokenizer: object whose encode(premise, hypothesis, return_tensors='pt') returns the model input
        model: callable returning a sequence whose first element is a logits tensor with one row and
            at least three columns
    Returns:
        float probability of the label being true, computed over columns 0 and 2 of the logits only.
    """
    import torch  # local import keeps this function self-contained within the notebook

    input_ids = tokenizer.encode(premise, hypothesis, return_tensors='pt')
    # inference only: no gradients are needed, so skip autograd bookkeeping
    with torch.no_grad():
        logits = model(input_ids)[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    # NOTE(review): assumes index 0 is "contradiction" for the model in use - confirm
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    return probs[:, 1].item()

In [34]:
# infer whether article belongs to category using NLI

def get_label_from_entailment(article_text, category_tags, tokenizer, model, threshold=0.5, max_len=1024):
    '''
    Function applies a binary label to an article about whether it discusses the category, defined by a list of category keywords.
    The keywords are turned into the hypothesis "discusses <k1> or <k2> or ..." which is tested against the article text.
    Args: 
        article_text (str): article text used as the premise
        category_tags (list): non-empty list of category keywords
        tokenizer (BartTokenizer): tokenizer passed through to the entailment function
        model (BartForSequenceClassification): model passed through to the entailment function
        threshold (float): minimum entailment probability for the label to be True
        max_len (int): number of leading characters of article_text that are used
    Returns: 
        Boolean label indicating whether article belongs to category
    Raises:
        ValueError: if category_tags is empty
    '''
    if not category_tags:
        raise ValueError('category_tags must contain at least one keyword')
    hypothesis = 'discusses ' + ' or '.join(str(tag) for tag in category_tags)
    # slicing already clamps to the length of the text
    premise = article_text[:max_len]
    probability = get_premise_hypothesis_entailment_probability(premise, hypothesis, tokenizer, model)
    return probability >= threshold

In [14]:
# entailment-based label for the test article's body, using race-related keywords
test_text = get_body(test_article)
get_label_from_entailment(test_text, ['race', 'racism', 'BLM'], tokenizer, model)

False

In [None]:
# tuning and evaluation

#!pip install sklearn
from sklearn.metrics import confusion_matrix
import pandas as pd


def end_to_end_labelling_entailment(content_item, category_tags, tokenizer, model, threshold):
    """
    Entailment labelling for a raw article: extracts the body text with get_body and
    passes it to get_label_from_entailment with the given threshold.
    """
    return get_label_from_entailment(
        get_body(content_item),
        category_tags,
        tokenizer,
        model,
        threshold=threshold,
    )


def calculate_labels(content, category_tags, prediction_function, **kwargs):
    """
    Predict a label for every content item.
    Args:
        content (dict): content items; only the values are used, in dictionary order
        category_tags: category definition passed as second argument to prediction_function
        prediction_function (callable): called as prediction_function(item, category_tags, **kwargs)
        **kwargs: extra keyword arguments forwarded to prediction_function
    Returns:
        list of predictions, one per content item
    """
    return [
        prediction_function(item, category_tags, **kwargs)
        for item in content.values()
    ]


def iterate_over_threshold_vals(content, category_tags, tokenizer, model, threshold_vec):
    """
    Iterate over entailment threshold values and return predictions and metrics.
    The keyword-based labels from end_to_end_labelling serve as the reference against
    which the entailment predictions are scored.
    Args:
        content (dict): content items to be labelled
        category_tags (dict): dictionary containing a list of "should" tags, and optional "should_not" tags;
            the "should" tags are also used as the entailment keywords
        tokenizer: tokenizer passed through to the entailment labelling
        model: model passed through to the entailment labelling
        threshold_vec (iterable): entailment thresholds to evaluate
    Returns:
        tuple (y, all_y_pred, metrics):
            y (list): reference labels, one per content item
            all_y_pred (dict): threshold -> list of entailment predictions
            metrics (pd.DataFrame): one row per threshold with columns
                TN, FP, FN, TP, TPR, FPR, threshold (TPR / FPR are NaN when undefined)
    """
    rows = []
    all_y_pred = {}
    y = calculate_labels(content, category_tags, end_to_end_labelling)
    for t in threshold_vec:
        y_pred = calculate_labels(content, category_tags['should'], end_to_end_labelling_entailment, tokenizer=tokenizer, model=model, threshold=t)
        all_y_pred[t] = y_pred
        # fix the label set so the matrix is always 2x2, even when only one class occurs
        tn, fp, fn, tp = (int(v) for v in confusion_matrix(y, y_pred, labels=[False, True]).ravel())
        results = {'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp}
        # the rates are undefined when there are no positives / negatives in the reference labels
        results['TPR'] = tp / (tp + fn) if (tp + fn) else float('nan')
        results['FPR'] = fp / (tn + fp) if (tn + fp) else float('nan')
        results['threshold'] = t
        rows.append(results)
    # build the frame once; DataFrame.append is not available in current pandas
    metrics = pd.DataFrame(rows, columns=['TN', 'FP', 'FN', 'TP', 'TPR', 'FPR', 'threshold'])
    return y, all_y_pred, metrics

In [None]:
# sweep three entailment thresholds for the Covid category;
# y holds the keyword-based labels, all_y_pred the entailment predictions per threshold
threshold_vals = [0.3, 0.6, 0.9]
y, all_y_pred, metrics = iterate_over_threshold_vals(content, category_tags['isCovid'], tokenizer, model, threshold_vals)

In [None]:
import matplotlib.pyplot as plt


# plot true-positive rate against false-positive rate, one point per evaluated threshold
plt.plot(metrics.FPR, metrics.TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')

In [None]:
# NOTE(review): this cell repeats the preceding threshold sweep verbatim; running it
# recomputes every label for the same thresholds
threshold_vals = [0.3, 0.6, 0.9]
y, all_y_pred, metrics = iterate_over_threshold_vals(content, category_tags['isCovid'], tokenizer, model, threshold_vals)

In [None]:
import matplotlib.pyplot as plt


# NOTE(review): this cell repeats the preceding plotting cell verbatim
# plot true-positive rate against false-positive rate, one point per evaluated threshold
plt.plot(metrics.FPR, metrics.TPR)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')