# Assign boolean category labels to news articles

### Datascapes Hack, June 2020
Hack documentation can be found [here](https://paper.dropbox.com/doc/HACK-Q1-2020--A2FzQJwlu4mWkTIUmB7gSH0RAg-zuTZhovLYbSAFzktgW3SN).

**Boolean tag categories:**
- "isBLM"
- "isBrexit"
- "isCovid"
- "isEducation"
- "isImmigration"
- "isEconomy"
- "isProtest"
- "isRacial"
- "isLawAndOrder"

**Methods of asserting category membership:**
1. Search existing article tags for category keywords
2. Search article headline / summary / text for category keywords
3. Use [Mango](http://api.mango-en.virt.ch.bbc.co.uk/) or [Starfruit](http://starfruit.virt.ch.bbc.co.uk/) to auto-generate tags, and search these for category keywords

**Note:** you can find available BBC content tags [here](https://www.bbc.co.uk/things/search?q=immigration)

In [2]:
# load content data
import json
import os


#CONTENT_ROOT = '/Users/fitzma02/Documents/work/data/all_content'



# copied from garden_shed.data_utils.data_io
def get_filepaths_in_directory_and_subdirs(root, extension_filter=''):
    """
    Recursively collect the paths of all files below a directory.
    Args:
        root (str): directory to walk (sub-directories included)
        extension_filter (str): if non-empty, only files whose name ends with this
            suffix (compared case-insensitively) are kept
    Returns:
        list of file paths, each built by joining the containing directory and the file name.
    """
    def _wanted(filename):
        # an empty filter accepts every file
        if not extension_filter:
            return True
        return filename.lower().endswith(extension_filter.lower())

    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(root)
        for filename in filenames
        if _wanted(filename)
    ]


def load_json_data_from_root(root, extension_filter=''):
    """
    Parse every matching file below a directory as JSON.
    Args:
        root (str): directory to walk (sub-directories included)
        extension_filter (str): if non-empty, only files whose name ends with this
            suffix are loaded; with the default every file found is loaded
    Returns:
        dict mapping each file path to its parsed JSON content.
    Raises:
        ValueError (json.JSONDecodeError) if a selected file is not valid JSON;
        no file is skipped silently.
    """
    file_paths = get_filepaths_in_directory_and_subdirs(root, extension_filter)
    data = {}
    for fpath in file_paths:
        # read as UTF-8 explicitly rather than relying on the platform's
        # default text encoding, which differs between machines
        with open(fpath, encoding='utf-8') as fin:
            data[fpath] = json.load(fin)
    return data


# NOTE(review): CONTENT_ROOT is only assigned in a commented-out line earlier in this cell;
# it must be defined before this runs, otherwise the call below raises NameError.
# An empty extension_filter means every file found under CONTENT_ROOT is loaded.
content = load_json_data_from_root(CONTENT_ROOT, extension_filter='')
print(len(content))

57819


In [55]:
import random

# pick random test article from content data
# (content maps file path -> parsed JSON, so [1] selects the article itself)
content_list = list(content.items())
test_article = random.choice(content_list)[1]
# display the tags attached to the chosen article
test_article['metadata']['tags']

{'about': [{'thingLabel': 'Matlock',
   'thingUri': 'http://www.bbc.co.uk/things/b8415938-21af-4977-bcec-a6e460a751bc#id',
   'thingId': 'b8415938-21af-4977-bcec-a6e460a751bc',
   'thingType': ['Thing', 'Place', 'geoname:GeoTagConcept'],
   'thingSameAs': ['http://sws.geonames.org/2642910/']},
  {'thingLabel': 'Jordan Sinnott death',
   'thingUri': 'http://www.bbc.co.uk/things/cb83cb53-4bd3-4b8e-ae9d-e1089cf505fa#id',
   'thingId': 'cb83cb53-4bd3-4b8e-ae9d-e1089cf505fa',
   'thingType': ['Thing', 'Event'],
   'thingSameAs': ['http://dbpedia.org/resource/Jordan_Sinnott',
    'http://www.wikidata.org/entity/Q6277016']},
  {'thingLabel': 'Derbyshire',
   'thingUri': 'http://www.bbc.co.uk/things/f1cf068a-f25a-4c04-ab12-99e16f6c61b8#id',
   'thingId': 'f1cf068a-f25a-4c04-ab12-99e16f6c61b8',
   'thingType': ['core:CeremonialCounty', 'Thing', 'Place'],
   'thingSameAs': ['http://dbpedia.org/resource/Derbyshire']}]}

In [52]:
# associate boolean categories with content tags
#
# Each category maps to two keyword lists, consumed by check_text_for_keywords:
#   'should'     - the article belongs to the category if any of these keywords is found
#   'should_not' - the article is rejected if any of these keywords is found
# Matching is case-insensitive substring search.

category_tags = {
    'isBLM': {'should': ['Black Lives Matter', 'BLM'], 
              'should_not': []},
    'isBrexit': {'should':['Brexit', 'Operation Yellowhammer'], 
                 'should_not': []},
    'isCovid': {'should': ['Covid', 'Coronavirus', 'Self-isolation', 'Lockdown', 'Contact tracing', 'Mers virus', 'Joint Biosecurity Centre (JBC)'],
                'should_not': []},
    'isEducation': {'should': ['Education'], 
                    'should_not': []},
    'isImmigration': {'should': ['Immigration'], 
                      'should_not': []},
    "isEconomy": {'should': ['Economy'], 
                  'should_not': []},
    # NOTE: the categories below have no 'should' keywords yet, so
    # check_text_for_keywords always returns False for them.
    "isProtest": {'should': [], 
                  'should_not': []},
    "isRacial": {'should': [], 
                 'should_not': []},
    "isLawAndOrder": {'should': [], 
                      'should_not': []},
}

In [22]:
# extract relevant text elements from article


def get_tags(article):
    """
    Extract tag labels from within article json.
    Args:
        article (dict): article object; tags are read from metadata -> tags -> about
    Returns:
        list of the 'thingLabel' values of the article's "about" tags, or an empty
        list when the article has none. Tags that carry no label are skipped, so
        every returned item can safely be treated as text by callers.
    """
    about_tags = article.get('metadata', {}).get('tags', {}).get('about', [])
    labels = (tag.get('thingLabel') for tag in about_tags)
    # a tag without 'thingLabel' would otherwise put None into the list
    return [label for label in labels if label is not None]


def get_headline(article):
    """
    Return the article's headline (promo -> headlines -> headline).
    An empty string is returned when any level of that path is missing.
    """
    promo = article.get('promo', {})
    headlines = promo.get('headlines', {})
    return headlines.get('headline', '')


def get_summary(article):
    """
    Return the article's summary (promo -> summary).
    An empty string is returned when the promo section or the summary is missing.
    """
    promo = article.get('promo', {})
    return promo.get('summary', '')


def get_body(article):
    """
    Return the article's body text.
    The 'text' of every block under content -> blocks is concatenated in order,
    with no separator; blocks without a 'text' entry contribute nothing.
    """
    blocks = article.get('content', {}).get('blocks', [])
    return ''.join(block.get('text', '') for block in blocks)


def get_uri(article):
    """
    Return the article's asset uri (metadata -> locators -> assetUri).
    An empty string is returned when any level of that path is missing.
    """
    metadata = article.get('metadata', {})
    locators = metadata.get('locators', {})
    return locators.get('assetUri', '')

In [23]:
# pull the tags, headline, summary and uri out of the randomly chosen test article
# (the body text is not extracted here)
test_tags = get_tags(test_article)
test_headline = get_headline(test_article)
test_summary = get_summary(test_article)
test_uri = get_uri(test_article)

In [12]:
def check_text_for_keywords(text, category_tags):
    """
    Function checks if text includes category tags, excluding invalid tags.
    Matching is a case-insensitive substring search over all segments joined by spaces.
    Args:
        text (list): list of text segments to be searched (these may be tags, headlines, or body text).
            A single string is also accepted and treated as one segment; segments that are
            not strings (e.g. a missing label) are ignored.
        category_tags (dict): dictionary containing a list of "should" tags, and optional "should_not" tags
    Returns:
        True if text contains at least one valid category tag, and no invalid tags.
        False otherwise, including when the "should" list is empty.
    """
    # a bare string would otherwise be iterated character by character and never match
    segments = [text] if isinstance(text, str) else text
    # skip non-text segments instead of failing on .lower()
    all_text = ' '.join(s.lower() for s in segments if isinstance(s, str))
    should = [t.lower() for t in category_tags.get('should', [])]
    should_not = [t.lower() for t in category_tags.get('should_not', [])]

    # an excluded keyword vetoes the match regardless of any "should" hits
    if any(s in all_text for s in should_not):
        return False
    return any(s in all_text for s in should)

In [20]:
# combine the test article's tags, headline and summary into one list of text segments,
# then look for 'Entertainment' or 'Scotland' while rejecting anything mentioning 'Brexit'
all_text = test_tags + [test_headline, test_summary]
check_text_for_keywords(all_text, {'should': ['Entertainment', 'Scotland'], 'should_not': ['Brexit']})

True

In [40]:
#!pip install requests
import requests
import json


# base URLs of the two auto-tagging services queried by get_results_from_autotagger
starfruit_api = 'http://starfruit.virt.ch.bbc.co.uk'
mango_api = 'http://api.mango-en.virt.ch.bbc.co.uk'


def get_results_from_autotagger(asset_uri, api=starfruit_api, timeout=30):
    """
    Query starfruit or mango api with article URI to return auto-generated content tags.
    Args:
        asset_uri (str): article path, appended to https://www.bbc.co.uk to form the article URL
        api (str): base URL of the auto-tagger to query
        timeout (float): seconds to wait for the service before giving up
    Returns:
        the service's JSON response, parsed into Python objects.
    Raises:
        requests.exceptions.Timeout if the service does not answer within `timeout`;
        ValueError (json.JSONDecodeError) if the response body is not valid JSON.
    """
    # without a timeout a stalled service would block the labelling run indefinitely
    response = requests.get(f'{api}/topics?uri=https://www.bbc.co.uk{asset_uri}', timeout=timeout)
    body = response.content
    return json.loads(body.decode("utf-8"))


def parse_labels_from_starfruit_response(response):
    """
    Collect the 'en-gb' label of every entry in a starfruit api response.
    Entries are read from the response's 'results' list; an entry with no
    label, or no 'en-gb' variant of it, contributes an empty string.
    """
    labels = []
    for result in response.get('results', []):
        localised = result.get('label', {})
        labels.append(localised.get('en-gb', ''))
    return labels


def parse_labels_from_mango_response(response):
    """
    Extract list of auto-generated labels from mango api response.
    Args:
        response (dict): parsed mango response; labels are read from each entry of 'results'
    Returns:
        list with the 'label' of every result. A result without a label contributes an
        empty string, so every item is text and can be keyword-searched directly.
    """
    all_labels = response.get('results', [])
    # default to '' (not {}) so a missing label stays a string like the others
    return [l.get('label', '') for l in all_labels]

In [33]:
response = get_results_from_autotagger(test_uri, api=mango_api)
parse_labels_from_mango_response(response)

Arrested


['Arrested',
 'Stabbed',
 'Bessbrook',
 'murder',
 'Police',
 'Coleraine',
 'BBC News',
 'Hospital',
 'Robbery',
 'Gang',
 'Anniversary',
 'County Armagh',
 'PSNI',
 'Prosecution',
 'Crimestoppers']

In [50]:
def end_to_end_labelling(article, category_tags, use_tags=True, use_headline=True, use_summary=True, use_body=True, use_starfish=True, use_mango=True):
    """
    Orchestration of boolean labelling for a single article and category.
    args:
        article (dict): article object to be labelled
        category_tags (dict): dictionary containing a list of "should" tags, and optional "should_not" tags
        use_tags (bool): search the article's existing tags
        use_headline (bool): search the headline
        use_summary (bool): search the summary
        use_body (bool): search the body text
        use_starfish (bool): search labels generated by the Starfruit auto-tagger (network call)
        use_mango (bool): search labels generated by the Mango auto-tagger (network call)
    Returns:
        True if text returned from various optional locations (tags, headline, summary, body, auto-taggers) contains at least one valid category tag, 
        and no invalid tags.
    """
    text = []
    if use_tags:
        text.extend(get_tags(article))
    if use_headline:
        text.append(get_headline(article))
    if use_summary:
        text.append(get_summary(article))
    if use_body:
        text.append(get_body(article))
    if use_starfish or use_mango:
        uri = get_uri(article)
        # Without an asset uri the request would point at the bare www.bbc.co.uk
        # domain rather than this article, so the auto-taggers are skipped.
        if uri:
            if use_starfish:
                response = get_results_from_autotagger(uri, api=starfruit_api)
                text.extend(parse_labels_from_starfruit_response(response))
            if use_mango:
                response = get_results_from_autotagger(uri, api=mango_api)
                text.extend(parse_labels_from_mango_response(response))
    return check_text_for_keywords(text, category_tags)

In [53]:
end_to_end_labelling(test_article, category_tags['isImmigration'])

True

In [1]:
!pip install transformers
!pip install transformers[torch]


Collecting transformers
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
[K     |████████████████████████████████| 674 kB 710 kB/s eta 0:00:01     |███████████████████             | 399 kB 710 kB/s eta 0:00:01
[?25hCollecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp37-cp37m-macosx_10_10_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 397 kB/s eta 0:00:01     |████▊                           | 174 kB 397 kB/s eta 0:00:03     |██████████▏                     | 378 kB 397 kB/s eta 0:00:03
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.91-cp37-cp37m-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 3.4 MB/s eta 0:00:01     |██████████                      | 348 kB 3.4 MB/s eta 0:00:01
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 13.1 MB/s eta 0:00:01     |███████████████▏                | 419 kB 13.1 MB/s eta 0:00:01
Collec

In [2]:
!pip install transformers[tf-cpu]

Collecting onnxconverter-common; extra == "tf-cpu"
  Downloading onnxconverter_common-1.7.0-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.1 MB/s eta 0:00:01
[?25hCollecting keras2onnx; extra == "tf-cpu"
  Downloading keras2onnx-1.7.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 604 kB/s eta 0:00:01
[?25hCollecting tensorflow-cpu; extra == "tf-cpu"
  Downloading tensorflow_cpu-2.2.0-cp37-cp37m-macosx_10_11_x86_64.whl (175.3 MB)
[K     |████████████████████████████████| 175.3 MB 24 kB/s  eta 0:00:012 |                                | 30 kB 391 kB/s eta 0:07:28     |▍                               | 2.1 MB 631 kB/s eta 0:04:35     |▌                               | 2.7 MB 631 kB/s eta 0:04:34     |█▌                              | 8.2 MB 395 kB/s eta 0:07:02     |██▎                             | 12.3 MB 2.8 MB/s eta 0:00:59     |██▋                             | 14.3 MB 359 kB/s eta 0:07:29     |███▌                      

In [11]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForQuestionAnswering
# Question-answering experiment: score every token of `text` as a possible start / end
# of the answer to `question`, then read out the highest-scoring span.
# NOTE(review): 'roberta-base' looks like the generic pretrained checkpoint rather than one
# fine-tuned for question answering - confirm; the answer recorded in the next cell covers
# most of the passage instead of a short span.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForQuestionAnswering.from_pretrained('roberta-base')
question, text = "when was the search terminated?", "A search for a person missing in the sea off Hampshire has been called off. A woman was rescued and a search was under way for a person said to be in difficulty at Langstone Harbour, near Hayling Island, at about 13:25 GMT. The woman was taken to hospital while four lifeboat crews and a coastguard helicopter continued investigating. The coastguard said nothing had been found and the search was 'terminated pending further information' at about 19:00 GMT. Alan Barnett, from Hayling Island Lifeboat Station, described it as 'quite a big search operation'"
# encode question and passage together as one input
input_dict = tokenizer.encode_plus(question, text, return_tensors='tf')
start_scores, end_scores = model(input_dict)
all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
# tokens from the best start position to the best end position (inclusive);
# the joined tokens keep the tokenizer's 'Ġ' markers, as the recorded output shows
answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])

In [12]:
answer

"Ġcalled Ġoff . ĠA Ġwoman Ġwas Ġrescued Ġand Ġa Ġsearch Ġwas Ġunder Ġway Ġfor Ġa Ġperson Ġsaid Ġto Ġbe Ġin Ġdifficulty Ġat ĠLang stone ĠHarbour , Ġnear ĠHay ling ĠIsland , Ġat Ġabout Ġ13 : 25 ĠGMT . ĠThe Ġwoman Ġwas Ġtaken Ġto Ġhospital Ġwhile Ġfour Ġlife boat Ġcrews Ġand Ġa Ġcoast guard Ġhelicopter Ġcontinued Ġinvestigating . ĠThe Ġcoast guard Ġsaid Ġnothing Ġhad Ġbeen Ġfound Ġand Ġthe Ġsearch Ġwas Ġ' termin ated Ġpending Ġfurther Ġinformation ' Ġat Ġabout Ġ19 : 00 ĠGMT . ĠAlan ĠBarnett , Ġfrom ĠHay ling ĠIsland ĠLife boat ĠStation , Ġdescribed Ġit Ġas Ġ' quite Ġa"

In [6]:
from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267844284.0, style=ProgressStyle(descri…


[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]


In [7]:
all_tokens


['<s>',
 'ĠWho',
 'Ġwas',
 'ĠJim',
 'ĠH',
 'enson',
 '?',
 '</s>',
 '</s>',
 'ĠJim',
 'ĠH',
 'enson',
 'Ġwas',
 'Ġa',
 'Ġnice',
 'Ġpuppet',
 '</s>']

In [8]:
start_scores

<tf.Tensor: shape=(1, 17), dtype=float32, numpy=
array([[ 0.09933002, -0.12340708, -0.13821322,  0.01947924, -0.03826241,
         0.05032244, -0.02616964,  0.11487867,  0.13624862, -0.01753932,
        -0.04074576,  0.01615429, -0.15535429, -0.06976391, -0.02800934,
        -0.00207993,  0.11487873]], dtype=float32)>

In [9]:
end_scores

<tf.Tensor: shape=(1, 17), dtype=float32, numpy=
array([[-0.06370163,  0.1411129 ,  0.16514681, -0.07518349,  0.17826185,
        -0.03513342, -0.0654796 , -0.07701048, -0.0368855 , -0.0055783 ,
         0.12038381, -0.05984056,  0.13992089,  0.03762101, -0.01167762,
        -0.07321434, -0.07701055]], dtype=float32)>

In [13]:
import os
import sys
# add ../src/nlp (resolved against the current working directory) to the import path,
# once, so that modules living there can be imported from this notebook
module_path = os.path.abspath(os.path.join('../src/nlp'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [18]:
from text_entailment import *

In [21]:
#premise = "Oriel College's governors vote to take down the statue of the Victorian colonialist Cecil Rhodes."
#hypothesis = 'References the diamond trade'

#text_entailment.get_premise_hypothesis_entailment(premise, hypothesis, tokenizer, model)

text_entailment.get_premise_hypothesis_entailment("A search for a person missing in the sea off Hampshire has been called off. A woman was rescued and a search was under way for a person said to be in difficulty at Langstone Harbour, near Hayling Island, at about 13:25 GMT. The woman was taken to hospital while four lifeboat crews and a coastguard helicopter continued investigating. The coastguard said nothing had been found and the search was 'terminated pending further information' at about 19:00 GMT. Alan Barnett, from Hayling Island Lifeboat Station, described it as 'quite a big search operation'", "is about the sea", tokenizer, model)

Probability that the label is true: 61.45%


In [22]:
def get_premise_hypothesis_entailment_probability(premise, hypothesis, tokenizer, model):
    """
    Estimate how likely it is that `hypothesis` follows from `premise`.
    Args:
        premise (str): text the hypothesis is tested against
        hypothesis (str): statement to test
        tokenizer: object whose encode(premise, hypothesis, return_tensors='pt') yields the model input
        model: callable whose first output holds one row of three class logits
    Returns:
        float: probability of class 2 after a softmax over classes 0 and 2 only.
    """
    # NOTE(review): no truncation is requested here; a very long premise may
    # exceed the model's input limit - confirm against the tokenizer in use.
    encoded_pair = tokenizer.encode(premise, hypothesis, return_tensors='pt')
    all_logits = model(encoded_pair)[0]

    # "neutral" (column 1) is dropped; of the remaining pair, the second
    # entry is "entailment" (column 2), read as "the label is true"
    contradiction_vs_entailment = all_logits[:, [0, 2]]
    return contradiction_vs_entailment.softmax(dim=1)[:, 1].item()

In [23]:
def get_label_from_entailment(article_text, category, tokenizer, model, threshold=0.5):
    '''
    Decide whether an article discusses a category, using text entailment.
    The article text is the premise and "discusses <category>" the hypothesis.
    Args: article_text(string), category(string), threshold(float), tokenizer(BartTokenizer), model(BartForSequenceClassification)
    Returns: True when the entailment probability reaches the threshold, otherwise False
    '''
    entailment_probability = get_premise_hypothesis_entailment_probability(
        article_text, f"discusses {category}", tokenizer, model
    )
    return bool(entailment_probability >= threshold)

In [25]:
test_article_text = "A search for a person missing in the sea off Hampshire has been called off. A woman was rescued and a search was under way for a person said to be in difficulty at Langstone Harbour, near Hayling Island, at about 13:25 GMT. The woman was taken to hospital while four lifeboat crews and a coastguard helicopter continued investigating. The coastguard said nothing had been found and the search was 'terminated pending further information' at about 19:00 GMT. Alan Barnett, from Hayling Island Lifeboat Station, described it as 'quite a big search operation"

In [29]:
get_label_from_entailment(test_article_text, "penguins", tokenizer, model)

False