In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import Counter
from itertools import chain
import warnings
warnings.filterwarnings('ignore')
import nltk
import time
#nltk.download('stopwords')

### Notes on Prediction
This notebook contains all the code necessary to predict NER topics on input queries using the saved NER model.

In [11]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [12]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [13]:
# Load saved NER model
filename = 'crf_ner_model.sav'

crf = pickle.load(open(filename, 'rb'))

In [14]:
# Prepare the input query by tagging for POS
def prep_query(phrase):
    split_query = re.findall(r"[\w']+|[.,!?;]", phrase)
    
    pos_tags = pos_tag(split_query)
    
    df_query = pd.DataFrame({'Sentence #':['Sentence: 1'] * len(pos_tags),
                            'Word':[pair[0] for pair in pos_tags],
                            'POS':[pair[1] for pair in pos_tags],
                            'Tag':[None] * len(pos_tags)})
       
    return df_query

In [15]:
# Try predicting on some sample queries
s = "Donald Trump is a former host on The Apprentice. He is an American businessman and former President."
s = 'hello how are you'
s = 'The Second World War started in 1914 and ended in 1918'
s = 'The Korean War started in 1939 and ended in 1945'
s = 'Iraq and Iran were once at war. Saddam Hussein was involved'
s = 'The World Cup is a quadrennial sporting event. FIFA is the governing body involved.'
s = 'Biden under pressure over Afghanistan and Covid as approval ratings slide'
s = 'But the Taliban warned on Monday there would be “consequences” if the US and its allies linger beyond that date.'
s = 'Thousands of American troops have poured back into the country to oversee the chaotic airlift of foreigners and \
selected Afghans from Kabul airport, and Biden is being called upon to extend a 31 August deadline for full US withdrawal'
s = 'As it leaves Afghanistan in chaos, America’s decline mirrors Britain’s a century ago. It may also invite wider \
conflict, warns a historian'
s = 'In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory \
accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant \
evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.'

x = prep_query(s)

In [24]:
def concat_ents(ents_list):
    all_named_ents = {}
    counter = 0
    counts = {}
      
    full_named_ent = ents_list[0][1]

    for i in range(1,len(ents_list) - 1):
        if ents_list[i][0][0] == 'I':
            full_named_ent = full_named_ent + ' ' + ents_list[i][1]
            if ents_list[i + 1][0][0] == 'B':
                if full_named_ent in all_named_ents.keys():
                    all_named_ents[full_named_ent] += 1
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent] = 1
                    full_named_ent = ''
            else:
                continue
                full_named_ent = full_named_ent + ' ' + ents_list[i + 1][1]
                           
        elif ents_list[i][0][0] == 'B':
            full_named_ent = ents_list[i][1]
            if ents_list[i + 1][0][0] == 'B':
                if full_named_ent in all_named_ents.keys():
                    all_named_ents[full_named_ent] += 1
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent] = 1
                    full_named_ent = '' 

    return all_named_ents

In [84]:
''' The output from the NER tagging is a collection of individual words with a "B" or "I" tag, depending on whether the 
word is at the beginning or interior of the named entity. This function puts each named entity together from its constituent 
words
'''
def concat_ents(ents_list):
    all_named_ents = {}
    counter = 0
    counts = {}
       
    full_named_ent = ents_list[0][1]

    for i in range(1,len(ents_list) - 1):
        if ents_list[i][0][0] == 'I':
            full_named_ent = full_named_ent + ' ' + ents_list[i][1] # State name
            if ents_list[i + 1][0][0] == 'B':              
                if full_named_ent + ents_list[i][0][1:] in all_named_ents.keys():
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] += 1            # Add
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] = 1             # Add
                    full_named_ent = ''
            else:
                continue
                full_named_ent = full_named_ent + ' ' + ents_list[i + 1][1]
                           
        elif ents_list[i][0][0] == 'B':
            full_named_ent = ents_list[i][1]
            if ents_list[i + 1][0][0] == 'B':
                if full_named_ent + ents_list[i][0][1:] in all_named_ents.keys():
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] += 1           # Add
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] = 1            # Add
                    full_named_ent = '' 
                              
    return all_named_ents

In [46]:
'''For each dictionary of NEs, add up the number of time each key (i.e. each word) appears'''

def get_totals(dicts):
    totals = {}
    for _dict in dicts:
        for key in _dict.keys():
            if key not in totals.keys():
                totals[key] = _dict[key]
            else:
                totals[key] += _dict[key]
    return totals            

In [20]:
''' Extract the named entities from a given string. 
Return a list of NEs of the form: [('B-tim', 'March'),
                                   ('B-per', 'Dr'),
                                   ('I-per', 'Ben'),
                                   ('I-per', 'Embarek'),
                                   ('B-gpe', 'Chinese')]'''
def get_nes(string):
    x = prep_query(string)
    getter_query = SentenceGetter(x)
    sentences_query = getter_query.sentences

    X_query = [sent2features(s) for s in sentences_query]
    X_words = [s[0] for s in sentences_query[0]]
    
    # Obtain NEs from the prediction model
    pred = crf.predict(X_query)

    ents = list(zip(pred[0],X_words))
    # Leave behind anything not designated as a named entity
    named_ents = [pair for pair in ents if pair[0] != 'O']
    return named_ents

In [18]:
df_ht = pd.read_csv('GLG/models/14passes_265_topics_df_ht_lda.csv')
df_ht.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,date,year,month,day,author,title,article,url,section,publication,article_words,lda_topic,other_topics
0,0,7,7,2018-05-02 17:09:00,2018,5.0,2,Caroline Williams,You Can Trick Your Brain Into Being More Focused,If only every day could be like this. You can’...,https://www.vice.com/en_us/article/9kgp4v/how-...,Health,Vice,"['every', 'day', 'could', 'like', 'put', 'fing...",257,"[61, 166, 227]"


In [91]:
df_ht['lda_topic'].nunique()

247

In [93]:
''' Here we create the dictionary of named entities from the LDA output. Ultimately each key will
be a topic number, and the value for that key will be a dict containing the most commonly 
occurring named entities in the training data for that topic. 
The goal that when an input query is found to belong to a particular LDA topic, those top named 
entities will be reported along with the topic name. '''

#8:21
start = time.time()
ner_dict = {}

for i in range(265):
    topic = i
    mini_df = df_ht[df_ht['lda_topic'] == topic]
    mini_df['named_ents'] = [get_nes(article) for article in mini_df['article']]
    #mini_df['counted_ents'] = [concat_ents(ents) for ents in mini_df['named_ents']]
    mini_df['counted_ents'] = [concat_ents(mini_df.loc[i,'named_ents']) if len(mini_df.loc[i,'named_ents']) > 0 else {} for i in mini_df.index]
    
    topic_ner_dict = get_totals([ents for ents in mini_df['counted_ents']])
    topic_ner_dict = {k: v for k, v in sorted(topic_ner_dict.items(), key=lambda item: item[1],reverse=True)}
    ner_dict[i] = topic_ner_dict
    
interval = round((time.time() - start)/60,1)
print(f'That took {interval} mins.')

That took 148.9 mins.


In [153]:
ner_dict_cleaned = ner_dict.copy()

In [8]:
''' Clean up the output: Filter out any NEs of length <= 2 charcters, with the exception of some
allowed NEs listed below. Remove some that are an artifact of the news reporting service.'''

allowed = ['US','BA','UK','GB','AP','BP','LG','EA','DA','IQ','HR','AA','TB',
           'VR','VC','UX','UV','XX','XY','QE','TV','AG','VW','UN','EU','AK','AS','AZ','AR','CA','CO','CT',
           'DE','DC','FL','GA','HI','ID','IL','IN','IA','KS','KY',
           'LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH',
           'NJ','NM','NY','NC','ND','MP','OH','OK','OR','PA','PR','RI',
           'SC','SD','TN','TX','UT','VT','VA','VI','WA','WV','WI','WY']

not_allowed = ['Reuters','VICE','Vice','The Wall Street Journal','WSJ',
              'Reuters Saudi Arabia','NYT','New York Times']

ent_types = ['gpe','org','per','geo']

ner_dict_stripped =  {}

for key in ner_dict_cleaned.keys():
    key_dict = {}
    
    # If length of NE < 7 (allowing for the '-geo' suffix) then remove
    # Unless NE belongs to the allowed list
    keys_to_delete = []
    for k in ner_dict_cleaned[key].keys():  
        if ((len(k) < 7) & (k[:2] not in allowed)) | (k[0].islower()):
            keys_to_delete.append(k)
    # Remove these keys from the dict    
    for k in keys_to_delete:
        del ner_dict_cleaned[key][k]

    # Strip NEs down to the top 5 only, in each category
    # and put in final stripped dict
    for ent_type in ent_types:
        mini_dict = {key:val for (key,val) in ner_dict_cleaned[key].items() if (key[-3:] == ent_type) & (key[:-4] not in not_allowed)}
        top_5_ent_names = list(mini_dict)[:5]
        top_5_ents = {key:val for (key,val) in mini_dict.items() if key in top_5_ent_names}
        key_dict.update(top_5_ents)
    ner_dict_stripped[key] = key_dict

In [15]:
#Example output from the NER determination
for item in ner_dict_stripped[1]:
    print(item)

American-gpe
Chinese-gpe
African-gpe
German-gpe
Americans-gpe
Google-org
University-org
US-org
CNN-org
University of California-org
Sendler-per
Johnson-per
Trump-per
Redfield-per
Harvard-per
United States-geo
China-geo
New York-geo
Washington-geo
California-geo


In [14]:
ner_dict_stripped

{0: {},
 1: {'American-gpe': 195,
  'Chinese-gpe': 168,
  'African-gpe': 71,
  'German-gpe': 50,
  'Americans-gpe': 45,
  'Google-org': 300,
  'University-org': 155,
  'US-org': 151,
  'CNN-org': 137,
  'University of California-org': 123,
  'Sendler-per': 88,
  'Johnson-per': 73,
  'Trump-per': 59,
  'Redfield-per': 42,
  'Harvard-per': 39,
  'United States-geo': 232,
  'China-geo': 177,
  'New York-geo': 116,
  'Washington-geo': 102,
  'California-geo': 89},
 2: {},
 3: {'African-gpe': 144,
  'British-gpe': 108,
  'French-gpe': 86,
  'German-gpe': 84,
  'Chinese-gpe': 75,
  'Nov-org': 79,
  'Reuters A-org': 62,
  'Reuters South Africa-org': 54,
  'Google-org': 50,
  'Eskom-org': 49,
  'Noor Zainab Hussain-per': 69,
  'Tanisha Heiberg-per': 29,
  'Nikhil Kurian Nainan-per': 23,
  'Alexander Cornwell-per': 23,
  'Yadarisa Shabong-per': 22,
  'Bengaluru-geo': 593,
  'China-geo': 163,
  'South Africa-geo': 113,
  'Jan-geo': 97,
  'United States-geo': 96},
 4: {'Swiss-gpe': 403,
  'Americ

In [9]:
filename = ('GLG/models/14passes_265_ents_dict(stripped_final).pkl')
pickle.dump(ner_dict_stripped,open(filename,'wb'))