In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import Counter
from itertools import chain
import warnings
warnings.filterwarnings('ignore')
import nltk
import time
#nltk.download('stopwords')

### Notes on Prediction
This notebook contains all the code necessary to predict NER topics on input queries using the saved NER model.

In [2]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [6]:
# Load saved NER model
filename = 'models/crf_ner_model.sav'

crf = pickle.load(open(filename, 'rb'))

In [7]:
# Prepare the input query by tagging for POS
def prep_query(phrase):
    split_query = re.findall(r"[\w']+|[.,!?;]", phrase)
    
    pos_tags = pos_tag(split_query)
    
    df_query = pd.DataFrame({'Sentence #':['Sentence: 1'] * len(pos_tags),
                            'Word':[pair[0] for pair in pos_tags],
                            'POS':[pair[1] for pair in pos_tags],
                            'Tag':[None] * len(pos_tags)})
       
    return df_query

In [8]:
# Try predicting on some sample queries
s = "Donald Trump is a former host on The Apprentice. He is an American businessman and former President."
s = 'hello how are you'
s = 'The Second World War started in 1914 and ended in 1918'
s = 'The Korean War started in 1939 and ended in 1945'
s = 'Iraq and Iran were once at war. Saddam Hussein was involved'
s = 'The World Cup is a quadrennial sporting event. FIFA is the governing body involved.'
s = 'Biden under pressure over Afghanistan and Covid as approval ratings slide'
s = 'But the Taliban warned on Monday there would be “consequences” if the US and its allies linger beyond that date.'
s = 'Thousands of American troops have poured back into the country to oversee the chaotic airlift of foreigners and \
selected Afghans from Kabul airport, and Biden is being called upon to extend a 31 August deadline for full US withdrawal'
s = 'As it leaves Afghanistan in chaos, America’s decline mirrors Britain’s a century ago. It may also invite wider \
conflict, warns a historian'
s = 'In March the joint study reported that it was “extremely unlikely” that the virus had been released in a laboratory \
accident. Dr Ben Embarek revealed that this conclusion did not come from a balanced assessment of all the relevant \
evidence but from a steadfast refusal by the Chinese members of the joint study to support anything stronger.'

x = prep_query(s)

In [9]:
'''This is the original, simpler version of the function.'''
def concat_ents(ents_list):
    all_named_ents = {}
    counter = 0
    counts = {}
      
    full_named_ent = ents_list[0][1]

    for i in range(1,len(ents_list) - 1):
        if ents_list[i][0][0] == 'I':
            full_named_ent = full_named_ent + ' ' + ents_list[i][1]
            if ents_list[i + 1][0][0] == 'B':
                if full_named_ent in all_named_ents.keys():
                    all_named_ents[full_named_ent] += 1
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent] = 1
                    full_named_ent = ''
            else:
                continue
                full_named_ent = full_named_ent + ' ' + ents_list[i + 1][1]
                           
        elif ents_list[i][0][0] == 'B':
            full_named_ent = ents_list[i][1]
            if ents_list[i + 1][0][0] == 'B':
                if full_named_ent in all_named_ents.keys():
                    all_named_ents[full_named_ent] += 1
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent] = 1
                    full_named_ent = '' 

    return all_named_ents

In [10]:
''' The output from the NER tagging is a collection of individual words with a "B" or "I" tag, depending on whether the 
word is at the beginning or interior of the named entity. This function puts each named entity together from its constituent 
words
'''
def concat_ents(ents_list):
    all_named_ents = {}
    counter = 0
    counts = {}
       
    full_named_ent = ents_list[0][1]

    for i in range(1,len(ents_list) - 1):
        if ents_list[i][0][0] == 'I':
            full_named_ent = full_named_ent + ' ' + ents_list[i][1] # State name
            if ents_list[i + 1][0][0] == 'B':              
                if full_named_ent + ents_list[i][0][1:] in all_named_ents.keys():
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] += 1            # Add
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] = 1             # Add
                    full_named_ent = ''
            else:
                continue
                full_named_ent = full_named_ent + ' ' + ents_list[i + 1][1]
                           
        elif ents_list[i][0][0] == 'B':
            full_named_ent = ents_list[i][1]
            if ents_list[i + 1][0][0] == 'B':
                if full_named_ent + ents_list[i][0][1:] in all_named_ents.keys():
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] += 1           # Add
                    full_named_ent = ''
                else:
                    all_named_ents[full_named_ent+ ents_list[i][0][1:]] = 1            # Add
                    full_named_ent = '' 
                              
    return all_named_ents

In [11]:
'''For each dictionary of NEs, add up the number of time each key (i.e. each word) appears'''

def get_totals(dicts):
    totals = {}
    for _dict in dicts:
        for key in _dict.keys():
            if key not in totals.keys():
                totals[key] = _dict[key]
            else:
                totals[key] += _dict[key]
    return totals            

In [12]:
''' Extract the named entities from a given string. 
Return a list of NEs of the form: [('B-tim', 'March'),
                                   ('B-per', 'Dr'),
                                   ('I-per', 'Ben'),
                                   ('I-per', 'Embarek'),
                                   ('B-gpe', 'Chinese')]'''
def get_nes(string):
    x = prep_query(string)
    getter_query = SentenceGetter(x)
    sentences_query = getter_query.sentences

    X_query = [sent2features(s) for s in sentences_query]
    X_words = [s[0] for s in sentences_query[0]]
    
    # Obtain NEs from the prediction model
    pred = crf.predict(X_query)

    ents = list(zip(pred[0],X_words))
    # Leave behind anything not designated as a named entity
    named_ents = [pair for pair in ents if pair[0] != 'O']
    return named_ents

In [14]:
df_ht = pd.read_csv('models/14passes_265_topics_df_ht_lda.csv')
df_ht.head(1)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,date,year,month,day,author,title,article,url,section,publication,article_words,lda_topic,other_topics
0,0,7,7,2018-05-02 17:09:00,2018,5.0,2,Caroline Williams,You Can Trick Your Brain Into Being More Focused,If only every day could be like this. You can’...,https://www.vice.com/en_us/article/9kgp4v/how-...,Health,Vice,"['every', 'day', 'could', 'like', 'put', 'fing...",257,"[61, 166, 227]"


In [15]:
df_ht['lda_topic'].nunique()

247

In [93]:
''' Here we create the dictionary of named entities from the LDA output. Ultimately each key will
be a topic number, and the value for that key will be a dict containing the most commonly 
occurring named entities in the training data for that topic. 
The goal that when an input query is found to belong to a particular LDA topic, those top named 
entities will be reported along with the topic name. '''

#8:21
start = time.time()
ner_dict = {}

for i in range(265):
    topic = i
    mini_df = df_ht[df_ht['lda_topic'] == topic]
    mini_df['named_ents'] = [get_nes(article) for article in mini_df['article']]
    #mini_df['counted_ents'] = [concat_ents(ents) for ents in mini_df['named_ents']]
    mini_df['counted_ents'] = [concat_ents(mini_df.loc[i,'named_ents']) if len(mini_df.loc[i,'named_ents']) > 0 else {} for i in mini_df.index]
    
    topic_ner_dict = get_totals([ents for ents in mini_df['counted_ents']])
    topic_ner_dict = {k: v for k, v in sorted(topic_ner_dict.items(), key=lambda item: item[1],reverse=True)}
    ner_dict[i] = topic_ner_dict
    
interval = round((time.time() - start)/60,1)
print(f'That took {interval} mins.')

That took 148.9 mins.


In [39]:
filename = 'models/14passes_265_ents_dict(stripped).pkl'
ner_dict_cleaned = pickle.load(open(filename,'rb'))

In [40]:
ner_dict_cleaned

{0: {'MEXICAN BILLIONAIRE SLIM SAYS HAS SPOKEN WITH GOVERNMENTS OF GUATEMALA AND EL SALVADOR ABOUT BOOSTING TELECOMS INVESTMENTS THERE-art': 1},
 1: {'Google-org': 300,
  'United States-geo': 232,
  'American-gpe': 195,
  'China-geo': 177,
  'Chinese-gpe': 168,
  'University-org': 155,
  'US-org': 151,
  'CNN-org': 137,
  'University of California-org': 123,
  'New York-geo': 116,
  'Microsoft-org': 106,
  'Apple-org': 103,
  'Washington-geo': 102,
  'Facebook-art': 100,
  'NASA-org': 98,
  'Nobel Prize-org': 93,
  'California-geo': 89,
  'Sendler-per': 88,
  'Twitter-art': 83,
  '2015-tim': 82,
  'Chicago-geo': 82,
  'Tuesday-tim': 82,
  'Berkeley-geo': 82,
  'Epstein-geo': 81,
  'Stanford-org': 79,
  '2013-tim': 76,
  'MIT-org': 73,
  'Johnson-per': 73,
  '2014-tim': 72,
  'Thursday-tim': 71,
  'African-gpe': 71,
  'Motherboard-org': 70,
  'Silicon Valley-geo': 67,
  'Wednesday-tim': 67,
  'Earth-geo': 66,
  '2018-tim': 65,
  'STEM-org': 65,
  'Harvard-org': 64,
  'San Francisco-geo'

In [153]:
ner_dict_cleaned = ner_dict.copy()

In [54]:
# Final Answer
''' Clean up the output: Filter out any NEs of length <= 2 charcters, with the exception of some
allowed NEs listed below. Remove some that are an artifact of the news reporting service.'''

allowed = ['US','BA','UK','GB','AP','BP','LG','EA','DA','IQ','HR','AA','TB',
           'VR','VC','UX','UV','XX','XY','QE','TV','AG','VW','UN','EU','AK','AS','AZ','AR','CA','CO','CT',
           'DE','DC','FL','GA','HI','ID','IL','IN','IA','KS','KY',
           'LA','ME','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH',
           'NJ','NM','NY','NC','ND','MP','OH','OK','OR','PA','PR','RI',
           'SC','SD','TN','TX','UT','VT','VA','VI','WA','WV','WI','WY']

not_allowed = ['Reuters','VICE','Vice','The Wall Street Journal','WSJ',
              'Reuters Saudi Arabia','NYT','New York Times']

ent_types = ['gpe','org','per','geo']

ner_dict_stripped =  {}

for key in ner_dict_cleaned.keys():
    key_dict = {}
    
    # If length of NE < 7 (allowing for the '-geo' suffix) then remove
    # Unless NE belongs to the allowed list
    keys_to_delete = []
    for k in ner_dict_cleaned[key].keys():  
        if ((len(k) < 7) & (k[:2] not in allowed)) | (k[0].islower()):
            keys_to_delete.append(k)
    # Remove these keys from the dict    
    for k in keys_to_delete:
        del ner_dict_cleaned[key][k]

    # Strip NEs down to the top 5 only, in each category
    # and put in final stripped dict
    for ent_type in ent_types:
        mini_dict = {key:val for (key,val) in ner_dict_cleaned[key].items() if (key[-3:] == ent_type) & (key[:-4] not in not_allowed)}
        top_20_ent_names = list(mini_dict)[:20]
        print('Hey1!')
        print(top_20_ent_names)
        
############################ 
## For the top 'person' entities only, reach back through the top 30 NEs
## If there are any 'duplicates', e.g. 'Bezos' and 'Jeff Bezos'
## keep the highest-occuring instance of that name, and discard the 
## any lower-ranking ones. This avoids repetition in the displayed 
## list of NEs in the final application.        
        # This only applies to 'person' entities
        if ent_type == 'per':  
            inds_to_drop = []
            for i in range(len(top_20_ent_names)):
                for j in range(i+1,len(top_20_ent_names)):
                    if (top_20_ent_names[i][:-4].lower() in top_20_ent_names[j][:-4].lower()) | (top_20_ent_names[j][:-4].lower() in top_20_ent_names[i][:-4].lower()):
                        inds_to_drop.append(j)
            top_20_ent_names = [top_20_ent_names[i] for i in range(len(top_20_ent_names)) if i not in inds_to_drop]
            print('Hey2!')
            print(top_20_ent_names)
            top_5_ent_names = top_20_ent_names[:5]   
        else:
            top_5_ent_names = list(mini_dict)[:5]
       
        top_5_ents = {key:val for (key,val) in mini_dict.items() if key in top_5_ent_names}
        key_dict.update(top_5_ents)
    ner_dict_stripped[key] = key_dict

Hey1!
[]
Hey1!
[]
Hey1!
[]
Hey2!
[]
Hey1!
[]
Hey1!
['American-gpe', 'Chinese-gpe', 'African-gpe', 'German-gpe', 'Americans-gpe', 'British-gpe', 'Indian-gpe', 'French-gpe', 'Canadian-gpe', 'English-gpe', 'Russian-gpe', 'Spanish-gpe', 'Italian-gpe', 'Japanese-gpe', 'Israeli-gpe', 'Dutch-gpe', 'African American-gpe', 'Finnish-gpe', 'Norwegian-gpe', 'Swedish-gpe']
Hey1!
['Google-org', 'University-org', 'US-org', 'CNN-org', 'University of California-org', 'Microsoft-org', 'Apple-org', 'NASA-org', 'Nobel Prize-org', 'Stanford-org', 'MIT-org', 'Motherboard-org', 'STEM-org', 'Harvard-org', 'American-org', 'White House-org', 'CERN-org', 'Times-org', 'Congress-org', 'The New York Times-org']
Hey1!
['Sendler-per', 'Johnson-per', 'Trump-per', 'Redfield-per', 'Harvard-per', 'Watson-per', 'President Trump-per', 'Miller-per', 'Fitzgerald-per', 'Epstein-per', 'Anderson-per', 'President-per', 'Delzer-per', 'John-per', 'Cornell-per', 'Croce-per', 'Dyson-per', 'Tesler-per', 'Carnegie Mellon-per', 'Hahn-p

Hey1!
['Russian-gpe', 'Chinese-gpe', 'Korean-gpe', 'Iranian-gpe', 'Japanese-gpe', 'Syrian-gpe', 'American-gpe', 'French-gpe', 'Turkish-gpe', 'German-gpe', 'Canadian-gpe', 'Mexican-gpe', 'British-gpe', 'Venezuelan-gpe', 'Ukrainian-gpe', 'Americans-gpe', 'Indian-gpe', 'Arab-gpe', 'Israeli-gpe', 'Russians-gpe']
Hey1!
['White House-org', 'NATO-org', 'Reuters U-org', 'Putin-org', 'Kremlin-org', 'State Department-org', 'EU-org', 'State-org', 'Treasury-org', 'United Nations-org', 'Security Council-org', 'Qatar-org', 'Reuters Russia-org', 'European Union-org', 'Sept-org', 'Congress-org', 'Turkey-org', 'RIA-org', 'G20-org', 'Pentagon-org']
Hey1!
['Trump-per', 'President Donald Trump-per', 'President Vladimir Putin-per', 'Kim-per', 'Mike Pompeo-per', 'Prime-per', 'Kim Jong Un-per', 'Pompeo-per', 'Obama-per', 'Secretary Steven Mnuchin-per', 'President Xi Jinping-per', 'Macron-per', 'Vladimir Soldatkin-per', 'Abe-per', 'Mnuchin-per', 'Steve Holland-per', 'Saudi Arabia-per', 'Rouhani-per', 'Preside

Hey1!
['Tecentriq-per', 'Novartis-per', 'Bristol Myers-per', 'Bristol Myers Squibb-per', 'Pfizer-per', 'Bill Berkrot-per', 'Manas Mishra-per', 'Johnson Johnson-per', 'Ben Hirschler-per', 'Tamara Mathias-per', 'Brawley-per', 'Celgene-per', 'Trump-per', 'Bayer-per', 'Allison-per', 'John Miller-per', 'Natalie Grover-per', 'Tagrisso-per', 'Biogen-per', 'Biden-per']
Hey2!
['Tecentriq-per', 'Novartis-per', 'Bristol Myers-per', 'Pfizer-per', 'Bill Berkrot-per', 'Manas Mishra-per', 'Johnson Johnson-per', 'Ben Hirschler-per', 'Tamara Mathias-per', 'Brawley-per', 'Celgene-per', 'Trump-per', 'Bayer-per', 'Allison-per', 'John Miller-per', 'Natalie Grover-per', 'Tagrisso-per', 'Biogen-per', 'Biden-per']
Hey1!
['United States-geo', 'Keytruda-geo', 'Bengaluru-geo', 'Opdivo-geo', 'Alzheimer-geo', 'New York-geo', 'Tecentriq-geo', 'China-geo', 'Europe-geo', 'Novartis-geo', 'Chicago-geo', 'Yescarta-geo', 'Imfinzi-geo', 'Lynparza-geo', 'Roche-geo', 'Kymriah-geo', 'London-geo', 'Japan-geo', 'Medicare-geo',

Hey1!
['Chinese-gpe', 'American-gpe', 'Mexican-gpe', 'Japanese-gpe', 'Canadian-gpe', 'French-gpe', 'German-gpe', 'Brazilian-gpe', 'Indian-gpe', 'Americans-gpe', 'African-gpe', 'Turkish-gpe', 'Italian-gpe', 'Soybeans-gpe', 'British-gpe', 'Korean-gpe', 'Russian-gpe', 'Australian-gpe', 'Spanish-gpe', 'Scottish-gpe']
Hey1!
['EU-org', 'White House-org', 'WTO-org', 'Reuters U-org', 'Apple-org', 'European Union-org', 'Treasury-org', 'Airbus-org', 'USTR-org', 'Commerce Department-org', 'Congress-org', 'Boeing-org', 'G20-org', 'CNBC-org', 'WASHINGTON-org', 'Reuters China-org', 'American-org', 'TPP-org', 'S P-org', 'Nov-org']
Hey1!
['Trump-per', 'President Donald Trump-per', 'Robert Lighthizer-per', 'David Lawder-per', 'President Xi Jinping-per', 'Secretary Steven Mnuchin-per', 'Vice Premier Liu-per', 'Lighthizer-per', 'Liu-per', 'Susan Heavey-per', 'Jeff Mason-per', 'Makini Brice-per', 'Mnuchin-per', "President Donald Trump's-per", 'Ben Blanchard-per', 'Larry Kudlow-per', 'President Trump-per',

Hey1!
['Brent-per', 'Saudi Arabia-per', 'President Donald Trump-per', 'Trump-per', 'Aaron Sheldrick-per', 'Falih-per', 'John Kilduff-per', 'Goldman Sachs-per', 'Alexander Novak-per', 'Baker Hughes-per', 'Phil Flynn-per', 'Riyadh-per', 'Marguerita Choy-per', 'Jim Ritterbusch-per', 'John Kemp-per', 'Saudi Aramco-per', 'Stephen Innes-per', 'Novak-per', 'Florence Tan-per', 'Tamas Varga-per']
Hey2!
['Brent-per', 'Saudi Arabia-per', 'President Donald Trump-per', 'Aaron Sheldrick-per', 'Falih-per', 'John Kilduff-per', 'Goldman Sachs-per', 'Alexander Novak-per', 'Baker Hughes-per', 'Phil Flynn-per', 'Riyadh-per', 'Marguerita Choy-per', 'Jim Ritterbusch-per', 'John Kemp-per', 'Saudi Aramco-per', 'Stephen Innes-per', 'Florence Tan-per', 'Tamas Varga-per']
Hey1!
['China-geo', 'Iran-geo', 'Russia-geo', 'United States-geo', 'Saudi Arabia-geo', 'Venezuela-geo', 'West Texas Intermediate WTI-geo', 'Middle East-geo', 'Iraq-geo', 'West Texas Intermediate-geo', 'Washington-geo', 'Jan-geo', 'Libya-geo', '

Hey1!
['American-gpe', 'Chinese-gpe', 'Canadian-gpe', 'Americans-gpe', 'Mexican-gpe', 'Indian-gpe', 'Australian-gpe', 'Japanese-gpe', 'British-gpe', 'Russian-gpe', 'French-gpe', 'Canadians-gpe', 'Swiss-gpe', 'German-gpe', 'African-gpe', 'Indonesian-gpe', 'Brazilian-gpe', 'Indians-gpe', 'Europeans-gpe', 'Appalachian-gpe']
Hey1!
['EPA-org', 'Congress-org', 'White House-org', 'FCC-org', 'US-org', 'Google-org', 'FDA-org', 'Environmental Protection Agency-org', 'Apple-org', 'Senate-org', 'SEC-org', 'EU-org', 'Microsoft-org', 'NASA-org', 'FAA-org', 'CNBC-org', 'Commission-org', 'AT T-org', 'Republican-org', 'House-org']
Hey1!
['Trump-per', 'Obama-per', 'President Donald Trump-per', 'President Trump-per', 'Pruitt-per', 'President-per', 'President Barack Obama-per', 'Lopez Obrador-per', 'Wheeler-per', 'Scott Pruitt-per', 'Andrew Wheeler-per', 'Donald Trump-per', 'David Shepardson-per', 'President Obama-per', 'Bush-per', "President Donald Trump's-per", 'Zinke-per', 'Pemex-per', 'President Andre

Hey1!
['Earth-geo', 'United States-geo', 'San Francisco-geo', 'New York-geo', 'California-geo', 'China-geo', 'Amazon-geo', 'Puerto Rico-geo', 'Mexico-geo', 'London-geo', 'Los Angeles-geo', 'Manhattan-geo', 'America-geo', 'Japan-geo', 'Hong Kong-geo', 'Khosla-geo', 'New York City-geo', 'Silicon Valley-geo', 'Texas-geo', 'Europe-geo']
Hey1!
['Chinese-gpe', 'Swiss-gpe', 'British-gpe', 'German-gpe', 'French-gpe', 'Canadian-gpe', 'Swedish-gpe', 'American-gpe', 'Australian-gpe', 'Indian-gpe', 'Danish-gpe', 'African-gpe', 'Japanese-gpe', 'Norwegian-gpe', 'Brazilian-gpe', 'Russian-gpe', 'Korean-gpe', 'Italian-gpe', 'Dutch-gpe', 'Spanish-gpe']
Hey1!
['Apple-org', 'IBES-org', 'EBITDA-org', 'Google-org', 'UK-org', 'Microsoft-org', 'CNBC-org', 'B E S-org', 'European Union-org', 'Eikon Further-org', 'Intel-org', 'Nov-org', 'Alphabet-org', 'EBIT-org', 'S P-org', 'Lyft-org', 'Thomson Reuters-org', 'Facebook-org', 'Disney-org', 'Sony-org']
Hey1!
['Thomson Reuters-per', 'Cook-per', 'Street-per', 'Amazo

Hey1!
['Canadian-gpe', 'Chinese-gpe', 'German-gpe', 'British-gpe', 'Japanese-gpe', 'Italian-gpe', 'Australian-gpe', 'Swiss-gpe', 'American-gpe', 'French-gpe', 'Polish-gpe', 'Mexican-gpe', 'Czech-gpe', 'Russian-gpe', 'Hungarian-gpe', 'Turkish-gpe', 'Spanish-gpe', 'Brazilian-gpe', 'Romanian-gpe', 'Serbian-gpe']
Hey1!
['Fed-org', 'Federal Reserve-org', 'ECB-org', 'Treasury-org', 'European Union-org', 'S P-org', 'EU-org', 'European Central Bank-org', 'UK-org', 'NEW YORK-org', 'Bank of Canada-org', 'Bank of England-org', 'Reuters U-org', 'US-org', 'TORONTO-org', 'Reuters Gold-org', 'White House-org', 'Nov-org', 'ING-org', 'Commerzbank-org']
Hey1!
['President Donald Trump-per', 'Trump-per', 'Powell-per', 'Jerome Powell-per', 'Prime Minister Theresa May-per', 'Johnson-per', 'Street-per', 'Graphic Trade-per', 'Brent-per', 'Prime Minister Boris Johnson-per', 'Draghi-per', 'Dow Jones Industrial Average-per', 'Wall Street-per', 'Sujata Rao-per', 'Goldman Sachs-per', "President Donald Trump's-per"

Hey1!
['Facebook-org', 'Google-org', 'YouTube-org', 'Apple-org', 'Motherboard-org', 'Twitter-org', 'CNN-org', 'US-org', 'CNBC-org', 'Microsoft-org', 'EU-org', 'ISIS-org', 'Verge-org', 'Jones-org', 'CNN Business-org', 'Congress-org', 'White House-org', 'The New York Times-org', 'UK-org', 'DMCA-org']
Hey1!
['Facebook-per', 'Trump-per', 'Twitter-per', 'Motherboard-per', 'Jones-per', 'Alex Jones-per', 'Donald Trump-per', 'Nazis-per', 'Mark Zuckerberg-per', 'President Trump-per', 'Amazon-per', 'Dorsey-per', 'President Donald Trump-per', 'Hillary Clinton-per', 'Wojcicki-per', 'Paul-per', 'Jack Dorsey-per', 'Zuckerberg-per', 'Daily Stormer-per', 'Jigsaw-per']
Hey2!
['Facebook-per', 'Trump-per', 'Twitter-per', 'Motherboard-per', 'Jones-per', 'Nazis-per', 'Mark Zuckerberg-per', 'Amazon-per', 'Dorsey-per', 'Hillary Clinton-per', 'Wojcicki-per', 'Paul-per', 'Daily Stormer-per', 'Jigsaw-per']
Hey1!
['China-geo', 'Reddit-geo', 'United States-geo', 'Instagram-geo', 'India-geo', 'Infowars-geo', 'Wiki

Hey1!
['American-gpe', 'Canadian-gpe', 'Chinese-gpe', 'Indian-gpe', 'French-gpe', 'Americans-gpe', 'British-gpe', 'Spanish-gpe', 'Los Angeles-gpe', 'Italian-gpe', 'Mexican-gpe', 'Japanese-gpe', 'Egyptian-gpe', 'Danish-gpe', 'German-gpe', 'Evans-gpe', 'Russian-gpe', 'Azerbaijan-gpe', 'Indonesian-gpe', 'Burmese-gpe']
Hey1!
['Google-org', 'Amazon-org', 'MTA-org', 'HQ2-org', 'Apple-org', 'Boring Company-org', 'US-org', 'CNBC-org', 'City Council-org', 'Google Maps-org', 'LA-org', 'Facebook-org', 'Meredith-org', 'Times-org', 'New York Governor-org', 'Times Square-org', 'NEW YORK-org', 'Reuters A-org', 'Motherboard-org', 'Queens-org']
Hey1!
['Amazon-per', 'Cuomo-per', 'Andrew Cuomo-per', 'Mayor Bill de Blasio-per', 'Blasio-per', 'Amtrak-per', 'Airbnb-per', 'Jeff Bezos-per', 'De Blasio-per', 'Sidewalk Labs-per', 'Elon Musk-per', 'Trump-per', 'Musk-per', 'President Donald Trump-per', 'Gianaris-per', 'Silicon Valley-per', 'Brooklyn-per', 'Wi Fi-per', 'Berman-per', 'Sandy-per']
Hey2!
['Amazon-per

Hey1!
['Chinese-gpe', 'American-gpe', 'Australian-gpe', 'German-gpe', 'British-gpe', 'Indian-gpe', 'Japanese-gpe', 'French-gpe', 'Egyptian-gpe', 'Mexican-gpe', 'Italian-gpe', 'Turkish-gpe', 'Swiss-gpe', 'Brazilian-gpe', 'African-gpe', 'Iranian-gpe', 'Russian-gpe', 'Korean-gpe', 'Canadian-gpe', 'Colombian-gpe']
Hey1!
['S P-org', 'Federal Reserve-org', 'Nasdaq Composite-org', 'Fed-org', 'Apple-org', 'S P ASX-org', 'European Union-org', 'MSCI-org', 'S P NZX-org', 'UK-org', 'Reuters Gold-org', 'CSI300-org', 'STOXX-org', 'Shanghai Composite Index-org', 'LIVE-org', 'Reuters China-org', 'EU-org', 'Treasury-org', 'Qatar-org', 'Gold-org']
Hey1!
['President Donald Trump-per', 'Dow Jones Industrial Average-per', 'Trump-per', 'Medha Singh-per', 'Street-per', 'John Ruwitch-per', 'Samuel Shen-per', 'Wall Street-per', 'Saudi Arabia-per', 'Abu Dhabi-per', 'BHP Billiton-per', 'Dow-per', 'Susan Mathew-per', 'Agamoni Ghosh-per', 'Andrew Torchia-per', 'Celine Aswad-per', 'Brent-per', 'Luoyan Liu-per', 'Pr

['Hasbro-geo', 'Beale-geo', 'Newtoy-geo', 'Scrabble-geo', 'Air Jordans-geo', 'Virgo-geo', 'Aries-geo', 'Hangman-geo', 'Anagrams-geo', 'Wheel of Fortune-geo', 'Built-geo', 'Inc-geo', 'Dictionary-geo', 'Friends-geo', 'Magic-geo', 'Ebay-geo', 'Rhystic-geo', 'Pokemon-geo', 'Rhystic Studios-geo', 'Wizards-geo']
Hey1!
['Russian-gpe', 'British-gpe', 'American-gpe', 'Brazilian-gpe', 'Chinese-gpe', 'Australian-gpe', 'Japanese-gpe', 'French-gpe', 'German-gpe', 'Canadian-gpe', 'African-gpe', 'Americans-gpe', 'Kenyan-gpe', 'Dutch-gpe', 'Korean-gpe', 'Brazilians-gpe', 'Italian-gpe', 'Swiss-gpe', 'Olympian-gpe', 'Russians-gpe']
Hey1!
['IOC-org', 'Rio-org', 'Rio Olympics-org', 'WADA-org', 'Committee-org', 'RIO DE JANEIRO-org', 'IAAF-org', 'RIO DE JANEIRO Reuters-org', 'Olympics-org', 'US-org', 'Committee IOC-org', 'Open-org', 'NASA-org', 'American-org', 'Rio de Janeiro-org', 'Tour de France-org', 'World Anti Doping Agency WADA-org', 'Larsen C-org', 'Phelps-org', 'World Anti Doping Agency-org']
Hey1!


Hey1!
['T 0-org', 'Eikon Further-org', 'LTD-org', 'JV Source-org', 'Ltd-org', 'Shenzhen-org', 'Ministry of Finance-org', 'CFO-org', 'T 5-org', 'Beijing Headline News Beijing-org', 'CFO Source-org', 'Reuters Taiwan Semiconductor Manufacturing-org', 'Tokyo Stock Exchange TSE-org', 'Reuters Wuhan Thalys Medical Technology Inc-org', 'China Food and Drug Administration-org', 'R D-org', 'TSE Mothers-org', 'Reuters Fortis Inc-org', 'Chengdu-org', 'SEK-org']
Hey1!
['Aug-per', 'Co Ltd-per', 'Inc-per', 'Shenzhen-per', 'Ltd-per', 'Nov-per', 'Ningbo-per', 'Will-per', 'Samsung Electronics Co-per', 'Says JV-per', 'Zhejiang-per', 'Jiangsu-per', 'Technology Inc-per', 'Wuhan-per', 'Prime Corp-per', 'AOI Pro-per', 'Proceeds-per', 'Medy Tox Inc-per', 'Shanghai-per', 'Effective Sept-per']
Hey2!
['Aug-per', 'Co Ltd-per', 'Inc-per', 'Shenzhen-per', 'Nov-per', 'Ningbo-per', 'Will-per', 'Samsung Electronics Co-per', 'Says JV-per', 'Zhejiang-per', 'Jiangsu-per', 'Wuhan-per', 'Prime Corp-per', 'AOI Pro-per', 'P

Hey1!
['Americans-gpe', 'American-gpe', 'Chinese-gpe', 'French-gpe', 'German-gpe', 'British-gpe', 'Canadian-gpe', 'Australian-gpe', 'Swiss-gpe', 'Spanish-gpe', 'African-gpe', 'Japanese-gpe', 'Russian-gpe', 'English-gpe', 'Italian-gpe', 'Korean-gpe', 'Cuban-gpe', 'Cleeremans-gpe', 'Dutch-gpe', 'Brazilian-gpe']
Hey1!
['LSD-org', 'CTE-org', 'EEG-org', 'MRI-org', 'University-org', 'University of California-org', 'ADHD-org', 'FDA-org', 'NFL-org', 'Science Times-org', 'Neuralink-org', 'US-org', 'CNN-org', 'ALS-org', 'Kennedy-org', 'Gizmodo-org', 'DBS-org', 'UK-org', 'BCI-org', 'University College London-org']
Hey1!
['Parkinson-per', 'Johnson-per', 'Kennedy-per', 'Brown-per', 'Musk-per', 'Ramirez-per', 'Howard-per', 'Ajiboye-per', 'Tanzi-per', 'Louis-per', 'Kochevar-per', 'Behrmann-per', 'Smith-per', 'Liu-per', 'Neuron-per', 'Ramsey-per', 'Moulin-per', 'Miller-per', 'Elon Musk-per', 'Neuralink-per']
Hey2!
['Parkinson-per', 'Johnson-per', 'Kennedy-per', 'Brown-per', 'Musk-per', 'Ramirez-per', 

Hey1!
['Apple-org', 'Microsoft-org', 'Google-org', 'Intel-org', 'FDA-org', 'Verge-org', 'Motherboard-org', 'US-org', 'Facebook-org', 'AT T-org', 'Sony-org', 'AMD-org', 'CNBC-org', 'Amazon-org', 'FTC-org', 'CPSC-org', 'TSB-org', 'Apple Store-org', 'Touch ID-org', 'Apple Watch-org']
Hey1!
['John Deere-per', 'Amazon-per', 'Facebook-per', 'Motherboard-per', 'Mac-per', 'Jones-per', 'Gay Gordon Byrne-per', 'Kyle Wiens-per', 'Samsung-per', 'Nathan Proctor-per', 'Bose-per', 'Verge-per', 'MacBook Pro-per', 'Down Detector-per', 'Surface Pro-per', 'Gordon Byrne-per', 'Joy Con-per', 'Consumer Reports-per', 'Rossmann-per', 'Sabadell-per']
Hey2!
['John Deere-per', 'Amazon-per', 'Facebook-per', 'Motherboard-per', 'Mac-per', 'Jones-per', 'Gay Gordon Byrne-per', 'Kyle Wiens-per', 'Samsung-per', 'Nathan Proctor-per', 'Bose-per', 'Verge-per', 'Down Detector-per', 'Surface Pro-per', 'Joy Con-per', 'Consumer Reports-per', 'Rossmann-per', 'Sabadell-per']
Hey1!
['Amazon-geo', 'United States-geo', 'Samsung-ge

Hey1!
['Americans-gpe', 'American-gpe', 'Chinese-gpe', 'British-gpe', 'African-gpe', 'Physicians-gpe', 'Indian-gpe', 'Syrian-gpe', 'Canadian-gpe', 'French-gpe', 'English-gpe', 'Brazilian-gpe', 'Spanish-gpe', 'Jordan-gpe', 'Japanese-gpe', 'German-gpe', 'Australian-gpe', 'Romanian-gpe', 'Canadians-gpe', 'Mexican-gpe']
Hey1!
['CNN-org', 'US-org', 'Apple-org', 'Reuters Health-org', 'FDA-org', 'Google-org', 'CNBC-org', 'CMS-org', 'CDC-org', 'CVS-org', 'UK-org', 'University-org', 'Affordable Care Act-org', 'NHS-org', 'Medicaid-org', 'University of California-org', 'White House-org', 'Congress-org', 'American-org', 'HIV-org']
Hey1!
['Trump-per', 'Aetna-per', 'Amazon-per', 'President-per', 'Ali-per', 'Johnson-per', 'President Donald Trump-per', 'Overall-per', 'President Trump-per', 'Anderson-per', 'Lee-per', 'Frank-per', 'Cohen-per', 'Hall-per', 'Junjun-per', 'Smith-per', 'Obama-per', 'Louis-per', 'Davis-per', 'Jones-per']
Hey2!
['Trump-per', 'Aetna-per', 'Amazon-per', 'President-per', 'Ali-pe

Hey1!
['Chinese-gpe', 'French-gpe', 'German-gpe', 'American-gpe', 'Canadian-gpe', 'British-gpe', 'Russian-gpe', 'Brazilian-gpe', 'Swiss-gpe', 'Japanese-gpe', 'Italian-gpe', 'Indian-gpe', 'Australian-gpe', 'Swedish-gpe', 'Americans-gpe', 'African-gpe', 'Dutch-gpe', 'Nigerian-gpe', 'Korean-gpe', 'Malaysian-gpe']
Hey1!
['Apple-org', 'Google-org', 'Supreme Court-org', 'Justice Department-org', 'SEC-org', 'J J-org', 'FTC-org', 'Microsoft-org', 'FBI-org', 'Congress-org', 'Department of Justice-org', 'District Court-org', 'Facebook-org', 'US-org', 'FDA-org', 'CNN-org', 'Commission-org', 'CNBC-org', 'DOJ-org', 'Circuit Court of Appeals-org']
Hey1!
['Trump-per', 'Johnson Johnson-per', 'Shkreli-per', 'Uber-per', 'Qualcomm-per', 'President Donald Trump-per', 'Jonathan Stempel-per', 'Ghosn-per', 'Bayer-per', 'Amazon-per', 'Musk-per', 'Waymo-per', 'Avenatti-per', 'Brafman-per', 'Facebook-per', 'Smith-per', 'Eni-per', 'Johnson-per', 'Meng-per', 'Lee-per']
Hey2!
['Trump-per', 'Johnson Johnson-per', '

Hey1!
['EU-org', 'European Union-org', 'UK-org', 'European Commission-org', 'Commission-org', 'European Parliament-org', 'Reuters Britain-org', 'May-org', 'BBC-org', 'Labour-org', 'United Kingdom-org', 'Reuters British-org', 'Labour Party-org', 'Bank of England-org', 'Sept-org', 'Nov-org', 'Guy Faulconbridge-org', 'Parliament-org', 'European Council-org', 'Conservative Party-org']
Hey1!
['Johnson-per', 'Prime Minister Theresa May-per', 'Prime Minister Boris Johnson-per', 'Trump-per', 'William James-per', 'Kate Holton-per', 'Elizabeth Piper-per', 'Prime-per', 'Hammond-per', 'Jeremy Corbyn-per', 'Michael Holden-per', 'Kylie MacLellan-per', 'Boris Johnson-per', 'Paul Sandle-per', 'President Donald Trump-per', 'Huw Jones-per', 'David Cameron-per', 'Leyen-per', 'Alistair Smout-per', 'William Schomberg-per']
Hey2!
['Johnson-per', 'Prime Minister Theresa May-per', 'Trump-per', 'William James-per', 'Kate Holton-per', 'Elizabeth Piper-per', 'Hammond-per', 'Jeremy Corbyn-per', 'Michael Holden-pe

Hey1!
['Japanese-gpe', 'American-gpe', 'British-gpe', 'Chinese-gpe', 'French-gpe', 'Russian-gpe', 'German-gpe', 'Americans-gpe', 'Guardian-gpe', 'Italian-gpe', 'Which-gpe', 'Canadian-gpe', 'Indian-gpe', 'Australian-gpe', 'English-gpe', 'Spanish-gpe', 'Irish-gpe', 'Korean-gpe', 'Swedish-gpe', 'Polish-gpe']
Hey1!
['Apple-org', 'Google-org', 'VR-org', 'Which-org', 'Wild-org', 'Breath-org', 'US-org', 'Microsoft-org', 'Dark Souls-org', 'Star Wars-org', 'PS4-org', 'UK-org', 'Sony-org', 'Disney-org', 'Link-org', 'Wii U-org', 'Facebook-org', 'Resident Evil-org', 'Switch-org', 'RPG-org']
Hey1!
['Follow Patrick-per', 'Well-per', 'Trump-per', 'Tandy-per', 'Click-per', 'Resident Evil-per', 'Cameron-per', 'Mario Maker-per', 'Siri-per', 'Hell-per', 'Pokémon Go-per', 'Noor-per', 'Follow Mike-per', 'Spider Man-per', 'Mike-per', 'Mac-per', 'God-per', 'Xbox One-per', 'Prime-per', 'Sky-per']
Hey2!
['Follow Patrick-per', 'Well-per', 'Trump-per', 'Tandy-per', 'Click-per', 'Resident Evil-per', 'Cameron-per'

Hey2!
[]
Hey1!
[]
Hey1!
['Chinese-gpe', 'German-gpe', 'American-gpe', 'Japanese-gpe', 'Americans-gpe', 'French-gpe', 'Swedish-gpe', 'Israeli-gpe', 'Salehian-gpe', 'Korean-gpe', 'British-gpe', 'Canadian-gpe', 'Russian-gpe', 'Sebastian-gpe', 'Australian-gpe', 'Embark-gpe', 'Tech-gpe', 'Dutch-gpe', 'Minivans-gpe', 'Which-gpe']
Hey1!
['Google-org', 'Apple-org', 'Ford-org', 'Waymo-org', 'Uber-org', 'Lyft-org', 'Intel-org', 'NTSB-org', 'Super Cruise-org', 'US-org', 'NHTSA-org', 'BMW-org', 'Alphabet-org', 'General Motors-org', 'Pittsburgh-org', 'LIDAR-org', 'Singapore-org', 'National Highway Traffic Safety Administration-org', 'DMV-org', 'CES-org']
Hey1!
['Uber-per', 'Waymo-per', 'Mobileye-per', 'Toyota-per', 'Levandowski-per', 'Musk-per', 'Otto-per', 'Elaine Herzberg-per', 'Krafcik-per', 'Delphi-per', 'Elon Musk-per', 'Mercedes Benz-per', 'Anthony Levandowski-per', 'Urmson-per', 'David Shepardson-per', 'Kalanick-per', 'Chris Urmson-per', 'Travis Kalanick-per', 'John Krafcik-per', 'Khosrowsha

Hey1!
['Singapore-org', 'SK Hynix-org', 'KS-org', 'WTO-org', 'JSR-org', 'Showa Denko KK-org', 'Seoul-org', 'Makiko Yamazaki-org', 'Heekyong Yang-org', 'Yonhap-org', 'JAPAN AIRLINES-org', 'TOKYO SEOUL Reuters Japan-org', 'LG-org', 'T and Shin Etsu Chemical-org', 'Reuters SK Hynix Inc SK Hynix-org', 'DRAM-org', 'SEOUL SINGAPORE-org', 'Reuters Pirates-org', 'Singapore Strait-org', 'CK BlueBell-org']
Hey1!
['Kanto Denka Kogyo-per', 'Nippon Steel-per', 'David Dolan-per', 'John Geddie-per', 'Joyce Lee-per', 'Josh Smith-per', 'Mahindra Solar-per', 'Prime Minister Mohamad Mahathir-per', 'Mahathir-per', 'Shazlie Akbar-per', 'Joe Brock-per', 'IT MATTER-per', 'LG Display-per', 'Ram Technology-per', 'Ju min Park-per']
Hey2!
['Kanto Denka Kogyo-per', 'Nippon Steel-per', 'David Dolan-per', 'John Geddie-per', 'Joyce Lee-per', 'Josh Smith-per', 'Mahindra Solar-per', 'Prime Minister Mohamad Mahathir-per', 'Shazlie Akbar-per', 'Joe Brock-per', 'IT MATTER-per', 'LG Display-per', 'Ram Technology-per', 'Ju

In [27]:
# Mini-test algo for 'duplicate' removal above
ml = ['TRUMP','donald trump','Rudolph Giuliani','giuliani']#'joe biden','Biden']

inds_to_drop = []
for i in range(len(ml)):
    name = ml[i]
    for j in range(i+1,len(ml)):
        if (ml[i].lower() in ml[j].lower()) | (ml[j].lower() in ml[i].lower()):
            namei,namej = ml[i], ml[j]
            if len(namei) > len(namej):
                inds_to_drop.append(j)
            else:
                inds_to_drop.append(i)
                
#inds_to_keep = [ind for ind in range if ind not in inds_to_drop]

ml = [ml[i] for i in range(len(ml)) if i not in inds_to_drop]
ml

['donald trump', 'Rudolph Giuliani']

In [15]:
#Example output from the NER determination
for item in ner_dict_stripped[1]:
    print(item)

American-gpe
Chinese-gpe
African-gpe
German-gpe
Americans-gpe
Google-org
University-org
US-org
CNN-org
University of California-org
Sendler-per
Johnson-per
Trump-per
Redfield-per
Harvard-per
United States-geo
China-geo
New York-geo
Washington-geo
California-geo


In [55]:
# Final answer
ner_dict_stripped

{0: {},
 1: {'American-gpe': 195,
  'Chinese-gpe': 168,
  'African-gpe': 71,
  'German-gpe': 50,
  'Americans-gpe': 45,
  'Google-org': 300,
  'University-org': 155,
  'US-org': 151,
  'CNN-org': 137,
  'University of California-org': 123,
  'Sendler-per': 88,
  'Johnson-per': 73,
  'Trump-per': 59,
  'Redfield-per': 42,
  'Harvard-per': 39,
  'United States-geo': 232,
  'China-geo': 177,
  'New York-geo': 116,
  'Washington-geo': 102,
  'California-geo': 89},
 2: {},
 3: {'African-gpe': 144,
  'British-gpe': 108,
  'French-gpe': 86,
  'German-gpe': 84,
  'Chinese-gpe': 75,
  'Nov-org': 79,
  'Reuters A-org': 62,
  'Reuters South Africa-org': 54,
  'Google-org': 50,
  'Eskom-org': 49,
  'Noor Zainab Hussain-per': 69,
  'Tanisha Heiberg-per': 29,
  'Nikhil Kurian Nainan-per': 23,
  'Alexander Cornwell-per': 23,
  'Yadarisa Shabong-per': 22,
  'Bengaluru-geo': 593,
  'China-geo': 163,
  'South Africa-geo': 113,
  'Jan-geo': 97,
  'United States-geo': 96},
 4: {'Swiss-gpe': 403,
  'Americ

In [57]:
filename = 'models/14passes_265_ents_dict(stripped_final_sep_14).pkl'
pickle.dump(ner_dict_stripped,open(filename,'wb'))