The environment below is required to run this notebook.

In [1]:
print(__import__('sys').version)
!conda list -n NLP37

3.7.1 (default, Dec 10 2018, 22:54:23) [MSC v.1915 64 bit (AMD64)]
# packages in environment at C:\Anaconda3\envs\NLP37:
#
# Name                    Version                   Build  Channel
_pytorch_select           1.1.0                       cpu  
altair                    3.1.0                    py37_0    conda-forge
asn1crypto                0.24.0                   py37_0  
atomicwrites              1.3.0                    py37_1  
attrs                     19.1.0                   py37_1  
backcall                  0.1.0                    py37_0  
beautifulsoup4            4.7.1                    pypi_0    pypi
blas                      1.0                         mkl  
boto                      2.49.0                   py37_0    anaconda
boto3                     1.9.162                    py_0    anaconda
botocore                  1.12.163                   py_0    anaconda
branca                    0.3.1                      py_0    conda-forge
bs4                       0.

In [1]:
from tqdm import tqdm
from toolz import compose, curry, concat
from itertools import combinations
from collections import Counter
from operator import itemgetter
from unicodedata import normalize

import os, sys, re, datetime, pycountry

try:
    import cPickle as pickle
except:
    import pickle

In [2]:
def load_pickle(filename):
    '''
    Read one pickled object back from disk.

    filename: path of the pickle file; it is passed through
              os.path.normpath before being opened.

    Returns the unpickled object.

    NOTE: unpickling can run arbitrary code, so only point this at files
    produced by this project.
    '''
    print('Loading file...')
    path = os.path.normpath(filename)
    with open(path, 'rb') as handle:
        contents = pickle.load(handle)
    return contents

def save_pickle(filename, data):
    '''
    Pickle `data` to disk, overwriting any existing file.

    filename: destination path; it is passed through os.path.normpath
              before being opened.
    data:     any picklable object.
    '''
    print('saving...')
    target = os.path.normpath(filename)
    with open(target, 'wb') as handle:
        pickle.dump(data, handle)
        

In [3]:
# Absolute, machine-specific folder holding the input data.
# NOTE(review): not referenced in the part of the notebook visible here, and
# it will not exist on another machine - consider making it configurable.
INPUT_FOLDER = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\__Data__\temp'

# Pickled lookup tables. The paths are relative to the working directory and
# use Windows separators; each file is read with load_pickle by the functions
# that take the constant as a default argument.
KNOWN_PERSONS = r'lookups\normalized_person_lookup.pkl'

KNOWN_ORGANIZATIONS = r'lookups\normalized_organization_lookup.pkl'

KNOWN_LOCATIONS = r'lookups\clean_location_lookup.pkl'

RESOLVED_ENTITIES = r'lookups\normalized_resolution_lookup.pkl'

# Form handed to unicodedata.normalize by normalize_unicode.
NORMALIZATION_PROTOCOL = 'NFKD' # valid forms: NFC, NFD, NFKC, NFKD

# Matches a single control character (U+0000-U+001F or U+007F-U+009F) or a
# run of two or more whitespace characters.
CTRL_REGEX = re.compile(r'[\x00-\x1f\x7f-\x9f]|\s{2,}')

# Superseded by STOPENTS; kept for reference only.
# STOPWORDS = {'ENDOFDOC', 'Telegraph', 'Agence France-Presse', 'Guardian', 'God', 'Allah', 'Open',
#              'Northen', 'North'}

# Tags whose entities stopword_filter discards.
STOPTAGS = {'IDEOLOGY', 'HANDLE', 'CAUSE_OF_DEATH', 'TITLE', 'CRIMINAL_CHARGE', 
            'URL', 'EMAIL', 'RELIGION', 'MISC'}

# Maps upper-case NER tags onto the three tags used downstream
# (applied by tag_correction; tags missing from this table are left as-is).
TRANSLATOR = {'PERSON': 'person',
              'LOCATION': 'location',
              'CITY': 'location',
              'COUNTRY': 'location',
              'STATE_OR_PROVINCE': 'location',
              'NATIONALITY': 'location',
              'ORGANIZATION': 'organization',

              }

# Hand-made corrections used by manual_resolution, which reads a 'word' and a
# 'tag' from the value stored under each entity word.
# NOTE: this reads a file as soon as the cell runs.
MANUAL_TRANS = load_pickle(r'lookups\manual_trans.pkl')

# Entity strings that stopword_filter discards outright (exact match on
# ent['word']): boilerplate, publication names and mis-tagged tokens.
#
# Fixes relative to the previous version:
#   * a comma was missing between 'Express Tribune' and 'Rotten Tomatoes', so
#     Python joined them into the single string 'Express TribuneRotten Tomatoes'
#     and neither name was filtered;
#   * ' 202:57' carried a stray leading space; the clean form is added next to
#     it (the old spelling is kept in case raw input really contains it).
STOPENTS = {'5ft 5ins', 'Olaf',
            'Beyonce-obsessed Tracey', 'A. Got', 'Spotify', "'' Hidalgo",
            'PSGCredit', 'BST The Labour', 'ENDOFDOC', 'Central', 'Republic', 'BBC Monitoring', 'A. Headlines B. Main',
            'B. Home', 'CITY', 'fold', 'C. Adverts', 'N.', 'CNN', 'Telegraph', 'Agence France-Presse', 'Guardian', 'God',
            'Allah', 'Open', 'Northen', 'North', 'New York Times', 'Times', 'His Highness', 'Region', 'AFP',
            'BBC Monitoring in Arabic', 'BBC', ' 202:57', '202:57', '229:19', '3in', 'Aa', 'U S', 'Interfax',
            'Interfax Ukraine', 'A A', 'Aaaaa', 'PST', 'GMT', 'UTC', 'AM', 'PM', 'Santa Claus', 'Santa', 'URL', 'URLs',
            'Express Tribune', 'Rotten Tomatoes', 'RSVP', 'Resident Evil', 'Daily Nation', 'Daily Star', 'Telegram'}


# Names (and generic media words) of publications in the corpus.
# filter_publications drops any entity containing one of these as a substring
# of the word or of its lower-cased form; remove_press compares the lower-cased
# entries against individual words.
#
# Fix: a comma was missing after 'Rapporteur', which fused it with
# 'television' into 'Rapporteurtelevision', so neither term was matched.
PUBLICATIONS = {
                'BBC', 'new york times', 'the guardian', 'telegraph', 'agence france presse', 'CNN', 'UPI', 'ukrinform',
                'associated press', 'Aaj Shahzeb Khanzada', 'News', 'TV', 'Channel', 'Radio', 'Rapporteur',
                'television', 'Online', 'Press',
                }

# Substrings that are swapped for a single space.
REPLACE_WITH_SPACE = {
    ' -RSB- ',
    ' -RRB- ',
    ' -LSB- ',
    '-',
    '  ',
    ' aka ',
    ' " ',
    ' ; ',
}

# {word of the current entity: {word of the previous entity: merged word}}
ENTITY_MERGE_MAP = {
    ', D.C.': {'Washington': 'Washington DC'},
    'Petersburg': {'St.': 'St. Petersburg'},
}

# Substrings that are deleted.
REPLACE_WITH_EMPTY = {
    '/Telegraph',
    'Tory Brexiter ',
    '/PA',
    '/FilmMagic Right',
    '-born',
    '``',
    '-owned',
    '-backed',
    'His Royal Majesty ',
    '-held',
    '.',
    'Associated Press of ',
    '`',
    'His Highness',
    'Jankovska',
}

# Extra substring -> replacement pairs kept in a pickle.
# NOTE: this reads a file as soon as the cell runs.
REPLACE_SPECIAL = load_pickle(r'lookups\replace_special.pkl')


# One table for all substring replacements. Later sources win when the same
# key appears twice, so REPLACE_SPECIAL overrides the two sets.
REPLACE_TRANS = { **{i: ' ' for i in REPLACE_WITH_SPACE},
                  **{i: '' for i in REPLACE_WITH_EMPTY},
                  **REPLACE_SPECIAL
                 }

# Keys are regex-escaped so that they match literally; a match is looked up
# again with re.escape(match.group(0)).
REPLACE_TRANS = dict((re.escape(k), v) for k, v in REPLACE_TRANS.items())

# Regex alternation takes the FIRST alternative that matches, not the longest.
# In plain dict order '-' (replace with a space) came before '-born', '-owned',
# '-backed' and '-held' (delete), so the longer keys could never match and
# 'Syrian-born' became 'Syrian born'. Listing the alternatives longest first
# lets the most specific key win; ties are broken alphabetically so the
# pattern is the same on every run.
REPLACE_REGEX = re.compile('|'.join(sorted(REPLACE_TRANS, key=lambda k: (-len(k), k))))

# An abbreviation is exactly two or three capital letters, e.g. 'UN' or 'NHS'.
ACRO_REGEX = re.compile(r'^[A-Z]{2,3}$')

# Filler words that do not contribute a letter to an acronym; each one is
# replaced by a single space. Keys are regex-escaped so they match literally.
ACRO_REPLACE_TRANS = {re.escape(filler): ' ' for filler in (' and ', ' of ', ' the ')}

ACRO_REPLACE_REGEX = re.compile('|'.join(ACRO_REPLACE_TRANS))


Loading file...
Loading file...


In [4]:
def directory_explorer(extension, directory):
    '''
    A generator to find filenames with a given extension within a given
    directory.

    extension: suffix to look for, e.g. '.pkl'. The comparison ignores case,
               so '.pkl', '.PKL' and '.Pkl' files are all found (previously
               only the all-lower and all-upper spellings were).
    directory: folder to list (not searched recursively).

    Yields '<directory>/<filename>' for every matching entry, in the order
    os.listdir returns them.
    '''
    wanted = extension.lower()
    for filename in os.listdir(os.path.normpath(directory)):
        if filename.lower().endswith(wanted):
            yield '%s/%s' % (directory, filename)
            
def replace_bad_chars(text, regex=CTRL_REGEX):
    '''
    Puts a single space in place of every match of `regex` in `text`.

    With the default pattern a match is either one control character (a
    newline, for instance) or a run of two or more whitespace characters.
    '''
    cleaned = regex.sub(' ', text)
    return cleaned

def normalize_unicode(text, protocol=NORMALIZATION_PROTOCOL):
    '''
    Reduce `text` to plain ASCII.

    The text is first normalised with the given unicode form, then encoded to
    ASCII with every character that has no ASCII encoding silently dropped,
    and finally decoded back into a str.
    '''
    normalised = normalize(protocol, text)
    ascii_bytes = normalised.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf8')

def _ampersand_to_and(text):
    '''
    Spell out every ampersand as the word "and"
    '''
    return text.replace('&', 'and')

# compose applies its functions from right to left, so text is first reduced
# to ASCII, then stripped of control characters / repeated whitespace, and
# only then are ampersands spelled out.
clean_text = compose(
    _ampersand_to_and,
    replace_bad_chars,
    normalize_unicode,
)

Many functions take a variable called `data` as input. `data` is a list of dictionaries, where each
dictionary (`doc`) represents one document.

doc = {

        'id': the unique id of the doc,

       'datetime': a datetime object of when the document was published,
       
       'sentences': a list of lists where each nested list represents a sentence
       
       }
       
      
doc['sentences'] = [

                   [ent1, ent2, ent3, ent4, ent1, ent2],
                    
                   [ent1, ent5, ent6, ent5, ent2],
                    
                   ]
               
               
ent = {

       'word': word that occurred in the document,
       
       'words': the word split by a space i.e. ent['word'].split(' '),
       
       'tag': the type of entity i.e. person, location, organization,
       
        'len': number of characters in the word,
    
        'lemma': the ent word but lowercase i.e. ent['word'].lower(),
       
       }

In [6]:
def sentence_iter(data):
    '''
    Yields every sentence of every document in `data`, in document order
    '''
    for document in data:
        yield from document['sentences']
                
def ent_iter(data, sentence_iter=sentence_iter):
    '''
    Yields every entity of every sentence in `data`, in document order.
    `sentence_iter` is the callable used to walk the sentences.
    '''
    for sent in sentence_iter(data):
        yield from sent
            
def filter_empty(data):
    '''
    Drops every entity whose 'word' is empty or only whitespace.

    Each sentence is replaced, inside its document, by a new list holding the
    surviving entities. A sentence left without entities stays in the
    document as an empty list. Returns `data`.
    '''
    for document in data:
        sentences = document['sentences']
        for position in range(len(sentences)):
            sentences[position] = [ent for ent in sentences[position]
                                   if ent['word'].strip()]
    return data

def filter_size(data):
    '''
    Drops every entity whose 'word' is shorter than two characters.

    Each sentence is replaced, inside its document, by a new list holding the
    surviving entities. A sentence left without entities stays in the
    document as an empty list. Returns `data`.
    '''
    for document in data:
        sentences = document['sentences']
        for position in range(len(sentences)):
            sentences[position] = [ent for ent in sentences[position]
                                   if len(ent['word']) > 1]
    return data

def filter_publications(data, pubs=PUBLICATIONS):
    '''
    Drops entities that mention a publication of the corpus.

    An entity is dropped when any entry of `pubs` is a substring of its word
    or of the lower-cased word. Sentences are rebuilt in place; a sentence
    may end up empty. Returns `data`.
    '''
    for document in data:
        sentences = document['sentences']
        for position, sentence in enumerate(sentences):
            kept = []
            for ent in sentence:
                raw = ent['word']
                mentions_pub = any(pub in raw or pub in raw.lower() for pub in pubs)
                if not mentions_pub:
                    kept.append(ent)
            sentences[position] = kept
    return data

def generate_entity(word, tag, resolved=None):
    '''
    Builds an entity dictionary from a piece of text and a tag.

    word:     text of the entity; it is split on single spaces and empty
              pieces are discarded.
    tag:      stored under both 'tag' and 'ner'.
    resolved: stored under 'resolved' unless it is None, in which case the
              key is left out.

    Keys of the result: 'tag', 'ner', 'words' (list of pieces), 'n_words',
    'word' (the pieces joined by one space), 'len' (characters in 'word'),
    'lemma' (lower-cased 'word') and optionally 'resolved'.
    '''
    pieces = [piece.strip() for piece in word.split(' ') if piece]
    text = ' '.join(pieces).strip()
    entity = {
        'tag': tag,
        'ner': tag,
        'words': pieces,
        'n_words': len(pieces),
        'word': text,
        'len': len(text),
        'lemma': text.lower(),
    }
    if resolved is not None:
        entity['resolved'] = resolved
    return entity

def analyze_entities(
                     data,
                     ent_iter=ent_iter,
                     generate_entity=generate_entity
                     ):
    '''
    Upgrades every entity to the current entity format.

    Each entity is rebuilt by `generate_entity` from its 'word' and 'tag';
    all other keys of the old entity are discarded. Sentences are updated in
    place and `data` is returned. `ent_iter` is accepted but not used.
    '''
    print('Analyzing entitites...')
    every_sentence = list(sentence_iter(data))
    for sentence in tqdm(every_sentence):
        for position in range(len(sentence)):
            old = sentence[position]
            sentence[position] = generate_entity(old['word'], old['tag'])
    return data

def resolution_check(
                     data,
                     sentence_iter=sentence_iter,
                     load_pickle=load_pickle,
                     resolved_entities=RESOLVED_ENTITIES,
                     ):
    '''
    Marks each entity as resolved or not by looking its word up in the
    resolution lookup.

    data:              list of documents, modified in place.
    sentence_iter:     callable yielding the sentences of `data`.
    load_pickle:       callable used to read the lookup.
    resolved_entities: path of the pickled lookup; only membership tests are
                       performed on the loaded object.

    For every entity not already flagged as resolved:
      * word found in the lookup -> 'resolved' is set to True;
      * hyphenated word whose parts are all in the lookup -> the entity is
        replaced by one resolved entity per part (the extra parts are
        appended to the end of the sentence);
      * hyphenated word found once hyphens become spaces -> replaced by that
        spelling, resolved;
      * word ending in 's'/'S' whose stem is in the lookup -> replaced by the
        stem, resolved;
      * anything else -> 'resolved' is set to False.

    Returns `data`.
    '''
    resolution_checker = load_pickle(resolved_entities)
    print('Checking resolutions...')
    for sentence in tqdm(list(sentence_iter(data))):
        new_ents = []
        for i, ent in enumerate(sentence):

            # Already confirmed by an earlier stage
            if 'resolved' in ent and ent['resolved'] == True:
                continue

            if ent['word'] in resolution_checker:
                ent['resolved'] = True
            elif '-' in ent['word']:
                words = [x for x in ent['word'].split('-') if x.strip()]
                # `words` is empty when the entity consists of hyphens only.
                # all() of nothing is True, which used to lead to words[0]
                # raising IndexError, so at least one part is required.
                if words and all(word in resolution_checker for word in words):
                    sentence[i] = generate_entity(words[0], ent['tag'], True)
                    for word in words[1:]:
                        new_ents.append(generate_entity(word, ent['tag'], True))
                elif ent['word'].replace('-', ' ') in resolution_checker:
                    sentence[i] = generate_entity(ent['word'].replace('-', ' '), ent['tag'], True)
                else:
                    ent['resolved'] = False
            elif ent['len'] > 1 and ent['word'][-1] in {'s', 'S'} \
                 and ent['word'][: -1] in resolution_checker:
                    sentence[i] = generate_entity(ent['word'][: -1], ent['tag'], True)
            else:
                ent['resolved'] = False

        # Added after the loop so the sentence is not grown while it is walked
        sentence.extend(new_ents)

    return data

def manual_resolution(data, 
                      sentence_iter=sentence_iter,
                      generate_entity=generate_entity,
                      trans=MANUAL_TRANS):
    '''
    Applies the hand-made corrections in `trans`.

    An entity whose word is a key of `trans` is replaced by a resolved entity
    built from the 'word' and 'tag' stored under that key. Sentences are
    updated in place and `data` is returned.
    '''
    print('Manually resolving entities...')
    for sentence in tqdm(list(sentence_iter(data))):
        for position, ent in enumerate(sentence):
            key = ent['word']
            if key not in trans:
                continue
            target = trans[key]
            sentence[position] = generate_entity(target['word'], target['tag'], True)
    return data
        
def entity_type_set(data, tag, ent_iter=ent_iter):
    '''
    For viewing all unique entities of a type for debugging purposes
    Types: 'PERSON', 'COUNTRY', 'CITY', 'LOCATION', 'NATIONALITY', 
           'IDEOLOGY', 'HANDLE', 'CAUSE_OF_DEATH', 'TITLE',
           'CRIMINAL_CHARGE', 'URL', 'EMAIL', 'RELIGION', 'MISC'

    Returns a set of tuples (word, document id) or, for entities that carry
    a 'ner' key, (word, document id, ner). The document id is None for a
    document without an 'id' key.

    Fix: the body used to read doc['id'] although no `doc` existed inside the
    function, which raised NameError (or silently picked up a stale global).
    Documents are now walked explicitly so every entity is paired with the
    document it came from.
    '''
    output = set()
    for doc in tqdm(data):
        doc_id = doc.get('id')
        # ent_iter expects a list of documents, hence the one-element list
        for ent in ent_iter([doc]):
            if ent['tag'] == tag:
                _append = [ent['word'], doc_id]
                if 'ner' in ent:
                    _append.append(ent['ner'])
                output.add(tuple(_append))
    return output

In [6]:
def remove_leading_and_trailing_stops(data,
                                      sentence_iter=sentence_iter,
                                      generate_entity=generate_entity):
    '''
    Removes stopwords that occur at the start or end of the text of an entity

    Only unresolved entities are touched. For an entity of more than one
    word, one leading stopword and then one trailing stopword are removed
    (compared case-insensitively). Every unresolved entity is then rebuilt
    with `generate_entity`, whether or not a word was removed.

    Changes relative to the previous version:
      * the `changes` list was filled but never saved or returned; it is gone;
      * the words are trimmed on a local copy instead of being written back
        into the entity that is about to be replaced, so an entity dict that
        appears at several positions is trimmed the same way every time.

    Returns `data` (sentences are updated in place).
    '''

    stops = {'the', 'of', 'and', 'for', 'this', 'what', 'most', 'much',
             'over', 'qc', 'so', 'moreover', 'os', 'has', 'perhaps'}
    for sentence in tqdm(list(sentence_iter(data))):
        for i, ent in enumerate(sentence):
            if ent['resolved']:
                continue
            words = [w for w in ent['words'] if w.strip()]
            if len(words) > 1:
                if words[0].lower() in stops:
                    words = words[1:]
                if words[-1].lower() in stops:
                    words = words[:-1]
            sentence[i] = generate_entity(' '.join(words), ent['tag'], ent['resolved'])
    return data

def remove_press(data,
                  sentence_iter=sentence_iter,
                  generate_entity=generate_entity,
                 pubs=PUBLICATIONS):
    '''
    Removes entities that contain the name of a publication as one of their
    words, then removes sentences left without entities.

    data: list of documents, modified in place.
    pubs: publication names; compared case-insensitively against each single
          word of an entity (so multi-word entries can never match here).
    `sentence_iter` and `generate_entity` are accepted but not used.

    Fix: the "keep this entity" step used to sit inside the loop over the
    entity's words. A clean entity of n words was therefore appended n times,
    and an entity whose press word came second or later was still appended
    for each word before it. Every entity is now judged once, as a whole.
    Entities without any words are dropped, as before.

    Returns `data`.
    '''
    pubs = {s.lower() for s in pubs}
    for doc in data:
        kept_sentences = []
        for sentence in doc['sentences']:
            kept = [ent for ent in sentence
                    if ent['words']
                    and not any(word.lower() in pubs for word in ent['words'])]
            if kept:
                kept_sentences.append(kept)
        doc['sentences'] = kept_sentences

    return data

def resolve_s_endings(data,
                      ent_iter=ent_iter,
                      sentence_iter=sentence_iter,
                      generate_entity=generate_entity):
    '''
    Replaces an unresolved entity ending in "s" by the same word without the
    "s" when that shorter word occurs as an entity somewhere in `data`.

    The set of known words is collected once, before anything is changed.
    Every replacement is logged as (old entity, new word) and the log is
    pickled to the resolve_s_endings.chge file. Returns `data` (sentences are
    updated in place).

    Fixes:
      * the comparison used ent1['word'][-1] (the last character) where the
        word minus its final "s" was meant, so nothing useful ever matched;
      * the replacement read `ent['resolved']` although only `ent1` exists,
        which raised NameError;
      * the scan over every known word is replaced by a set lookup.
    '''

    changes = []
    ents = set(ent['word'] for ent in ent_iter(data))

    for sentence in tqdm(list(sentence_iter(data))):
        for i, ent1 in enumerate(sentence):
            if ent1['resolved']:
                continue
            if ent1['len'] > 1 and ent1['word'].endswith('s'):
                singular = ent1['word'][:-1]
                if singular in ents:
                    sentence[i] = generate_entity(singular, ent1['tag'], ent1['resolved'])
                    changes.append((ent1, singular))
    save_pickle(r'logs\resolve_s_endings.chge', changes)
    return data
    
def resolve_multi_locations(data, 
                            sentence_iter=sentence_iter,
                            generate_entity=generate_entity,
                            locations=KNOWN_LOCATIONS,
                            ):
    '''
    Splits an unresolved multi-word entity into one entity per word when
    every word, lower-cased, is a key of the location lookup.

    The first word replaces the original entity; the remaining ones are
    appended to the end of the sentence. Each new entity takes its text from
    the lookup and keeps the tag and 'resolved' value of the original.
    `locations` is the path of the pickled lookup. Returns `data`.
    '''
    locations = load_pickle(locations)
    for sentence in sentence_iter(data):
        extras = []
        for position, ent in enumerate(sentence):
            if ent['resolved'] or not ent['n_words'] > 1:
                continue
            keys = [word.lower() for word in ent['words']]
            if not all(key in locations for key in keys):
                continue
            tag, flag = ent['tag'], ent['resolved']
            sentence[position] = generate_entity(locations[keys[0]], tag, flag)
            for key in keys[1:]:
                extras.append(generate_entity(locations[key], tag, flag))
        sentence.extend(extras)
    return data
  
    
def resolve_double_orgs(data, 
                        sentence_iter=sentence_iter,
                        generate_entity=generate_entity,
                        organizations=KNOWN_ORGANIZATIONS,
                        ):
    '''
    Splits an unresolved three-word entity of the form "A and B" / "A or B"
    into two resolved entities when both A and B are keys of the
    organization lookup.

    The first organization replaces the original entity and the second is
    appended to the end of the sentence; both take their text from the
    lookup and keep the original tag. Each split is logged as a pair of
    entities and the log is pickled to the resolve_double_orgs.chge file.
    `organizations` is the path of the pickled lookup. Returns `data`.

    Fix: the second entity was added with `new.append(new_ent, True)`, which
    raises TypeError because list.append takes one argument. The `True`
    belongs to generate_entity, so that the second entity is marked as
    resolved like the first one.
    '''
    changes = []
    organizations = load_pickle(organizations)
    for sentence in sentence_iter(data):
        new = []
        for i, ent in enumerate(sentence):
            if ent['resolved'] or ent['n_words'] != 3:
                continue
            if ent['words'][1] not in {'and', 'or'}:
                continue
            first, second = ent['words'][0], ent['words'][2]
            if first in organizations and second in organizations:
                sentence[i] = generate_entity(organizations[first], ent['tag'], True)
                new_ent = generate_entity(organizations[second], ent['tag'], True)
                new.append(new_ent)
                changes.append((sentence[i], new_ent))
        # Added after the loop so the sentence is not grown while it is walked
        sentence.extend(new)
    save_pickle(r'logs\resolve_double_orgs.chge', changes)
    return data


def capitalize_entities(data,
                        sentence_iter=sentence_iter):
    '''
    Re-cases every word of longer multi-word entities: first character upper
    case, the rest lower case.

    Only entities with more than one word and more than three characters are
    rebuilt; they keep their tag and 'resolved' value. Sentences are updated
    in place and `data` is returned.
    '''
    for sentence in sentence_iter(data):
        for position, ent in enumerate(sentence):
            if not (ent['n_words'] > 1 and ent['len'] > 3):
                continue
            recased = []
            for index in range(ent['n_words']):
                word = ent['words'][index]
                # word[1:] is empty for a one-character word
                recased.append(word[0].upper() + word[1:].lower())
            sentence[position] = generate_entity(' '.join(recased), ent['tag'], ent['resolved'])
    return data

In [8]:
def list_data_structure(data):
    '''
    Flattens batches of NER output into one list of documents.

    data: iterable of batches, each a dict {date: {id: document}}; the key
          'n_articles' of a batch is skipped.

    Every document gets its date stored under 'datetime', and an 'ents' key
    is renamed to 'sentences'. The documents are modified in place and
    returned as a list.
    '''
    print('Aggregating NER output into a list data structure...')
    output = []
    for batch in data:
        for date, articles in batch.items():
            if date == 'n_articles':
                continue
            for article in articles.values():
                article['datetime'] = date
                if 'ents' in article:
                    article['sentences'] = article.pop('ents')
                output.append(article)
    return output

def tag_correction(
                   data,
                   ent_iter=ent_iter, 
                   translator=TRANSLATOR
                   ):
    '''
    Normalises the tag of every entity.

    An entity without a 'ner' key gets one holding its lower-cased tag. A tag
    that is a key of `translator` is then replaced by the translated tag;
    any other tag is left unchanged. Entities are modified in place and
    `data` is returned.
    '''
    print('Correcting tags...')
    for ent in tqdm(list(ent_iter(data))):
        raw_tag = ent['tag']
        if 'ner' not in ent:
            ent['ner'] = raw_tag.lower()
        if raw_tag in translator:
            ent['tag'] = translator[raw_tag]
    return data
    
def stopword_filter(
                    data,
                    stoptags=STOPTAGS,
                    stopents=STOPENTS,
                    ):
    '''
    Drops unwanted entities and the sentences they leave empty.

    An entity is dropped when its tag is in `stoptags` or its word is in
    `stopents`. Each document's 'sentences' list is replaced by a new list
    that holds only the non-empty filtered sentences. Returns `data`.
    '''
    print('Filtering stopwords...')
    for doc in tqdm(data):
        kept_sentences = []
        for sentence in doc['sentences']:
            survivors = []
            for ent in sentence:
                if ent['tag'] in stoptags or ent['word'] in stopents:
                    continue
                survivors.append(ent)
            if survivors:
                kept_sentences.append(survivors)
        doc['sentences'] = kept_sentences
    return data

def entity_correction(
                      data,
                      sentence_iter=sentence_iter,
                      trans=REPLACE_TRANS,
                      regex=REPLACE_REGEX,
                      merge_map=ENTITY_MERGE_MAP
                      ):
    '''
    Implements some simple corrections that were noticed in the process.

    data:      list of documents, modified in place.
    trans:     {regex-escaped substring: replacement}.
    regex:     pattern matching any key of `trans`.
    merge_map: {word: {word of the previous entity: merged word}}.

    For every entity, in this order:
      1. punctuation and a possessive 's are removed from its first two and
         last two characters;
      2. if the word is longer than four characters and starts with four
         characters that parse as an integer, those four are cut off;
      3. 'X-led Y' becomes 'X', and 'Y' is appended to the sentence as a new
         entity holding only 'word', 'tag' and 'ner';
      4. every match of `regex` is replaced through `trans`;
      5. an entity listed in `merge_map` is folded into the entity before it
         and deleted;
      6. all-caps words are re-cased: any length for persons, three or more
         characters for locations (only if the entity has several words or
         more than three characters), four or more for organizations.

    Only 'word' is rewritten; the other keys of an entity are not refreshed.
    Every (before, after) pair is pickled to the entity_correction.chge log.
    Returns `data`.

    Fixes relative to the previous version:
      * step 1 stripped whitespace on both sides of the two edge pieces,
        gluing words together ('Malcolm X' became 'MalcolmX'); only the
        outer side of each piece is stripped now;
      * step 5 looked at sentence[i - 1] even for the first entity, which is
        the LAST entity of the sentence; the first entity is now never merged;
      * step 5 used `break`, leaving the rest of the sentence uncorrected;
        it now moves on to the next entity;
      * the bare `except` in step 2 is narrowed to ValueError.

    NOTE(review): the default `regex` deletes '.', and every earlier entity
    of the sentence has already been through it, so the default merge_map
    entries (which rely on ', D.C.' and 'St.') look unreachable - confirm.
    '''
    # Characters deleted from the edges of an entity
    table = str.maketrans('', '', '»|/\\{}[]!`"£$%^&@?><*(),.:;\'~#=+_')

    def depunc(x, table=table):
        # Whitespace is left alone here; each caller strips the side it owns
        return x.replace('\'s', '').translate(table)

    def recase(words, min_len):
        # Turn all-caps words of at least min_len characters into Capitalised form
        return ' '.join(w[0].upper() + w[1: ].lower()
                        if w.isupper() and len(w) >= min_len else w
                        for w in words)

    changes = []
    print('Correcting entity errors...')
    for sentence in tqdm(list(sentence_iter(data))):
        deletes = []
        new = []
        for i, ent in enumerate(sentence):
            start = ent['word']

            # 1. clean the edges, leaving the middle of the word untouched
            if len(ent['word']) > 3:
                ent['word'] = depunc(ent['word'][0: 2]).lstrip() \
                              + ent['word'][2: -2] \
                              + depunc(ent['word'][-2: ]).rstrip()
            elif len(ent['word']) == 3:
                ent['word'] = depunc(ent['word'][0]).lstrip() \
                              + ent['word'][1] \
                              + depunc(ent['word'][2]).rstrip()
            else:
                ent['word'] = depunc(ent['word']).strip()

            # 2. cut off a leading four-character number
            if len(ent['word']) > 4:
                try:
                    int(ent['word'][:4])
                except ValueError:
                    pass
                else:
                    ent['word'] = ent['word'][4: ]

            # 3. split 'X-led Y'
            if '-led' in ent['word']:
                words = ent['word'].split('-led')
                ent['word'] = words[0].strip()
                for word in words[1: ]:
                    stripped_word = word.strip()
                    if stripped_word:
                        new.append({'word': stripped_word, 'tag': ent['tag'], 'ner': ent['ner']})

            # 4. remove some errors
            ent['word'] = regex.sub(lambda x: trans[re.escape(x.group(0))], ent['word']).strip()

            # 5. merge some obvious entities together that should have been joined
            if i > 0 and ent['word'] in merge_map \
               and sentence[i - 1]['word'] in merge_map[ent['word']]:
                sentence[i - 1]['word'] = merge_map[ent['word']][sentence[i - 1]['word']]
                deletes.append(i)
                continue

            # 6. re-case all-caps words
            words = ent['word'].split(' ')
            if ent['tag'] == 'person':
                ent['word'] = recase(words, 1)
            elif ent['tag'] == 'location':
                if len(words) > 1 or len(ent['word']) > 3:
                    ent['word'] = recase(words, 3)
            elif ent['tag'] == 'organization':
                ent['word'] = recase(words, 4)
            changes.append((start, ent['word']))

        # Highest index first so the remaining indices stay valid
        for i in sorted(deletes, reverse=True):
            del sentence[i]
        sentence.extend(new)
    save_pickle(r'logs\entity_correction.chge', changes)
    return data

def remove_titles(data, ent_iter=ent_iter):
    '''
    Removes the titles Mr, Mrs and Ms (with or without a full stop, followed
    by a space) from ent['word'].

    Only 'word' is rewritten; the other keys of an entity are not refreshed.
    Entities are modified in place and `data` is returned.

    Changes relative to the previous version:
      * the pattern now starts at a word boundary, so it no longer fires
        inside another token ('ATMs Europe' used to become 'AT Europe');
      * a title is replaced by nothing instead of a space; the match already
        includes the space after the title, so a title in the middle of a
        name no longer leaves a double space behind;
      * the unused `original` local is gone.
    '''
    regex = re.compile(r'\bM(?:rs|r|s)\.? ')
    print('Removing titles...')
    for ent in tqdm(list(ent_iter(data))):
        if regex.search(ent['word']):
            ent['word'] = regex.sub('', ent['word']).strip()
    return data

In [9]:
def lookup_tag_resolution(
                          data,
                          ent_iter=ent_iter,
                          load_pickle=load_pickle,
                          persons=KNOWN_PERSONS,
                          locations=KNOWN_LOCATIONS,
                          organizations=KNOWN_ORGANIZATIONS,
                          ):
    '''
    Overrides the tag of entities that appear in one of the lookups.

    The lemma of each entity is checked against the person, location and
    organization lookups, in that order (the organization lookup is also
    checked with the word as written). Every hit sets both 'tag' and 'ner',
    so when several lookups contain the entity the last one wins.

    `persons`, `locations` and `organizations` are paths of pickled lookups.
    Each hit is logged as (entity, tag) and the log is pickled to the
    lookup_tag_resolution.chge file. Returns `data`.
    '''
    changes = []
    persons = load_pickle(persons)
    locations = load_pickle(locations)
    organizations = load_pickle(organizations)
    print('Resolving tags by database lookup...')
    for ent in tqdm(list(ent_iter(data))):
        lemma = ent['lemma']
        hits = []
        if lemma in persons:
            hits.append('person')
        if lemma in locations:
            hits.append('location')
        if lemma in organizations or ent['word'] in organizations:
            hits.append('organization')
        for label in hits:
            changes.append((ent, label))
            ent['tag'] = ent['ner'] = label
    save_pickle(r'logs\lookup_tag_resolution.chge', changes)
    return data

def lookup_entity_resolution(
                             data,
                             sentence_iter=sentence_iter,
                             generate_entity=generate_entity,
                             persons=KNOWN_PERSONS,
                             locations=KNOWN_LOCATIONS,
                             organizations=KNOWN_ORGANIZATIONS
                             ):
    '''
    Replaces unresolved entities by the entry the lookups hold for them.

    The checks run in a fixed order - persons (by lemma), locations (by
    lemma, else by word), organizations (by lemma, else by word) - and each
    check looks at the entity produced by the previous replacement, so one
    entity can be replaced more than once. A replacement is a resolved
    entity built from the lookup value and the matching tag.

    `persons`, `locations` and `organizations` are paths of pickled lookups.
    Each replacement is logged as (old word, new text) and the log is pickled
    to the lookup_entity_resolution.chge file. Returns `data`.
    '''
    changes = []
    persons = load_pickle(persons)
    locations = load_pickle(locations)
    organizations = load_pickle(organizations)

    def swap(sentence, position, canonical, tag):
        # Log the change, put the new entity in the sentence and hand it back
        changes.append((sentence[position]['word'], canonical))
        fresh = generate_entity(canonical, tag, True)
        sentence[position] = fresh
        return fresh

    print('Resolving entities by database lookup...')
    for sentence in tqdm(list(sentence_iter(data))):
        for i, ent in enumerate(sentence):
            if ent['resolved']:
                continue

            if ent['lemma'] in persons:
                ent = swap(sentence, i, persons[ent['lemma']], 'person')

            if ent['lemma'] in locations:
                ent = swap(sentence, i, locations[ent['lemma']], 'location')
            elif ent['word'] in locations:
                ent = swap(sentence, i, locations[ent['word']], 'location')

            if ent['lemma'] in organizations:
                ent = swap(sentence, i, organizations[ent['lemma']], 'organization')
            elif ent['word'] in organizations:
                ent = swap(sentence, i, organizations[ent['word']], 'organization')

    save_pickle(r'logs\lookup_entity_resolution.chge', changes)
    return data

def is_first_or_last_name(ent1, ent2, string=False):
    '''
    Tells whether ent1 is the first or the last word of ent2, ignoring case.

    ent1:   an entity, or a plain string when `string` is True.
    ent2:   an entity; it must have more than one word to match.
    string: set to True to pass ent1 as text instead of an entity.
    '''
    needle = ent1 if string else ent1['word']
    if not ent2['n_words'] > 1:
        return False
    needle = needle.lower()
    first = ent2['words'][0].lower()
    last = ent2['words'][-1].lower()
    return first == needle or last == needle

def resolve_hyphenated_words(data, sentence_iter=sentence_iter):
    '''
    Removes the hyphen in hyphenated words and tries to match each part to an
    entity mentioned earlier in the document.

    A part matches an earlier multi-word entity when it equals that entity's
    first or last word, ignoring case. Entities earlier in the same sentence
    are searched first (nearest first), then earlier sentences (nearest
    first; there the first and last word of the candidate must differ).
    Only if EVERY part finds a match is the hyphenated entity replaced: the
    first match takes its place and the others are appended to the end of
    the sentence. The matched entity objects themselves are reused, not
    copied.

    Each replacement is logged as the list of matches and the log is pickled
    to the resolve_hyphenated_words.chge file. `sentence_iter` is accepted
    but not used. Returns `data`.

    Fixes relative to the previous version:
      * the extra matches were added by a statement sitting outside the loop
        over sentences, so only the last sentence of a document ever received
        them (and a document without sentences hit an undefined or stale
        variable); they are now added to the sentence they belong to;
      * `sentence1[j - 1::-1]` and `doc['sentences'][i - 1::-1]` wrap around
        when the index is 0 and then cover the whole list, including LATER
        entities and sentences; only what really precedes is searched now;
      * the "found" flag was never reset between parts, so once one part had
        matched, earlier sentences were no longer searched for the others.
    '''
    changes = []
    for doc in tqdm(data):
        sentences = doc['sentences']
        for i, sentence1 in enumerate(sentences):
            to_append = []
            for j, ent1 in enumerate(sentence1):
                if '-' not in ent1['word']:
                    continue
                parts = [x.strip() for x in ent1['word'].split('-') if x.strip()]
                matches = []
                for part in parts:
                    match = None

                    # Earlier entities of the same sentence, nearest first
                    for ent2 in reversed(sentence1[:j]):
                        if is_first_or_last_name(part, ent2, True):
                            match = ent2
                            break

                    # Earlier sentences, nearest first
                    if match is None:
                        for sentence2 in reversed(sentences[:i]):
                            for ent2 in reversed(sentence2):
                                if ent2['n_words'] > 1 \
                                and ent2['words'][0].lower() != ent2['words'][-1].lower() \
                                and is_first_or_last_name(part, ent2, True):
                                    match = ent2
                                    break
                            if match is not None:
                                break

                    if match is None:
                        # One unmatched part is enough to leave the entity alone
                        break
                    matches.append(match)

                if parts and len(matches) == len(parts):
                    sentence1[j] = matches[0]
                    to_append.extend(matches[1:])
                    changes.append(matches)

            # Added after the loop so the sentence is not grown while it is walked
            sentence1.extend(to_append)
    save_pickle(r'logs\resolve_hyphenated_words.chge', changes) 
    return data
                        


def reverse_iter_subset_resolution(
                                   data,
                                   is_first_or_last_name=is_first_or_last_name
                                   ):
    '''
    Resolves single word entities to multi-word entities by finding the most
    recent match in the same document.

    Only unresolved one-word entities are considered. A candidate matches
    when `is_first_or_last_name(entity, candidate)` is true. Entities earlier
    in the same sentence are searched first (nearest first), then earlier
    sentences (nearest first; there the candidate must have more than one
    word and its first and last word must differ). The matched entity object
    itself takes the place of the one-word entity; it is not copied.

    Each replacement is logged as (old word, new word) and the log is pickled
    to the reverse_iter_subset_resolution.chge file. Returns `data`.

    Fix: `sentence1[j - 1::-1]` and `doc['sentences'][i - 1::-1]` wrap around
    when the index is 0 and then cover the whole list, so the first entity of
    a sentence (or any entity of the first sentence) could be resolved to
    something mentioned LATER. Only what really precedes is searched now.
    '''
    changes = []
    print('Checking if entities are substrings of preceeding entities...')
    for doc in tqdm(data):
        sentences = doc['sentences']
        for i, sentence1 in enumerate(sentences):
            for j, ent1 in enumerate(sentence1):
                if ent1['resolved'] or ent1['n_words'] != 1:
                    continue
                match = None

                # Earlier entities of the same sentence, nearest first
                for ent2 in reversed(sentence1[:j]):
                    if is_first_or_last_name(ent1, ent2):
                        match = ent2
                        break

                # Earlier sentences, nearest first
                if match is None:
                    for sentence2 in reversed(sentences[:i]):
                        for ent2 in reversed(sentence2):
                            if ent2['n_words'] > 1 \
                            and ent2['words'][0].lower() != ent2['words'][-1].lower() \
                            and is_first_or_last_name(ent1, ent2):
                                match = ent2
                                break
                        if match is not None:
                            break

                if match is not None:
                    changes.append((ent1['word'], match['word']))
                    sentence1[j] = match
    save_pickle(r'logs\reverse_iter_subset_resolution.chge', changes)
    return data

def acronym_match(
                  acroynm, 
                  text, 
                  regex=ACRO_REPLACE_REGEX,
                  trans=ACRO_REPLACE_TRANS
                  ):
    '''
    Checks whether an acronym matches the initials of a name.

    Every `regex` match in `text` is first replaced by the value stored in
    `trans` under the regex-escaped matched text. The result is split on
    spaces, and the acronym matches when it has exactly one character per
    word and each character equals the first character of the corresponding
    word (case-sensitive comparison).

    Parameters
    ----------
    acroynm : str
        The acronym to test.
    text : str
        The candidate full name.
    regex : compiled pattern
        Pattern selecting the substrings to substitute before splitting.
    trans : dict
        Replacement for each substring, keyed by `re.escape` of the match.

    Returns
    -------
    bool
    '''
    text = regex.sub(lambda x: trans[re.escape(x.group(0))], text)
    # Strip first, then drop empties: a whitespace-only token (e.g. '\n')
    # would otherwise survive as '' and break the first-letter lookup below.
    words = [word for word in (x.strip() for x in text.split(' ')) if word]
    return len(words) == len(acroynm) and all(
        letter == word[0] for letter, word in zip(acroynm, words)
    )

def reverse_iter_abbr_resolution(
                                 data,
                                 acronym_match=acronym_match,
                                 isacronym=ACRO_REGEX
                                 ):
    '''
    Resolves acronym, non-person entities to the most recent earlier mention
    whose initials spell the acronym.

    For every entity whose tag is not 'person' and whose word fully matches
    `isacronym`, the search walks backwards: first through the entities that
    precede it in its own sentence, then through the preceding sentences
    (nearest sentence first, last entity first). In preceding sentences,
    candidates that are themselves acronyms are skipped. The first candidate
    accepted by `acronym_match` replaces the acronym entity in place.

    Parameters
    ----------
    data : iterable of dict
        Documents; each has a 'sentences' list whose items are lists of entity
        dicts with (at least) the keys 'tag' and 'word'.
    acronym_match : callable(acronym, text) -> bool
        Predicate deciding whether `text` expands `acronym`.
    isacronym : compiled pattern
        Pattern whose `fullmatch` identifies acronym words.

    Returns
    -------
    The same `data` object, mutated in place.

    Side effects
    ------------
    Pickles the list of (old word, new word) changes to
    logs\\reverse_iter_abbr_resolution.chge.
    '''
    print('Matching acronym to earlier mentions...')
    changes = []
    for doc in tqdm(data):
        sentences = doc['sentences']
        for i, sentence1 in enumerate(sentences):
            for j, ent1 in enumerate(sentence1):
                # Only non-person entities that look like an abbreviation.
                if ent1['tag'] == 'person' or not isacronym.fullmatch(ent1['word']):
                    continue

                match = None

                # Earlier entities of the current sentence, nearest first.
                # Slice [:j] before reversing: the previous [j - 1::-1] form
                # started at index -1 when j == 0 and therefore scanned the
                # whole sentence, including later entities.
                for ent2 in sentence1[:j][::-1]:
                    if acronym_match(ent1['word'], ent2['word']):
                        match = ent2
                        break

                # Previous sentences, nearest first (same wraparound guard for i == 0).
                if match is None:
                    for sentence2 in sentences[:i][::-1]:
                        for ent2 in sentence2[::-1]:
                            if not isacronym.fullmatch(ent2['word']) \
                            and acronym_match(ent1['word'], ent2['word']):
                                match = ent2
                                break
                        if match is not None:
                            break

                if match is not None:
                    changes.append((ent1['word'], match['word']))
                    sentence1[j] = match
    save_pickle(r'logs\reverse_iter_abbr_resolution.chge', changes)
    return data

In [10]:
def iso_lookup_location_resolution(
                                   data, 
                                   ent_iter=ent_iter,
                                   isacroynm=ACRO_REGEX
                                   ):
    '''
    Looks up location entities in the iso pycountry database and replaces the
    entity word with the database country name.

    Only unresolved entities tagged 'location' are considered, and a database
    query is made only when the entity's 'ner' value is 'country' or
    'nationality':
      * words that fully match `isacroynm` (except 'EU') are looked up as an
        ISO alpha-2 or alpha-3 code, chosen by the length of the word;
      * all other words (including 'EU') go through pycountry's fuzzy search
        and the first hit is used.
    Successful lookups are cached per word for the duration of the call.

    Parameters
    ----------
    data : object accepted by `ent_iter`
    ent_iter : callable(data) -> iterable of entity dicts
    isacroynm : compiled pattern
        Pattern whose `fullmatch` identifies acronym words.

    Returns
    -------
    The same `data` object, with entity dicts mutated in place.

    Side effects
    ------------
    Pickles the list of (old word, new word) changes to
    logs\\iso_lookup_location_resolution.chge.
    '''
    print('Resolving locations by database lookup...')
    #block_print() # pycountry has verbose output and I could not disable it easily
    changes = []
    lookup = {}
    for ent in tqdm(list(ent_iter(data))):
        if ent['resolved'] or ent['tag'] != 'location':
            continue

        word = ent['word']

        # Already resolved this spelling earlier in the run.
        if word in lookup:
            ent['word'] = lookup[word]
            continue

        # Always start from "no match" so a result from a previous entity can
        # never leak into this one.
        match = None
        if ent['ner'] in {'country', 'nationality'}:
            try:
                if isacroynm.fullmatch(word) and word not in {'EU'}:
                    # ISO codes are distinguished by their character length,
                    # not by the number of words in the entity.
                    if len(word) == 2:
                        match = pycountry.countries.get(alpha_2=word)
                    elif len(word) == 3:
                        match = pycountry.countries.get(alpha_3=word)
                else:
                    match = pycountry.countries.search_fuzzy(word)[0]
            except LookupError:
                # search_fuzzy raises LookupError when nothing is found; this
                # also covers a KeyError from `get` on an unknown code, in case
                # the installed pycountry raises there (TODO confirm version).
                match = None

        if match is not None:
            lookup[word] = match.name
            if word != match.name:
                changes.append((word, match.name))
            ent['word'] = match.name
    save_pickle(r'logs\iso_lookup_location_resolution.chge', changes)
    return data              

In [2]:
# NOTE(review): this redefines INPUT_FOLDER, which an earlier configuration
# cell already set to a different directory; the value below is the one used
# by the pipeline run that follows. Both paths are absolute and machine-specific.
INPUT_FOLDER = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\06 NER\temp'

# Destination pickle for the fully resolved entity data.
OUTPUT_FILENAME = r'C:\Users\Simon\OneDrive - University of Exeter\__Project__\07 NED\temp\resolved.pkl'

# Full disambiguation pipeline. toolz.compose applies its arguments from
# right to left, so the stages below execute from the BOTTOM of the list
# (directory_explorer) to the TOP (save_pickle). Read the list upwards to
# follow the data flow. resolution_check is interleaved between stages.
ned_pipe = compose(
                   # Final stage: write the result to OUTPUT_FILENAME
                   # (save_pickle is curried with the filename; the data arrives last).
                   curry(save_pickle)(OUTPUT_FILENAME),
                   # The commented-out stages below are disabled; they look like an
                   # earlier ordering of the pipeline kept for reference.
#                    capitalize_entities,
#                    remove_press,
#                    resolution_check,
#                    lookup_entity_resolution,
#                    resolution_check,
#                    iso_lookup_location_resolution,
#                    resolution_check,
#                    lookup_tag_resolution,
#                    resolution_check,
#                    reverse_iter_abbr_resolution,
#                    resolution_check,
#                    reverse_iter_subset_resolution,
#                    resolution_check,
#                    capitalize_entities,
#                    resolution_check,
#                    resolve_multi_locations,
#                    resolve_s_endings,
#                    resolution_check,
#                    remove_leading_and_trailing_stops,
                   remove_press,
                   resolution_check,
                   resolve_multi_locations,
                   resolve_s_endings,
                   resolution_check,
                   remove_leading_and_trailing_stops,
                   capitalize_entities,
                   filter_empty,
                   resolution_check,
                   lookup_entity_resolution,
                   resolution_check,
                   iso_lookup_location_resolution,
                   resolution_check,
                   lookup_tag_resolution,
                   resolution_check,
                   # Acronyms are expanded before partial names are resolved
                   # (remember: execution order is bottom-up).
                   reverse_iter_abbr_resolution,
                   resolution_check,
                   reverse_iter_subset_resolution,
                   resolution_check,
                   manual_resolution,
                   resolution_check,
                   resolve_double_orgs,
                   remove_leading_and_trailing_stops,
                   filter_empty,
                   resolve_hyphenated_words,
                   resolution_check,
                   #capitalize_entities,
                   analyze_entities,
                   filter_publications,
                   filter_empty,
                   entity_correction,
                   remove_titles, # i.e. Mr. Mrs.
                   stopword_filter,
                   lookup_tag_resolution,
                   analyze_entities,
                   tag_correction,
                   list_data_structure,
                   # First stages: find the input files and unpickle each one.
                   # map() is lazy here, so files are loaded as the next stage consumes them.
                   curry(map)(load_pickle),
                   # presumably yields the paths of the '.pkl' files under the folder
                   # passed to ned_pipe; verify against directory_explorer's definition
                   curry(directory_explorer)('.pkl'),
                   )


In [1]:
# Run the whole pipeline on INPUT_FOLDER; the final stage pickles the result to OUTPUT_FILENAME.
ned_pipe(INPUT_FOLDER)

In [18]:
# Reload the pipeline output from disk to inspect it (displays the entire object).
load_pickle(OUTPUT_FILENAME)

In [17]:
# Inspect the (old word, new word) pairs logged by reverse_iter_subset_resolution.
load_pickle(r'logs\reverse_iter_subset_resolution.chge')

In [16]:
# Inspect the (old word, new word) pairs logged by reverse_iter_abbr_resolution.
load_pickle(r'logs\reverse_iter_abbr_resolution.chge')