In [45]:
import sys, codecs, json, math, time, warnings
warnings.simplefilter( action='ignore', category=FutureWarning )

import nltk, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics, eli5
nltk.download('averaged_perceptron_tagger')
from sklearn.metrics import make_scorer
from collections import Counter
import matplotlib.pyplot as plt
from IPython.display import display    
import numpy as np

import logging
import tensorflow as tf
import absl.logging
formatter = logging.Formatter('[%(levelname)s|%(filename)s:%(lineno)s %(asctime)s] %(message)s')
absl.logging.get_absl_handler().setFormatter(formatter)
absl.logging._warn_preinit_stderr = False
logger = tf.get_logger()
logger.setLevel(logging.INFO)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/george/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [10]:
verbose_on = True
max_iter = 125
task3_sample_size = 17000
task4_sample_size = 17000

In [11]:
#only print if verbose_on is true
def printfn(thing): 
    if verbose_on: print(thing)

In [12]:
from nltk import sent_tokenize, word_tokenize, pos_tag

In [13]:
#They should be exactly the same length as the predictions have been made from them
def gen_tok_NER_pair_lists(tag_list, sentence_with_unknown_NER, NER_predictions):
    
    
    tokens = [tok for (tok,_,_) in sentence_with_unknown_NER]
    
    pair_list = []
    
    i = 0
    while i < len(NER_predictions):

        if(NER_predictions[i] == "O"):
            i += 1
            continue
        
        #if(not (NER_predictions[i][2:] in tag_list)):
        #    i += 1
        #    continue
        
        #remove the B- or I-
        actual_tag = NER_predictions[i][2:]
        
        #The indices will correspond, start the entity with the first token with the tag
        entity = tokens[i]
        i += 1
        
        while i < len(NER_predictions) and NER_predictions[i][2:] == actual_tag:
            entity += " " + tokens[i]
            i += 1
        
        pair_list.append((entity, actual_tag))
        i += 1
    
    return pair_list

def gen_NER_dict(tag_list, sentences, sentence_NER_predictions):
    
    dct = {tag : [] for tag in tag_list}
    
    for (sentence, predictions) in zip(sentences,sentence_NER_predictions):
        pairs = gen_tok_NER_pair_lists(tag_list, sentence, predictions)
        for (entity, tag) in pairs:
            dct[tag].append(entity.lower())
    
    dct = {tag : list(set(entities)) for (tag,entities) in dct.items()}
    return dct

#turn the list of tokens into a sentence that can be used for NER
def prepare_sentence(tokens):
    tags = list(map(lambda x: x[1], pos_tag(tokens)))
    sentence = []
    for i in range(len(tokens)):
        sentence.append((tokens[i],tags[i],"O"))
    return sentence

#detect if the chapter string has "CHAPTER . SOMETHING" at the start of it
#and return the string without it
def without_chapter(chapter_str):
    
    #Define the end of a title as there being at least 2 new line characters
    regex = r"(\n|[ ])*.*(?:CHAPTER|(C|c)hapter)[ ]*\.?(?:[ivxlcdm]+|[IVXLCDM]+|\d+)[ ]*\.?(\n|[ ])*.*\n.*\n(\n|[ ])*"
    result = re.match(regex, chapter_str, flags = re.MULTILINE | re.UNICODE)
    
    return chapter_str[len(result[0]):]
    
    #if there is a Chapter N part at the start of the chapter then remove it
    #length = len(result[0])
    #return chapter_str[length:]

#take the file path of ontonotes and return sentences
def load_ontonotes(dataset_file):

    readHandle = codecs.open(dataset_file, 'r', 'utf-8', errors = 'replace')
    str_json = readHandle.read()
    readHandle.close()
    dict_ontonotes = json.loads(str_json)
    orig_list = list(dict_ontonotes.keys())
    sentences = []    
    for str_file in orig_list:
        for str_sent_index in dict_ontonotes[str_file] :

            tokens = []
            ner_tags = []
            # compute IOB tags for named entities (if any)
            ne_type_last = None

            #build up the list of tokens and ner tags
            for nTokenIndex in range(len(dict_ontonotes[str_file][str_sent_index]['tokens'])) :
                strToken = dict_ontonotes[str_file][str_sent_index]['tokens'][nTokenIndex]
                strPOS = dict_ontonotes[str_file][str_sent_index]['pos'][nTokenIndex]
                ne_type = None
                if 'ne' in dict_ontonotes[str_file][str_sent_index] :
                    dict_ne = dict_ontonotes[str_file][str_sent_index]['ne']
                    if not 'parse_error' in dict_ne :
                        for str_NEIndex in dict_ne :
                            if nTokenIndex in dict_ne[str_NEIndex]['tokens'] :
                                ne_type = dict_ne[str_NEIndex]['type']
                                break
                if ne_type != None :
                                        
                    if ne_type == ne_type_last :
                        strIOB = 'I-' + ne_type
                    else :
                        strIOB = 'B-' + ne_type
                else :
                    strIOB = 'O'

                ne_type_last = ne_type
                tokens.append(strToken)
                ner_tags.append(strIOB)
            list_entry = []

            #use nltk pos tags instead
            for (index, tup) in enumerate(pos_tag(tokens)):
                list_entry.append((tup[0], tup[1], ner_tags[index]))
            sentences.append(list_entry)
    return sentences

In [84]:
title_gazetteer = set([
        "mr", "mr.", "mrs", "mrs.", "miss", "miss.", "ms", "ms.", "madam", 
        "mam.", "sir", "sir.", "lord", "lord.", "mam", "mister", 
        "mister.", "missus", "missus.", "dame", "chairman", "king", 
        "queen", "president"])

In [99]:
#The ontonotes dataset does not label mr miss etc. as B-PERSON it labels
#it as O instead which is not what we want, so go through and change all the tags
#of mr and mrs etc. to B-PERSON

#Do this all in-place so it modifies the list that its called on
def fix_mr_mrs(sentences):
        
    for sent in sentences:
    
        for i in range(len(sent)):

            #print("sentences[i]:")
            #print(sentences[i])
            tok, pos, nertag = sent[i]

            if nertag != "B-PERSON":
                continue

            #at least two words behind
            if i > 1:

                tokm1, posm1, _ = sent[i - 1]
                tokm2, posm2, _ = sent[i - 2]

                if tokm1.lower() in title_gazetteer:

                    sent[i - 1] = tokm1, posm1, "B-PERSON"
                    sent[i] = tok, pos, "I-PERSON"

                elif tokm2.lower() in title_gazetteer:

                    sent[i - 2] = tokm2, posm2, "B-PERSON"
                    sent[i - 1] = tokm1, posm1, "I-PERSON"
                    sent[i] = tok, pos, "I-PERSON"

            #at least one word behind
            if i > 0:

                tokm1, posm1, _ = sent[i - 1]

                if tokm1.lower() in title_gazetteer:

                    sent[i - 1] = tokm1, posm1, "B-PERSON"
                    sent[i] = tok, pos, "I-PERSON"

In [93]:
from nltk.corpus import names
from copy import copy
from random import shuffle
import gc

In [107]:
#the ontonotes data sentences and the sentences 
#to predict from reading the chapter file
def run_NER(sentences, sentences_to_predict):
    
        #given a specific NER tag, return 2 sets consisting of the 
    #"inside parts" and the "before parts"
    def extract_gazetteers(target, sentences):
        before = set()
        inside = set()
        before_tag = "B-" + target
        inside_tag = "I-" + target
        for sent in sentences:
            for tok, _, ner in sent:
                if ner == before_tag:
                    before.add(tok.lower())
                if ner == inside_tag:
                    inside.add(tok.lower())
        return before, inside


    #convert all the NER tags you don't care about into O's in sentences that may have mixed NER tags that we care about
    #with NER tags that we don't care about
    def only_Os(desired_set, sentences):
        actual_tags = []
        for tag in desired_set:
            actual_tags.append("B-" + tag)
            actual_tags.append("I-" + tag)
        return [[(tok, pos, ner if ner in actual_tags else 'O') for tok, pos, ner in sent] for sent in sentences]


    #If the dataset has already been split and we KNOW that none of these NER tags are the ones we care about,
    #then clearly we can just replace ALL NER tags with O
    def all_Os(sentences):
        return [[(tok, pos, 'O') for tok, pos, _ in sent] for sent in sentences]


    #returns a tuples of 2 lists of sentences, where one list contains all the sentences that contain the tags 
    #specified in the argument and the other list does not
    #you just give it the tag strings and it adds the Bs and Is for you
    def split_tags(tag_set, sentences):

        actual_tags = []
        for tag in tag_set:
            actual_tags.append("B-" + tag)
            actual_tags.append("I-" + tag)

        contains_tags = []
        doesnt_contain_tags = []

        for sent in sentences:

            if(any([tag in actual_tags for _,_,tag in sent])):
                contains_tags.append(sent)
            else:
                doesnt_contain_tags.append(sent)

        return (contains_tags, doesnt_contain_tags)

    #the sentences containing task 3 tags and the sentences that dont
    wtask3, wotask3 = split_tags(["DATE", "CARDINAL", "ORDINAL", "NORP"], sentences)


    #the sentences containing task 4 tags (only person) and the sentences that dont
    wtask4, wotask4 = split_tags(["PERSON"], sentences)

    #all other tags can be converted to O because we dont care about finding them
    wtask3 = only_Os(["DATE", "CARDINAL", "ORDINAL", "NORP"], wtask3)
    wtask4 = only_Os(["PERSON"], wtask4)

    #Any sentences that don't contain any entities we care about can
    #ALL be converted to Os!
    #wotask3 = all_Os(wotask3)
    #wotask4 = all_Os(wotask4)

    del wotask3
    del wotask4
    del sentences
    gc.collect()
    
        #take a random sample with the specified size from the list of sentences
    def random_sample(sentences, num):

        if num > len(sentences):
            return sentences

        cp = copy(sentences)
        shuffle(cp)
        return cp[-num:]

    #process data, putting in a ratio of sentences that contain the NER tags we care about
    #and the complement ration of ones we dont
    def process_data(care_about, dont_care):

        num_care = floor(care_ratio * sample_size)
        num_dont_care = sample_size - num_care

        care_sample = random_sample(care_about, num_care)
        dont_care_sample = random_sample(dont_care, num_dont_care)

        processed = care_sample + dont_care_sample
        shuffle(processed)
        return processed

    #task3_processed = process_data(wtask3, wotask3)
    #task4_processed = process_data(wtask4, wotask4)


    #split the items into training and testing denoted by the testing ratio argument
    def random_split(items, ratio):
        shuffle(copy(items))
        train_size = int(ratio * len(items))
        test_size = len(items) - train_size
        return items[:train_size], items[-test_size:]

    #don't need training and testing for handin, just use the whole thing
    #task3_training, task3_testing = random_split(task3_processed, 0.9)
    #task4_training, task4_testing = random_split(task4_processed, 0.9)

    #generate the gazetteers to be used from ontonotes
    #before_name, inside_name = extract_gazetteers("PERSON", wtask4)
    #before_date, inside_date = extract_gazetteers("DATE", wtask3)
    #before_cardinal, inside_cardinal = extract_gazetteers("CARDINAL", wtask3)
    #before_norp, inside_norp = extract_gazetteers("NORP", wtask3)
    #before_ord, inside_ord = extract_gazetteers("ORDINAL", wtask3)

        #assumed to be in the environment of the function that 
    #generates the feature dictionary for task 3
    number_gazetteer = set([
        "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "fourty", "fifty",
        "sixty", "seventy", "eighty", "ninety", "hundred", "onehundred", "one-hundred", "thousand", "million"
        ])

    #Python sets are implemented as hash tables so lookup is always around O(1)
    #even for large sets

    ordinal_gazetteer = set([
        "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
        "eigth", "ninth", "tenth", "eleventh", "twelth", "thirteenth",
        "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth",
        "nineteenth", "twentieth", "thirtieth", "fourtieth", "fiftieth", 
        "sixtieth", "seventieth", "eightieth", "ninetieth", "hundreth", "thousandth",
        "millionth", "firstly", "secondly", "thirdly", "fourthly", "fifthly", "sixthly",
        "seventhly", "eigthly", "ninethly", "tenthly"
    ])

    date_gazetteer = set([
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "day", "days", 
        "week", "weeks", "month", "months", "year", "years", "decade", "decades", "century", "centuries", 
        "millennium", "millennia", "jan", "january", "feb", "february", "mar", "march", "apr", "april", 
        "may", "jun", "june", "jul", "july", "aug", "august", "sep", "september", "oct", "october", 
        "nov", "november", "dec", "december", "holiday", "holidays"
    ])

    time_gazetteer = set(["milliseconds", "millis", "milis", "second", "seconds", "minute", "minutes", "hour", "hours", "time", "clock", "o'clock", "past", "oclock"])

    title_gazetteer = set([
        "mr", "mr.", "mrs", "mrs.", "miss", "miss.", "ms", "ms.", "madam", 
        "mam.", "sir", "sir.", "lord", "lord.", "mam", "mister", 
        "mister.", "missus", "missus.", "dame", "chairman", "king", 
        "queen", "president"])

    # a feature for detecting e.g. 1st, 2nd, 3rd, 4th etc.
    def is_st_th(candidate):
        suffixes = set(["st", "nd", "rd", "th"])
        return candidate[-2:] in suffixes and candidate[:-2].isdigit()

    #use the nltk name corpus
    #Keep all names uppercase as by convention names will only ever really 
    #appear in books as uppercase, this rules out things like "will" being
    #detected as a name
    name_gazetteer = set(names.words("male.txt") + names.words("female.txt"))

    #task 4 has less features as it only revolves around names
    def gen_feature_dict_task4(sentence, i):

        def gen_features(index, relative_str):
            return {
                relative_str + "word.lower()": sentence[index][0].lower(),
                relative_str + "word_prefix": sentence[index][0][:3],
                relative_str + "word_suffix": sentence[index][0][-3:],
                relative_str + "postag": sentence[index][1],

                #relative_str + "postag[:2]": sentence[index][1][:2],
                #relative_str + "is_mr_mrs": sentence[index][0][:2].lower() == "mr" or sentence[index][0][:3].lower() == "mrs",
                #relative_str + "in_title_gaz": sentence[index][0].lower() in title_gazetteer,
                relative_str + "not_alnum": not sentence[index][0].isalnum(),
                relative_str + "is_single_letter": sentence[index][0].isalpha() and len(sentence[index][0]) == 1
            }

        #extra features are limited to center 3 in the context window of size 5
        def gen_extra_features(index, relative_str):
            return {
                relative_str + "istitle": sentence[index][0].istitle(),
                relative_str + "isupper": sentence[index][0].isupper(),
                relative_str + "in_nltk_name_gaz" :  sentence[index][0] in name_gazetteer
            }

        features = gen_features(i, "")
        #features.update({"word" : sentence[i][0]})
        features.update(gen_extra_features(i, ""))

        #there is at least 1 word behind
        if i > 0:#generate a list of feature dictionaries for each word in the sentence
            features.update(gen_features(i - 1, "-1:"))
            features.update(gen_extra_features(i - 1, "-1:"))
            
            bias = 0
            
            if sentence[i][0][0].isupper():    
                if i > 1:
                    if sentence[i - 1][0] in title_gazetteer or sentence[i - 2][0] in title_gazetteer:
                        bias = 100
                else:
                    if sentence[i - 1][0] in title_gazetteer:
                        bias = 100
            
            features.update({"bias" : bias})
        
            
        #there are at least 2 words behind
        if i > 1:
            features.update(gen_features(i - 2, "-2:"))
            features.update(gen_extra_features(i - 2, "-2:"))

        #there are at least 3 words behind
        #if i > 2:
        #    features.update(gen_features(i - 3, "-3:"))
            
        #there is at least 1 word ahead
        if i < len(sentence) - 1:
            features.update(gen_features(i + 1, "+1:"))
            features.update(gen_extra_features(i + 1, "+1:"))

        #there are at least 2 words ahead
        if i < len(sentence) - 2:
            features.update(gen_features(i + 2, "+2:"))
            features.update(gen_extra_features(i + 2, "+2:"))

        #there are at least 3 words ahead
        #if i < len(sentence) - 3:
        #    features.update(gen_features(i + 3, "+3:"))
            
        return features

    def gen_feature_dict_task3(sentence, i):

        #token = sentence[i][0]
        #pos_tag = sentence[i][1]

        def gen_features(index, relative_str):

            return {
                relative_str + "word.lower()": sentence[index][0].lower(),
                relative_str + "word_prefix": sentence[index][0][:3],
                relative_str + "word_suffix": sentence[index][0][-3:],
                relative_str + "postag": sentence[index][1],
                #relative_str + "postag[:2]": sentence[index][1][:2],
                relative_str + "isdigit": sentence[index][0].isdigit()
            }

        #extra features are limited to center 3 in the context window of size 5
        def gen_extra_features(index, relative_str):
            return {
                relative_str + "not_alnum": not sentence[index][0].isalnum(),
                relative_str + "istitle": sentence[index][0].istitle(),
                relative_str + "isupper": sentence[index][0].isupper(),
                #python sets are around O(1) lookup so its fine to have large gazetteers
                relative_str + "in_num_gaz": sentence[index][0].lower() in number_gazetteer,
                relative_str + "in_ord_gaz": sentence[index][0].lower() in ordinal_gazetteer,
                relative_str + "in_date_gaz": sentence[index][0].lower() in date_gazetteer,
                relative_str + "in_time_gaz": sentence[index][0].lower() in time_gazetteer,
                #relative_str + "in_nltk_name_gaz": sentence[index][0].lower() in name_gazetteer,

                #COMMENT THESE OUT MAYBE COS NOT SURE ABOUT THEM
    #                relative_str + "in_B_date": sentence[index][0].lower() in before_date,
    #                relative_str + "in_I_date": sentence[index][0].lower() in inside_date,
    #                relative_str + "in_B_card": sentence[index][0].lower() in before_cardinal,
    #                relative_str + "in_I_card": sentence[index][0].lower() in inside_cardinal,
    #                relative_str + "in_B_norp": sentence[index][0].lower() in before_norp,
    #                relative_str + "in_I_norp": sentence[index][0].lower() in inside_norp,
    #                relative_str + "in_B_ord": sentence[index][0].lower() in before_ord,
    #                relative_str + "in_I_ord": sentence[index][0].lower() in inside_ord,

                #e.g. 1st, 2nd, 3rd, 4th etc.
                relative_str + "is_st_th": is_st_th(sentence[index][0])
            }

        features = gen_features(i, "")
        #features.update({"word" : sentence[i][0]})
        features.update(gen_extra_features(i, ""))

        #there is at least 1 word behind
        if i > 0:
            features.update(gen_features(i - 1, "-1:"))
            features.update(gen_extra_features(i - 1, "-1:"))

        #there are at least 2 words behind
        if i > 1:
            features.update(gen_features(i - 2, "-2:"))
            features.update(gen_extra_features(i - 2, "-2:"))

        #there are at least 3 words behind
        #if i > 2:
        #    features.update(gen_features(i - 3, "-3:"))

        #there is at least 1 word ahead
        if i < len(sentence) - 1:
            features.update(gen_features(i + 1, "+1:"))
            features.update(gen_extra_features(i + 1, "+1:"))

        #there are at least 2 words ahead
        if i < len(sentence) - 2:
            features.update(gen_features(i + 2, "+2:"))
            features.update(gen_extra_features(i + 2, "+2:"))

        #there are at least 3 words ahead
        #if i < len(sentence) - 3:
        #    features.update(gen_features(i + 3, "+3:"))

        return features

        #helper functions
    def sent2labels(sentence):
        return [label for _, _, label in sentence] 
    def sent2tokens(sent):
        return [token for token, _, _ in sentence]

    def sentences2labels(sentences):
        return [sent2labels(sent) for sent in sentences]


        #generate a list of feature dictionaries for each word in the sentence
    def gen_task3_features(sentence):
        return [gen_feature_dict_task3(sentence, i) for i in range(len(sentence))]

    def gen_task4_features(sentence):
        return [gen_feature_dict_task4(sentence, i) for i in range(len(sentence))]

    display_label_subset = ["B-DATE", "I-DATE", "B-CARDINAL", "I-CARDINAL", "B-ORDINAL", "I-ORDINAL", "B-NORP", "I-NORP"]


    #generate lists of feature dictionaries for each sentence
    def task3(ontonotes_data, sentences_to_predict):


        feature_lists = [gen_task3_features(sent) for sent in ontonotes_data]
        feature_dicts_to_predict = [gen_task3_features(sent) for sent in sentences_to_predict]
        label_lists = [sent2labels(sent) for sent in ontonotes_data]

        crf = sklearn_crfsuite.CRF(
            algorithm = "lbfgs",
            c1 = 1,
            c2 = 0.149853957,
            max_iterations = max_iter,
            all_possible_transitions = True,
            verbose = verbose_on
        )

        crf.fit(feature_lists, label_lists)
        predictions = crf.predict(feature_dicts_to_predict)
        dct = gen_NER_dict(["CARDINAL", "ORDINAL", "DATE", "NORP"], sentences_to_predict, predictions)
        
        return dct, crf

    #try seeing if different hyperparamaters for each model improves
    #their respsective performances, might be worth them not both being the same
    def task4(ontonotes_data, sentences_to_predict):

        feature_lists = [gen_task4_features(sent) for sent in ontonotes_data]
        feature_dicts_to_predict = [gen_task4_features(sent) for sent in sentences_to_predict]
        label_lists = [sent2labels(sent) for sent in ontonotes_data]
        crf = sklearn_crfsuite.CRF(
            algorithm = "lbfgs",
            c1 = 1,
            c2 = 0.149853957,
            max_iterations = max_iter,
            all_possible_transitions = True,
            verbose = verbose_on)
        crf.fit(feature_lists, label_lists)

        #REPORT STUFF GOES HERE

        predictions = crf.predict(feature_dicts_to_predict)

        dct = gen_NER_dict(["PERSON"], sentences_to_predict, predictions)
        
        #predictions = crf.predict(feature_dicts_to_predict)
        return dct, crf
        #return extract_entities_from_sequences(["PERSON"], predictions, False)

    task3sample = random_sample(wtask3,task3_sample_size)
    del wtask3
    gc.collect()
    task3dict, task3model = task3(task3sample, sentences_to_predict)
    
    task4sample = random_sample(wtask4,task4_sample_size)
    del wtask4
    gc.collect()
    fix_mr_mrs(task4sample)
    task4dict, task4model = task4(task4sample, sentences_to_predict)

    #combine into one dict as required
    task3dict.update(task4dict)

    return task3dict, task3model, task4model

In [108]:
def exec_ner( file_chapter = None, ontonotes_file = None ) :

    # INSERT CODE TO TRAIN A CRF NER MODEL TO TAG THE CHAPTER OF TEXT (subtask 3)
    # USING NER MODEL AND REGEX GENERATE A SET OF BOOK CHARACTERS AND FILTERED SET OF NE TAGS (subtask 4)

    chapter_str = ""
    with open(file_chapter, 'r') as f:
        chapter_str = without_chapter(f.read())
        
    sentences_to_predict = []
    for sent in sent_tokenize(chapter_str):    # DO NOT CHANGE THE CODE IN THIS FUNCTION

    #
    # subtask 1 >> extract chapter headings and create a table of contents from a provided plain text book (from www.gutenberg.org)
    # Input >> www.gutenberg.org sourced plain text file for a whole book
    # Output >> toc.json = { <chapter_number_text> : <chapter_title_text> }
    #

        tokens = word_tokenize(sent)
        sentences_to_predict.append(prepare_sentence(tokens))
    
    #ontonotes_sentences = load_ontonotes(ontonotes_file)
    
    dictNE = run_NER(ontonotes_sentences, sentences_to_predict)
    
    return dictNE
    
    #printfn("---NER DICTIONARY---\n")
    #printfn(dictNE)

    # DO NOT CHANGE THE BELOW CODE WHICH WILL SERIALIZE THE ANSWERS FOR THE AUTOMATED TEST HARNESS TO LOAD AND MARK

    """
    # write out all PERSON entries for character list for subtask 4
    writeHandle = codecs.open( 'characters.txt', 'w', 'utf-8', errors = 'replace' )
    if 'PERSON' in dictNE :
        for strNE in dictNE['PERSON'] :
            writeHandle.write( strNE.strip().lower()+ '\n' )
    writeHandle.close()

    # FILTER NE dict by types required for subtask 3
    listAllowedTypes = [ 'DATE', 'CARDINAL', 'ORDINAL', 'NORP' ]
    listKeys = list( dictNE.keys() )
    for strKey in listKeys :
        for nIndex in range(len(dictNE[strKey])) :
            dictNE[strKey][nIndex] = dictNE[strKey][nIndex].strip().lower()
        if not strKey in listAllowedTypes :
            del dictNE[strKey]

    # write filtered NE dict
    writeHandle = codecs.open( 'ne.json', 'w', 'utf-8', errors = 'replace' )
    strJSON = json.dumps( dictNE, indent=2 )
    writeHandle.write( strJSON + '\n' )
    writeHandle.close()
    """

In [103]:
dct, t3model, t4model = exec_ner("eval_chapter.txt", "ontonotes_parsed.json")

loading training data to CRFsuite: 100%|██████████| 17000/17000 [00:14<00:00, 1139.39it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 90721
Seconds required: 2.020

L-BFGS optimization
c1: 1.000000
c2: 0.149854
num_memories: 6
max_iterations: 125
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=1.89  loss=562406.05 active=68769 feature_norm=1.00
Iter 2   time=2.02  loss=377430.02 active=67930 feature_norm=5.12
Iter 3   time=1.00  loss=322649.02 active=54700 feature_norm=4.53
Iter 4   time=6.04  loss=183378.57 active=44465 feature_norm=3.35
Iter 5   time=2.89  loss=164959.10 active=43071 feature_norm=3.52
Iter 6   time=0.92  loss=133010.23 active=44247 feature_norm=4.72
Iter 7   time=0.95  loss=120258.21 active=43264 feature_norm=6.29
Iter 8   time=0.96  loss=109522.80 active=45210 feature_norm=6.65
Iter 9   time=1.05  loss=95949.59 active=43517 feature_norm=8.11
Iter 1

Iter 121 time=0.99  loss=18223.06 active=11308 feature_norm=77.82
Iter 122 time=0.97  loss=18221.76 active=11300 feature_norm=77.86
Iter 123 time=0.97  loss=18216.50 active=11277 feature_norm=77.88
Iter 124 time=0.94  loss=18214.78 active=11277 feature_norm=77.93
Iter 125 time=0.94  loss=18210.09 active=11264 feature_norm=77.94
L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 129.577

Storing the model
Number of active features: 11264 (90721)
Number of active attributes: 5691 (53007)
Number of active labels: 9 (9)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.005



loading training data to CRFsuite: 100%|██████████| 17000/17000 [00:10<00:00, 1662.68it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 78511
Seconds required: 1.477

L-BFGS optimization
c1: 1.000000
c2: 0.149854
num_memories: 6
max_iterations: 125
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.57  loss=188410.02 active=57936 feature_norm=1.00
Iter 2   time=0.29  loss=139518.79 active=50197 feature_norm=1.61
Iter 3   time=0.29  loss=120741.68 active=48508 feature_norm=1.81
Iter 4   time=0.29  loss=83430.76 active=43732 feature_norm=2.28
Iter 5   time=0.29  loss=69547.10 active=43819 feature_norm=2.72
Iter 6   time=0.29  loss=53308.31 active=39029 feature_norm=3.69
Iter 7   time=0.29  loss=46211.25 active=38459 feature_norm=4.30
Iter 8   time=0.29  loss=43555.07 active=37589 feature_norm=5.08
Iter 9   time=0.29  loss=40050.14 active=37525 feature_norm=5.46
Iter 10  ti

Iter 122 time=0.58  loss=11074.70 active=8521  feature_norm=59.95
Iter 123 time=0.57  loss=11073.28 active=8510  feature_norm=59.97
Iter 124 time=0.30  loss=11072.73 active=8478  feature_norm=60.00
Iter 125 time=0.30  loss=11070.15 active=8480  feature_norm=60.04
L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 44.945

Storing the model
Number of active features: 8480 (78511)
Number of active attributes: 5618 (51633)
Number of active labels: 3 (3)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.004



In [None]:
dct1, t3model1, t4model1 = exec_ner("eval_chapter.txt", "ontonotes_parsed.json")

loading training data to CRFsuite: 100%|██████████| 17000/17000 [00:17<00:00, 947.59it/s] 



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 248314
Seconds required: 2.691

L-BFGS optimization
c1: 1.000000
c2: 0.149854
num_memories: 6
max_iterations: 125
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=2.10  loss=553597.53 active=153734 feature_norm=1.00
Iter 2   time=2.12  loss=358741.43 active=152077 feature_norm=4.84
Iter 3   time=1.15  loss=306726.88 active=113570 feature_norm=4.29
Iter 4   time=5.38  loss=192327.31 active=75614 feature_norm=2.97
Iter 5   time=1.05  loss=189185.52 active=102094 feature_norm=4.10
Iter 6   time=1.06  loss=143338.90 active=90874 feature_norm=4.51
Iter 7   time=2.09  loss=125878.33 active=85623 feature_norm=5.36
Iter 8   time=1.04  loss=118598.85 active=86360 feature_norm=5.66
Iter 9   time=1.05  loss=103267.59 active=79185 feature_norm=7.03


In [54]:
def report(trained_model, bestN, worstN, task):
	
	task4_tagset = ["B-PERSON", "I-PERSON"]
	task3_tagset = ["B-DATE", "I-DATE", "B-CARDINAL", "I-CARDINAL", "B-ORDINAL", "I-ORDINAL", "B-NORP", "I-NORP"]

	tagset = task3_tagset if task == 3 else task4_tagset

	print("TRANSITION WEIGHTS LEARNT ON DATASET:\n")
	display( eli5.show_weights(trained_model, top=(bestN,worstN), targets = tagset, show=['transition_features']))
	print("\nBEST PERFORMING FEATURES FOR EACH NER TAG:\n")
	display( eli5.show_weights(trained_model, top=(bestN,worstN), targets = tagset, show=['targets']) )

In [104]:
dct

{'CARDINAL': ['one', 'three', 'two', 'about so', 'seven'],
 'ORDINAL': ['first'],
 'DATE': ['about a month',
  'at least half-a-dozen years',
  'many years ago',
  'every day',
  'some days'],
 'NORP': ['indians'],
 'PERSON': ['mrs. mell',
  'copperfield',
  'salem house',
  'miss creakle',
  'mrs. creakle',
  'steerforth',
  'lie',
  'mrs.',
  'towzer',
  'mr. sharp',
  'mr. creakle',
  'traddles',
  'tungay',
  'tommy traddles',
  'hah',
  'j. steerforth',
  'charley',
  'turn',
  'mr. mell']}

In [105]:
report(t3model, 30, 30, 3)

TRANSITION WEIGHTS LEARNT ON DATASET:



From \ To,B-DATE,I-DATE,B-CARDINAL,I-CARDINAL,B-ORDINAL,I-ORDINAL,B-NORP,I-NORP
B-DATE,-4.696,4.032,-2.461,-3.442,-1.486,-0.542,0.0,-1.202
I-DATE,-4.646,4.89,-1.985,-2.914,-1.07,-0.773,-1.249,-1.212
B-CARDINAL,-1.383,-4.45,-3.632,3.676,-0.636,-0.613,0.515,-1.099
I-CARDINAL,-1.628,-3.286,-2.95,4.16,-0.625,-0.109,0.322,-1.352
B-ORDINAL,-0.997,-4.075,-1.612,-2.368,-1.002,3.885,0.486,-0.258
I-ORDINAL,-0.342,-0.567,-0.208,-0.181,0.0,4.529,0.0,0.0
B-NORP,-3.325,-4.232,-3.321,-3.524,-2.198,-1.123,-3.268,2.703
I-NORP,-2.428,-1.711,-1.909,-1.73,-0.562,0.0,-1.266,3.745



BEST PERFORMING FEATURES FOR EACH NER TAG:



Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7
+5.455,word_prefix:198,,,,,,
+5.134,word_prefix:199,,,,,,
+4.792,word_prefix:197,,,,,,
+4.203,word_prefix:ann,,,,,,
+4.132,word_suffix:kly,,,,,,
+3.795,word_prefix:tod,,,,,,
+3.687,word_prefix:dai,,,,,,
+3.679,word_suffix:30s,,,,,,
+3.664,word_prefix:yes,,,,,,
+3.497,word_suffix:80s,,,,,,

Weight?,Feature
+5.455,word_prefix:198
+5.134,word_prefix:199
+4.792,word_prefix:197
+4.203,word_prefix:ann
+4.132,word_suffix:kly
+3.795,word_prefix:tod
+3.687,word_prefix:dai
+3.679,word_suffix:30s
+3.664,word_prefix:yes
+3.497,word_suffix:80s

Weight?,Feature
+3.578,word_suffix:0's
+3.223,word_prefix:old
+3.221,-1:word_suffix:pt.
+3.116,word_prefix:199
+2.984,word_prefix:198
+2.869,word_prefix:dat
+2.834,-1:word_prefix:fis
+2.831,word_prefix:ago
+2.708,word_prefix:cen
+2.523,-1:word_suffix:ent

Weight?,Feature
+3.846,word_prefix:mil
+3.088,in_num_gaz
+2.951,postag:CD
+2.935,word_prefix:Doz
+2.735,not_alnum
+2.676,word_suffix:ons
+2.621,word_prefix:bil
+2.612,word_suffix:nds
+2.551,word_prefix:Sev
+2.359,word_suffix:eds

Weight?,Feature
+3.503,word_prefix:mil
+2.850,word_prefix:tha
+2.595,word_prefix:bil
+2.418,postag:CD
+2.160,-1:postag:DT
+2.095,-1:word_prefix:app
+2.085,word_suffix:rds
+2.039,word_suffix:and
+1.931,-1:word_prefix:num
+1.877,word_suffix:/32

Weight?,Feature
+5.037,in_ord_gaz
+4.869,is_st_th
+3.173,word_suffix:hth
+2.490,word_suffix:ary
+2.408,word_prefix:sec
+2.316,word_prefix:las
+2.257,word_prefix:onc
+2.068,word_suffix:nce
+1.351,-1:word_suffix:sed
+1.238,+1:word_suffix:ths

Weight?,Feature
1.863,word_prefix:sec
1.328,+2:in_ord_gaz
1.114,+1:word_suffix:ion
1.027,in_ord_gaz
1.026,word_suffix:-
1.026,word_prefix:-
0.951,-1:not_alnum
0.909,+1:in_ord_gaz
0.65,-2:in_num_gaz
0.626,-2:postag:IN

Weight?,Feature
+4.087,word_suffix:ese
+3.590,istitle
+3.549,word_suffix:eni
+3.533,word_prefix:dem
+3.367,word_prefix:Dem
+3.154,word_suffix:ist
+3.148,word_suffix:aqi
+3.051,word_suffix:iph
+2.978,word_prefix:Pai
+2.925,word_suffix:ans

Weight?,Feature
+2.600,-1:word_prefix:Cen
+2.470,word_suffix:sts
+2.155,istitle
+2.046,-1:postag:DT
+1.863,-1:word_suffix:rth
+1.857,-1:word_suffix:New
+1.681,-1:word_prefix:Sou
+1.679,word_suffix:ine
+1.533,word_suffix:ers
+1.526,-1:word_suffix:uth


In [106]:
report(t4model, 200, 30, 4)

TRANSITION WEIGHTS LEARNT ON DATASET:



From \ To,B-PERSON,I-PERSON
B-PERSON,-6.301,2.141
I-PERSON,-6.056,1.166



BEST PERFORMING FEATURES FOR EACH NER TAG:



Weight?,Feature
Weight?,Feature
+4.622,word_suffix:ega
+4.179,word_prefix:Rum
+3.862,word_suffix:Tai
+3.791,word_prefix:Ta'
+3.682,word_prefix:Din
+3.676,-1:word_suffix:yor
+3.653,word_prefix:Uda
+3.418,word_prefix:Nad
+3.410,word_prefix:Aba
+3.398,-1:word_suffix:lor

Weight?,Feature
+4.622,word_suffix:ega
+4.179,word_prefix:Rum
+3.862,word_suffix:Tai
+3.791,word_prefix:Ta'
+3.682,word_prefix:Din
+3.676,-1:word_suffix:yor
+3.653,word_prefix:Uda
+3.418,word_prefix:Nad
+3.410,word_prefix:Aba
+3.398,-1:word_suffix:lor

Weight?,Feature
+3.054,-1:word_prefix:Mrs
+2.828,-1:word_prefix:Pre
+2.815,-1:word_prefix:ano
+2.696,-1:postag:DT
+2.667,-1:postag:PRP
+2.633,-1:word_prefix:Mis
+2.576,word_prefix:Zed
+2.365,-1:word_prefix:Mr.
+2.365,-1:word_suffix:Mr.
+2.203,-1:postag:CD


In [40]:
ontonotes_sentences = load_ontonotes("ontonotes_parsed.json")

In [68]:
def split_tags(tag_set, sentences):

    actual_tags = []
    for tag in tag_set:
        actual_tags.append("B-" + tag)
        actual_tags.append("I-" + tag)

    contains_tags = []
    doesnt_contain_tags = []

    for sent in sentences:

        if(any([tag in actual_tags for _,_,tag in sent])):
            contains_tags.append(sent)
        else:
            doesnt_contain_tags.append(sent)

    return (contains_tags, doesnt_contain_tags)

In [69]:
wpeople, wopeople = split_tags(["PERSON"], ontonotes_sentences)

In [83]:
#Found the problem!!
wpeople[6000]

[('To', 'TO', 'O'),
 ('overcome', 'VB', 'O'),
 ('this', 'DT', 'O'),
 (',', ',', 'O'),
 ('Mr.', 'NNP', 'O'),
 ('Codover', 'NNP', 'B-PERSON'),
 ("'s", 'POS', 'O'),
 ('team', 'NN', 'O'),
 ('injected', 'VBD', 'O'),
 ('a', 'DT', 'O'),
 ('gene', 'NN', 'O'),
 ('to', 'TO', 'O'),
 ('make', 'VB', 'O'),
 ('GDNF', 'NNP', 'O'),
 ('right', 'NN', 'O'),
 ('into', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('monkey', 'NN', 'O'),
 ("'s", 'POS', 'O'),
 ('brains', 'NNS', 'O'),
 ('.', '.', 'O')]