In [89]:
from __future__ import absolute_import, division, print_function, unicode_literals

import sys, codecs, json, math, time, warnings, re, logging
warnings.simplefilter( action='ignore', category=FutureWarning )

import nltk, numpy, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics

# Layout of every log record: level name, timestamp, then the message text.
LOG_FORMAT = ('%(levelname) -s %(asctime)s %(message)s')
# Logger used by the rest of this notebook, named after the current module.
logger = logging.getLogger( __name__ )
# Configure logging so that INFO records and above are emitted using LOG_FORMAT.
logging.basicConfig( level=logging.INFO, format=LOG_FORMAT )
logger.info('logging started')

INFO 2021-05-10 18:01:34,551 logging started


In [90]:
# When True, printfn() prints its argument and the CRF models are created with verbose=True.
verbose_on = False
# Passed as max_iterations to each CRF model.
max_iter = 150
# Upper bound on the number of training sentences sampled for the task 3 and task 4 models.
task3_sample_size = 17000
task4_sample_size = 17000

In [91]:
def printfn(thing):
    """Print ``thing``, but only while the module-level ``verbose_on`` flag is truthy."""
    if not verbose_on:
        return
    print(thing)

In [92]:
def clean_multilines(unclean):
    """Collapse the line breaks found inside a matched title.

    Each interior newline is handled in one of two ways:

    * it is removed when a space or another newline sits directly beside it;
    * otherwise it is replaced by a single space.

    The first and the last character of the input are never examined, so a
    leading or trailing newline is returned untouched.

    :param unclean: the raw title text
    :return: a new string with the interior newlines cleaned
    """
    chars = list(unclean)
    pos = 1
    last = len(chars) - 1

    while pos < last:
        if chars[pos] == '\n':
            neighbours = (chars[pos - 1], chars[pos + 1])
            if ' ' in neighbours or '\n' in neighbours:
                #removal shifts the rest left, so re-check the same position
                chars.pop(pos)
                last -= 1
                continue
            chars[pos] = ' '
        pos += 1

    return "".join(chars)

In [93]:
import re

In [94]:
def diff_num_systems(str1, str2):
    """Tell whether exactly one of the two identifiers consists purely of digits.

    Used to spot a switch between number systems, e.g. arabic versus roman numerals.
    """
    return str1.isdigit() != str2.isdigit()

def extract_toc(chapter_str):
    """Build a ``{chapter number text: chapter title}`` dictionary from a book.

    Every chapter heading in the text is matched in order. If the first chapter
    number shows up again, or the numbering switches between digits and roman
    numerals, the headings seen so far are treated as a table of contents and
    the dictionary is built from the later headings instead.

    :param chapter_str: the whole book as one string
    :return: dict mapping the chapter number text to its cleaned title;
             empty when the text contains no chapter heading at all
    """

    regex = "".join([

        #chapter marker
        r"(?:CHAPTER|Chapter)",

        #space between chapter and number
        r"[ ]*",

        #chapter number identifier, either digits or roman numerals
        r"([IVXLCDM]+|\d+)",

        #space between number then possibly a '.' symbol, then any number of new lines between
        #chapter/number and the title
        r"[ ]*\.?[ ]*\n*[ ]*",

        #the title: any run of characters in which a newline is never directly followed by
        #another newline, so it may span several lines; captured as the second group
        #NOTE(review): this group is greedy, so on a contents list with one entry per line
        #and no blank line between entries the first title appears to run on until the
        #next blank line -- confirm against the target books before relying on it
        r"((?:\n(?!\n)|.)*)",

        #end of a chapter declaration is 2 new lines or another chapter keyword (if in table of contents)
        r"(?:CHAPTER|Chapter|\n\n)"

    ])

    match_pairs = [mobj.groups() for mobj in re.finditer(regex, chapter_str, flags = re.UNICODE | re.MULTILINE)]

    #no heading anywhere in the text: there is nothing to list
    if not match_pairs:
        return {}

    first_num = match_pairs[0][0]

    #collect headings until the numbering restarts or changes system
    toc = {}
    body_start = len(match_pairs)
    for index, (num, title) in enumerate(match_pairs):
        if index > 0 and (num == first_num or diff_num_systems(num, first_num)):
            body_start = index
            break
        toc[num] = clean_multilines(title)

    #every heading was consumed, i.e. no separate table of contents was detected
    if body_start == len(match_pairs):
        return toc

    #the headings after the table of contents are the ones that appear in the book itself
    return {num: clean_multilines(title) for (num, title) in match_pairs[body_start:]}

In [95]:
def extract_questions(chapter_str):
    """Extract every question from a chapter that has been read into one string.

    A question must directly follow a sentence-ending or quotation mark
    (optionally separated from it by whitespace), start with a capital letter
    and run up to the first question mark without crossing a ``.``, ``!`` or
    ``?``. Line breaks are accepted between the preceding mark and the
    question, so a question that starts a new line of hard-wrapped text is
    found as well.

    :param chapter_str: the chapter text
    :return: set of question strings, with newlines inside a question replaced by spaces
    """
    #lookbehind: the character before the (optional) whitespace must end a sentence or open a quote
    #group 1: capital letter, any number of non-sentence-ending characters, then the question mark
    regex = r"(?<=[‘“\"\'\.\?\!])\s*([A-Z][^\?\.!]*\?)"
    questions = re.findall(regex, chapter_str, flags = re.UNICODE)
    return set(question.replace('\n', ' ') for question in questions)

In [96]:
from nltk import sent_tokenize, word_tokenize, pos_tag

In [97]:
def gen_tok_NER_pair_lists(tag_list, sentence_with_unknown_NER, NER_predictions):
    """Group the predicted tags of one sentence into ``(entity text, tag)`` pairs.

    The sentence and the predictions are expected to have the same length, the
    predictions having been made from that sentence. A run of consecutive
    tokens whose predictions share the same entity type (the prediction with
    its two-character ``B-``/``I-`` prefix removed) becomes one entity, its
    tokens joined by single spaces.

    :param tag_list: accepted for interface compatibility; not used here
    :param sentence_with_unknown_NER: list of ``(token, pos tag, ner tag)`` tuples
    :param NER_predictions: list of predicted tags, one per token
    :return: list of ``(entity text, entity type)`` tuples in sentence order
    """
    tokens = [tok for (tok, _, _) in sentence_with_unknown_NER]
    pair_list = []

    i = 0
    while i < len(NER_predictions):

        if NER_predictions[i] == "O":
            i += 1
            continue

        #remove the B- or I-
        actual_tag = NER_predictions[i][2:]

        #the indices correspond, so the entity starts with the token at this position
        entity_tokens = [tokens[i]]
        i += 1

        while i < len(NER_predictions) and NER_predictions[i][2:] == actual_tag:
            entity_tokens.append(tokens[i])
            i += 1

        pair_list.append((" ".join(entity_tokens), actual_tag))
        #i already points at the first token after the entity; advancing again here
        #would skip it and lose an entity that starts immediately afterwards

    return pair_list

def gen_NER_dict(tag_list, sentences, sentence_NER_predictions):
    """Collect the distinct entities predicted for each requested entity type.

    :param tag_list: entity types to report, e.g. ``["DATE", "NORP"]``
    :param sentences: list of sentences, each a list of ``(token, pos tag, ner tag)`` tuples
    :param sentence_NER_predictions: list of predicted tag sequences, one per sentence
    :return: dict with one key per entry of ``tag_list`` (in that order), each mapped
             to a sorted list of the distinct lower-cased entities found for it
    """
    found = {tag : set() for tag in tag_list}

    for (sentence, predictions) in zip(sentences, sentence_NER_predictions):
        for (entity, tag) in gen_tok_NER_pair_lists(tag_list, sentence, predictions):
            #a predicted type that was not asked for is ignored rather than raising KeyError
            if tag in found:
                found[tag].add(entity.lower())

    #sorted so the serialised output is the same from run to run
    return {tag : sorted(entities) for (tag, entities) in found.items()}

def prepare_sentence(tokens):
    """Turn a list of tokens into a sentence that can be fed to the NER models.

    :param tokens: list of token strings
    :return: list of ``(token, pos tag, "O")`` tuples, the pos tags coming from ``pos_tag``
    """
    return [(token, tagged[1], "O") for (token, tagged) in zip(tokens, pos_tag(tokens))]

def without_chapter(chapter_str):
    """Strip a leading "CHAPTER <number>" heading, and the title after it, from a chapter.

    :param chapter_str: the chapter text
    :return: the text after the heading; the text unchanged when it does not
             start with such a heading
    """
    #optional leading blank space, the chapter keyword and its number (roman or digits),
    #then the remainder of the heading up to and including the next two line breaks
    regex = r"(\n|[ ])*.*(?:CHAPTER|(C|c)hapter)[ ]*\.?(?:[ivxlcdm]+|[IVXLCDM]+|\d+)[ ]*\.?(\n|[ ])*.*\n.*\n(\n|[ ])*"
    result = re.match(regex, chapter_str, flags = re.MULTILINE | re.UNICODE)

    #no heading at the start: there is nothing to remove
    if result is None:
        return chapter_str

    #re.match anchors at position 0, so the end of the match is the heading length
    return chapter_str[result.end():]

def load_ontonotes(dataset_file):
    """Read the parsed ontonotes JSON file and return its sentences.

    The JSON is read as ``{file: {sentence index: sentence}}`` where a sentence
    has a ``'tokens'`` list and optionally an ``'ne'`` dict whose entries carry
    the ``'tokens'`` (indices) and the ``'type'`` of one named entity. An
    ``'ne'`` dict containing a ``'parse_error'`` key is ignored.

    :param dataset_file: path of the ontonotes JSON file (read as UTF-8)
    :return: list of sentences, each a list of ``(token, nltk pos tag, IOB tag)`` tuples
    """
    #the with-block closes the file even when reading fails
    with codecs.open(dataset_file, 'r', 'utf-8', errors = 'replace') as read_handle:
        dict_ontonotes = json.loads(read_handle.read())

    sentences = []

    for str_file in dict_ontonotes:
        for str_sent_index in dict_ontonotes[str_file]:
            sentence = dict_ontonotes[str_file][str_sent_index]
            tokens = list(sentence['tokens'])

            #the named entities of this sentence, in file order (none on a parse error)
            entities = []
            if 'ne' in sentence and 'parse_error' not in sentence['ne']:
                entities = list(sentence['ne'].values())

            # compute IOB tags for named entities (if any)
            ner_tags = []
            ne_type_last = None
            for token_index in range(len(tokens)):

                #the first entity that lists this token index decides its type
                ne_type = None
                for entity in entities:
                    if token_index in entity['tokens']:
                        ne_type = entity['type']
                        break

                #NOTE(review): the type alone decides between B- and I-, so two different
                #entities of the same type that sit next to each other are labelled as
                #one entity -- confirm whether that is intended
                if ne_type is None:
                    ner_tags.append('O')
                elif ne_type == ne_type_last:
                    ner_tags.append('I-' + ne_type)
                else:
                    ner_tags.append('B-' + ne_type)

                ne_type_last = ne_type

            #use nltk pos tags instead of the ones stored in the file
            sentences.append([(tok, pos, ner) for ((tok, pos), ner) in zip(pos_tag(tokens), ner_tags)])

    return sentences



In [98]:
from nltk.corpus import names
from copy import copy
from random import shuffle
import gc

In [99]:
#Entity types extracted by each model; the order fixes the key order of the result.
_TASK3_TAGS = ["CARDINAL", "ORDINAL", "DATE", "NORP"]
_TASK4_TAGS = ["PERSON"]

#Hand-written gazetteers used as boolean features by the task 3 model.
#Python sets are hash tables, so a lookup stays cheap even for a large gazetteer.
_NUMBER_GAZETTEER = set([
    "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty",
    "sixty", "seventy", "eighty", "ninety", "hundred", "onehundred", "one-hundred", "thousand", "million"
])

_ORDINAL_GAZETTEER = set([
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth",
    "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth",
    "nineteenth", "twentieth", "thirtieth", "fortieth", "fiftieth",
    "sixtieth", "seventieth", "eightieth", "ninetieth", "hundredth", "thousandth",
    "millionth", "firstly", "secondly", "thirdly", "fourthly", "fifthly", "sixthly",
    "seventhly", "eighthly", "ninthly", "tenthly"
])

_DATE_GAZETTEER = set([
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "day", "days",
    "week", "weeks", "month", "months", "year", "years", "decade", "decades", "century", "centuries",
    "millennium", "millennia", "jan", "january", "feb", "february", "mar", "march", "apr", "april",
    "may", "jun", "june", "jul", "july", "aug", "august", "sep", "september", "oct", "october",
    "nov", "november", "dec", "december", "holiday", "holidays"
])

_TIME_GAZETTEER = set(["second", "seconds", "minute", "minutes", "hour", "hours", "time", "clock", "o'clock", "past", "oclock"])

#(offset from the current token, feature-name prefix, whether the extra features are added too)
#extra features are limited to the centre 3 tokens of the 5 token context window
_CONTEXT_WINDOW = [
    (0, "", True),
    (-1, "-1:", True),
    (-2, "-2:", False),
    (1, "+1:", True),
    (2, "+2:", False),
]


def _is_st_th(candidate):
    """Detect tokens such as 1st, 2nd, 3rd, 4th: digits followed by an ordinal suffix."""
    return candidate[-2:] in ("st", "nd", "rd", "th") and candidate[:-2].isdigit()


def _task3_base_features(token, postag, prefix):
    """Features generated for every token of the task 3 context window."""
    return {
        prefix + "word.lower()": token.lower(),
        prefix + "word_prefix": token[:3],
        prefix + "word_suffix": token[-3:],
        prefix + "postag": postag,
        prefix + "isdigit": token.isdigit()
    }


def _task3_extra_features(token, postag, prefix):
    """Features generated only for the centre 3 tokens of the task 3 context window."""
    lowered = token.lower()
    return {
        prefix + "not_alnum": not token.isalnum(),
        prefix + "istitle": token.istitle(),
        prefix + "isupper": token.isupper(),
        prefix + "in_num_gaz": lowered in _NUMBER_GAZETTEER,
        prefix + "in_ord_gaz": lowered in _ORDINAL_GAZETTEER,
        prefix + "in_date_gaz": lowered in _DATE_GAZETTEER,
        prefix + "in_time_gaz": lowered in _TIME_GAZETTEER,
        prefix + "is_st_th": _is_st_th(token)
    }


def _task4_base_features(token, postag, prefix):
    """Features generated for every token of the task 4 context window."""
    return {
        prefix + "word.lower()": token.lower(),
        prefix + "word_prefix": token[:3],
        prefix + "word_suffix": token[-3:],
        prefix + "postag": postag,
        prefix + "is_mr_mrs": token[:2].lower() == "mr" or token[:3].lower() == "mrs",
        prefix + "not_alnum": not token.isalnum(),
        prefix + "is_single_letter": token.isalpha() and len(token) == 1
    }


def _task4_extra_features(token, postag, prefix):
    """Features generated only for the centre 3 tokens of the task 4 context window."""
    return {
        prefix + "istitle": token.istitle(),
        prefix + "isupper": token.isupper()
    }


def _sentence_features(sentence, base_features, extra_features):
    """Generate one feature dictionary per token of ``sentence``.

    Each dictionary holds the features of the token itself and of up to two
    tokens on either side of it (fewer near the sentence boundaries).
    """
    per_token = []
    for i in range(len(sentence)):
        features = {}
        for (offset, prefix, with_extra) in _CONTEXT_WINDOW:
            j = i + offset
            #the neighbour falls outside the sentence
            if j < 0 or j >= len(sentence):
                continue
            token, postag = sentence[j][0], sentence[j][1]
            features.update(base_features(token, postag, prefix))
            if with_extra:
                features.update(extra_features(token, postag, prefix))
        per_token.append(features)
    return per_token


def _training_subset(tags, sentences):
    """Keep the sentences that contain at least one of ``tags``.

    Give just the tag names; the B- and I- forms are derived here. In the
    sentences that are kept, every other NER tag is turned into ``'O'``
    because the model being trained does not care about finding it.
    """
    wanted = set()
    for tag in tags:
        wanted.add("B-" + tag)
        wanted.add("I-" + tag)

    subset = []
    for sent in sentences:
        if any(ner in wanted for (_, _, ner) in sent):
            subset.append([(tok, pos, ner if ner in wanted else 'O') for (tok, pos, ner) in sent])
    return subset


def _random_sample(sentences, num):
    """Take a random sample of ``num`` sentences; all of them when fewer are available."""
    if num > len(sentences):
        return sentences

    shuffled = copy(sentences)
    shuffle(shuffled)
    #an explicit start index, because shuffled[-0:] would be the whole list for num == 0
    return shuffled[len(shuffled) - num:]


def _train_and_tag(tag_list, base_features, extra_features, training_sentences, sentences_to_predict):
    """Train a CRF on ``training_sentences`` and return the entities it finds.

    Reads the module-level ``max_iter`` and ``verbose_on`` settings.

    :return: the ``gen_NER_dict`` result for ``tag_list`` over ``sentences_to_predict``
    """
    feature_lists = [_sentence_features(sent, base_features, extra_features) for sent in training_sentences]
    label_lists = [[label for (_, _, label) in sent] for sent in training_sentences]
    features_to_predict = [_sentence_features(sent, base_features, extra_features) for sent in sentences_to_predict]

    crf = sklearn_crfsuite.CRF(
        algorithm = "lbfgs",
        c1 = 1,
        c2 = 0.149853957,
        max_iterations = max_iter,
        all_possible_transitions = True,
        verbose = verbose_on
    )
    crf.fit(feature_lists, label_lists)
    predictions = crf.predict(features_to_predict)

    return gen_NER_dict(tag_list, sentences_to_predict, predictions)


def run_NER(sentences, sentences_to_predict):
    """Train the task 3 and task 4 CRF models and tag the chapter sentences.

    One model is trained for DATE / CARDINAL / ORDINAL / NORP (task 3) and a
    second one for PERSON (task 4). Each is trained on a random sample of the
    ontonotes sentences that contain at least one of its entity types; the
    sample sizes come from the module-level ``task3_sample_size`` and
    ``task4_sample_size``.

    :param sentences: the ontonotes sentences, each a list of
                      ``(token, pos tag, IOB tag)`` tuples
    :param sentences_to_predict: the chapter sentences in the same tuple format
    :return: one dict combining the entities found by both models, keyed by entity type
    """
    task3_training = _random_sample(_training_subset(_TASK3_TAGS, sentences), task3_sample_size)
    task3dict = _train_and_tag(_TASK3_TAGS, _task3_base_features, _task3_extra_features,
                               task3_training, sentences_to_predict)

    #release the task 3 training data before the task 4 data is built
    del task3_training
    gc.collect()

    task4_training = _random_sample(_training_subset(_TASK4_TAGS, sentences), task4_sample_size)
    task4dict = _train_and_tag(_TASK4_TAGS, _task4_base_features, _task4_extra_features,
                               task4_training, sentences_to_predict)

    del task4_training
    gc.collect()

    #combine into one dict as required
    task3dict.update(task4dict)

    return task3dict

In [100]:
def exec_ner( file_chapter = None, ontonotes_file = None ) :
    """Run subtasks 3 and 4 on one chapter and serialise the answers.

    A CRF NER model is trained on the ontonotes data and used to tag the
    chapter. The PERSON entities are written to ``characters.txt`` (subtask 4)
    and the DATE / CARDINAL / ORDINAL / NORP entities to ``ne.json`` (subtask 3).

    :param file_chapter: path of the plain text chapter file
    :param ontonotes_file: path of the parsed ontonotes JSON file
    """

    #read as UTF-8 like every other file this notebook touches, not the locale default
    with open(file_chapter, 'r', encoding = 'utf-8', errors = 'replace') as f:
        chapter_str = without_chapter(f.read())

    sentences_to_predict = [
        prepare_sentence(word_tokenize(sent))
        for sent in sent_tokenize(chapter_str)
    ]

    ontonotes_sentences = load_ontonotes(ontonotes_file)

    dictNE = run_NER(ontonotes_sentences, sentences_to_predict)

    # DO NOT CHANGE THE BELOW CODE WHICH WILL SERIALIZE THE ANSWERS FOR THE AUTOMATED TEST HARNESS TO LOAD AND MARK

    # write out all PERSON entries for character list for subtask 4
    writeHandle = codecs.open( 'characters.txt', 'w', 'utf-8', errors = 'replace' )
    if 'PERSON' in dictNE :
        for strNE in dictNE['PERSON'] :
            writeHandle.write( strNE.strip().lower()+ '\n' )
    writeHandle.close()

    # FILTER NE dict by types required for subtask 3
    listAllowedTypes = [ 'DATE', 'CARDINAL', 'ORDINAL', 'NORP' ]
    listKeys = list( dictNE.keys() )
    for strKey in listKeys :
        for nIndex in range(len(dictNE[strKey])) :
            dictNE[strKey][nIndex] = dictNE[strKey][nIndex].strip().lower()
        if not strKey in listAllowedTypes :
            del dictNE[strKey]

    # write filtered NE dict
    writeHandle = codecs.open( 'ne.json', 'w', 'utf-8', errors = 'replace' )
    strJSON = json.dumps( dictNE, indent=2 )
    writeHandle.write( strJSON + '\n' )
    writeHandle.close()

In [101]:
def exec_regex_toc( file_book = None ) :
    """Run subtask 1: extract the table of contents of a book into ``toc.json``.

    :param file_book: path of the plain text file holding the whole book
    """

    #read as UTF-8 like every other file this notebook touches, not the locale default
    with open(file_book, 'r', encoding = 'utf-8', errors = 'replace') as f:
        book_str = f.read()

    dictTOC = extract_toc(book_str)

    # DO NOT CHANGE THE BELOW CODE WHICH WILL SERIALIZE THE ANSWERS FOR THE AUTOMATED TEST HARNESS TO LOAD AND MARK

    writeHandle = codecs.open( 'toc.json', 'w', 'utf-8', errors = 'replace' )
    strJSON = json.dumps( dictTOC, indent=2 )
    writeHandle.write( strJSON + '\n' )
    writeHandle.close()


In [102]:
def exec_regex_questions( file_chapter = None ) :
    """Run subtask 2: extract the questions of a chapter into ``questions.txt``.

    The output file holds one question per line.

    :param file_chapter: path of the plain text chapter file
    """

    #read as UTF-8 like every other file this notebook touches, not the locale default
    with open(file_chapter, 'r', encoding = 'utf-8', errors = 'replace') as f:
        chapter_str = f.read()

    setQuestions = extract_questions(chapter_str)

    writeHandle = codecs.open( 'questions.txt', 'w', 'utf-8', errors = 'replace' )
    for strQuestion in setQuestions :
        writeHandle.write( strQuestion + '\n' )
    writeHandle.close()

In [103]:
if __name__ == '__main__':
    # Everything below stays inside the guard so that importing this module
    # neither reads sys.argv nor starts the pipeline.
    if len(sys.argv) < 4 :
        raise Exception( 'missing command line args : ' + repr(sys.argv) )

    ontonotes_file = sys.argv[1]
    book_file = sys.argv[2]
    chapter_file = sys.argv[3]

    logger.info( 'ontonotes = ' + repr(ontonotes_file) )
    logger.info( 'book = ' + repr(book_file) )
    logger.info( 'chapter = ' + repr(chapter_file) )

    # DO NOT CHANGE THE CODE IN THIS FUNCTION

    #
    # subtask 1 >> extract chapter headings and create a table of contents from a provided plain text book (from www.gutenberg.org)
    # Input >> www.gutenberg.org sourced plain text file for a whole book
    # Output >> toc.json = { <chapter_number_text> : <chapter_title_text> }
    #

    exec_regex_toc( book_file )

    #
    # subtask 2 >> extract every question from a provided plain text chapter of text
    # Input >> www.gutenberg.org sourced plain text file for a chapter of a book
    # Output >> questions.txt = plain text set of extracted questions. one line per question.
    #

    exec_regex_questions( chapter_file )

    #
    # subtask 3 (NER) >> train NER using ontonotes dataset, then extract DATE, CARDINAL, ORDINAL, NORP entities from a provided chapter of text
    # Input >> www.gutenberg.org sourced plain text file for a chapter of a book
    # Output >> ne.json = { <ne_type> : [ <phrase>, <phrase>, ... ] }
    #
    # subtask 4 (text classifier) >> compile a list of characters from the target chapter
    # Input >> www.gutenberg.org sourced plain text file for a chapter of a book
    # Output >> characters.txt = plain text set of extracted character names. one line per character name.
    #

    exec_ner( chapter_file, ontonotes_file )

Exception: missing command line args : ['/home/george/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/home/george/.local/share/jupyter/runtime/kernel-98842567-b82b-401b-92b9-cff3e3fedd39.json']

In [104]:
#REMOVE THIS CELL FROM HANDIN
def do_testing(ontonotes_file = "ontonotes_parsed.json", book_file = "eval_book.txt", chapter_file = "eval_chapter.txt"):
    """Run all subtasks on local evaluation files instead of command line arguments.

    :param ontonotes_file: path of the parsed ontonotes JSON file
    :param book_file: path of the plain text file holding a whole book
    :param chapter_file: path of the plain text file holding one chapter
    """
    logger.info( 'ontonotes = ' + repr(ontonotes_file) )
    logger.info( 'book = ' + repr(book_file) )
    logger.info( 'chapter = ' + repr(chapter_file) )

    exec_regex_toc( book_file )
    exec_regex_questions( chapter_file )
    exec_ner( chapter_file, ontonotes_file )

do_testing()

INFO 2021-05-10 18:01:59,953 ontonotes = 'ontonotes_parsed.json'
INFO 2021-05-10 18:01:59,955 book = 'eval_book.txt'
INFO 2021-05-10 18:01:59,957 chapter = 'eval_chapter.txt'


KeyboardInterrupt: 