In [1]:
# Standard-library imports. FutureWarning messages are ignored from here on;
# the filter is installed before the third-party imports below (presumably so
# that import-time FutureWarnings from those packages are hidden too).
import sys, codecs, json, math, time, warnings
warnings.simplefilter( action='ignore', category=FutureWarning )

# Third-party NLP / CRF / ML packages, plus plotting and rich-display helpers.
import nltk, scipy, sklearn, sklearn_crfsuite, sklearn_crfsuite.metrics, eli5
from sklearn.metrics import make_scorer
from collections import Counter
import matplotlib.pyplot as plt
from IPython.display import display    

# Logging setup: give the absl handler a custom record format
# (level | file:line timestamp | message) and set TensorFlow's logger to INFO.
import logging
import tensorflow as tf
import absl.logging
formatter = logging.Formatter('[%(levelname)s|%(filename)s:%(lineno)s %(asctime)s] %(message)s')
absl.logging.get_absl_handler().setFormatter(formatter)
# NOTE(review): private absl attribute; presumably disables absl's
# "logging before initialisation goes to stderr" warning -- confirm against the
# installed absl version, as private names may change without notice.
absl.logging._warn_preinit_stderr = False
logger = tf.get_logger()
logger.setLevel(logging.INFO)

In [9]:
from nltk import pos_tag

In [None]:
# Notebook-wide settings.

# Iteration cap; the code that consumes it is not part of this cell.
max_iter = 150

# IOB labels (begin / inside) for the entity types of interest:
# DATE, CARDINAL, ORDINAL and NORP.
display_label_subset = [
    iob_prefix + entity_type
    for entity_type in ("DATE", "CARDINAL", "ORDINAL", "NORP")
    for iob_prefix in ("B-", "I-")
]


In [36]:
#load ALL the sentences from the ontonotes dataset without splitting

#ignore the POS tags in the ontonotes dataset as we are going to use
#NLTK POS tags instead so that the training instances are the same
#as when we use with the book which will use the NLTK function to
#determine POS tags of tokens
def _sentence_iob_tags(dict_sent):
    """Return ``(tokens, ner_tags)`` for one parsed sentence entry.

    ``dict_sent`` must contain a ``'tokens'`` list. If it also contains an
    ``'ne'`` dict without a ``'parse_error'`` key, each value of that dict is
    read as an entity with a ``'type'`` and a ``'tokens'`` list of token
    indices. Tokens outside every entity are tagged ``'O'``.

    The first token of an entity is tagged ``'B-<type>'`` and the following
    tokens of the *same entity* ``'I-<type>'``. Continuity is decided by
    entity key, not entity type, so two different entities of the same type
    that sit next to each other each start with their own ``'B-'`` tag.
    """
    tokens = list(dict_sent['tokens'])

    # Map token index -> key of the first entity (in dict order) that contains
    # it. Built once per sentence instead of re-scanning all entities per token.
    dict_ne = {}
    token_to_entity = {}
    if 'ne' in dict_sent and 'parse_error' not in dict_sent['ne']:
        dict_ne = dict_sent['ne']
        for ne_key in dict_ne:
            for token_index in dict_ne[ne_key]['tokens']:
                token_to_entity.setdefault(token_index, ne_key)

    ner_tags = []
    last_entity = None
    for token_index in range(len(tokens)):
        ne_key = token_to_entity.get(token_index)
        ne_type = dict_ne[ne_key]['type'] if ne_key is not None else None
        if ne_type is None:
            # Not inside any (typed) entity.
            ner_tags.append('O')
            last_entity = None
        else:
            prefix = 'I-' if ne_key == last_entity else 'B-'
            ner_tags.append(prefix + ne_type)
            last_entity = ne_key
    return tokens, ner_tags


def load_ontonotes_dataset(max_files=None, dataset_file='ontonotes_parsed.json'):
    """Load sentences from the parsed OntoNotes JSON file, without splitting.

    The POS tags stored in the dataset are deliberately ignored. Tokens are
    re-tagged with NLTK's ``pos_tag`` so that training instances carry the same
    kind of POS tags that will be produced later for the book text.

    Parameters
    ----------
    max_files : int or None
        Only the first ``max_files`` files (in JSON key order) are read.
        ``None`` reads every file.
    dataset_file : str
        Path of the JSON file. Layout as read by this function:
        ``{file: {sentence_index: {'tokens': [...], 'ne': {...}}}}``.

    Returns
    -------
    list of list of (token, pos_tag, iob_tag) tuples, one inner list per
    sentence.

    Side effects: prints the number of files available and selected, and one
    progress line per file.
    """
    # The context manager closes the handle even if reading or parsing fails.
    with open(dataset_file, 'r', encoding='utf-8', errors='replace') as read_handle:
        dict_ontonotes = json.load(read_handle)

    orig_list = list(dict_ontonotes.keys())
    # Slicing with None keeps the whole list; a limit larger than the list is harmless.
    final_list = orig_list[:max_files]

    print("original list length:", len(orig_list))
    print("final list length:", len(final_list))

    sentences = []
    for file_number, str_file in enumerate(final_list):
        print("on file number:", file_number)
        for str_sent_index in dict_ontonotes[str_file]:
            tokens, ner_tags = _sentence_iob_tags(dict_ontonotes[str_file][str_sent_index])
            # Use NLTK POS tags instead of the ones shipped with the dataset.
            sentences.append([
                (tagged[0], tagged[1], ner_tags[index])
                for index, tagged in enumerate(pos_tag(tokens))
            ])
    return sentences

In [37]:
# Read only the first 500 dataset files (a subset of the corpus) to keep loading time manageable.
sentences = load_ontonotes_dataset(max_files=500)

original list length: 13109
final list length: 500
on file number: 0
on file number: 1
on file number: 2
on file number: 3
on file number: 4
on file number: 5
on file number: 6
on file number: 7
on file number: 8
on file number: 9
on file number: 10
on file number: 11
on file number: 12
on file number: 13
on file number: 14
on file number: 15
on file number: 16
on file number: 17
on file number: 18
on file number: 19
on file number: 20
on file number: 21
on file number: 22
on file number: 23
on file number: 24
on file number: 25
on file number: 26
on file number: 27
on file number: 28
on file number: 29
on file number: 30
on file number: 31
on file number: 32
on file number: 33
on file number: 34
on file number: 35
on file number: 36
on file number: 37
on file number: 38
on file number: 39
on file number: 40
on file number: 41
on file number: 42
on file number: 43
on file number: 44
on file number: 45
on file number: 46
on file number: 47
on file number: 48
on file number: 49
on file n

on file number: 435
on file number: 436
on file number: 437
on file number: 438
on file number: 439
on file number: 440
on file number: 441
on file number: 442
on file number: 443
on file number: 444
on file number: 445
on file number: 446
on file number: 447
on file number: 448
on file number: 449
on file number: 450
on file number: 451
on file number: 452
on file number: 453
on file number: 454
on file number: 455
on file number: 456
on file number: 457
on file number: 458
on file number: 459
on file number: 460
on file number: 461
on file number: 462
on file number: 463
on file number: 464
on file number: 465
on file number: 466
on file number: 467
on file number: 468
on file number: 469
on file number: 470
on file number: 471
on file number: 472
on file number: 473
on file number: 474
on file number: 475
on file number: 476
on file number: 477
on file number: 478
on file number: 479
on file number: 480
on file number: 481
on file number: 482
on file number: 483
on file number: 484


In [44]:
# Spot-check one loaded sentence: each entry is a (token, NLTK POS tag, IOB NER tag) tuple.
sentences[2]

[('The', 'DT', 'O'),
 ('world', 'NN', 'O'),
 ("'s", 'POS', 'O'),
 ('fifth', 'JJ', 'B-ORDINAL'),
 ('Disney', 'NNP', 'B-ORG'),
 ('park', 'NN', 'O'),
 ('will', 'MD', 'O'),
 ('soon', 'RB', 'O'),
 ('open', 'VB', 'O'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'O'),
 ('public', 'NN', 'O'),
 ('here', 'RB', 'O'),
 ('.', '.', 'O')]