# Knowledge Graph construction from unstructured text.

## Literature Review.
## Code from key research papers.

### End-to-End construction of NLP Knowledge Graph" published in ACL-IJCNLP 2021
* https://github.com/Ishani-Mondal/SciKG

In [None]:
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, FlairEmbeddings, TransformerWordEmbeddings
embedding = TransformerWordEmbeddings('allenai/scibert_scivocab_uncased')

f=open('./data/Sameas_Hyp_Entire_Test.txt')
content=f.read()

f1=open('./data/Final_big_test.txt','w')
f1.write('sentences1'+"\t"+'type1'+'\tsentences2\t'+'type2'+"\tis_similar"+"\n")
for line in content.split("\n"):
    if(line!=""):
        if(line.split("\t")[0]!='sentences1'):
            sentence1=Sentence(line.split("\t")[0])
            embedding.embed(sentence1)
            avg1=[]
            for token in sentence1:
                #print(token.embedding)
                avg1.append(token.embedding)

            avg1=sum(avg1)
            
            sentence2=Sentence(line.split("\t")[2])
            embedding.embed(sentence2)
            avg2=[]
            for token in sentence2:
                avg2.append(token.embedding)

            avg2=sum(avg2)
            if(float(np.dot(np.array(avg1), np.array(avg2))/(norm(np.array(avg1))*norm(np.array(avg2))))>0.85):
                f1.write(line.split("\t")[0]+"\t"+line.split("\t")[1]+"\t"+line.split("\t")[2]+"\t"+line.split("\t")[3]+"\t"+line.split("\t")[4]+"\n")

### Scalable Knowledge Graph Construction from Text Collections

https://dstlry.github.io/

### {TDMS}ci: A Specialized Corpus for Scientific Literature Entity Tagging of Tasks Datasets and Metrics

https://arxiv.org/abs/2101.10273

https://github.com/IBM/science-result-extractor

model training script: https://github.com/IBM/science-result-extractor/blob/master/data/TDMSci/modelTrainingScript/training_tdm.py

In [2]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings, CharacterEmbeddings, BertEmbeddings
from typing import List


# define columns
columns = {0: 'text', 1: 'pos', 2: 'ner'} 

# this is the folder in which train, test and dev files reside
data_folder = '../conllFormat/'

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train_1500_v2.conll',
                              test_file='dev_150_v2.conll',
                              dev_file='test_500_v2.conll')



# 2. what tag do we want to predict?
tag_type = 'ner'



# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)


# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    BertEmbeddings('bert-base-cased'), 	
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
trainer.train('../model/flair/tdm-bert',
              learning_rate=0.1,
              mini_batch_size=16,
              max_epochs=150)


ModuleNotFoundError: No module named 'flair'

### Seq2KG: An End-to-End Neural Model for Domain Agnostic Knowledge Graph (not Text Graph) Construction from Text

* https://proceedings.kr.org/2020/77/kr2020-0077-stewart-et-al.pdf
* https://github.com/Michael-Stewart-Webdev/Seq2KG

In [3]:
## JOINT MODEL

import data_utils as dutils
from data_utils import load_documents
from data_utils import Vocab, CategoryHierarchy, EntityTypingDataset, batch_to_wordpieces, wordpieces_to_bert_embs
from logger import logger
from model import E2EETModel
import torch.optim as optim

from progress_bar import ProgressBar
import time, json, sys
import torch
from evaluate import ModelEvaluator
import numpy as np
from sinusoidal_positional_embeddings import *
import pandas as pd

from bert_serving.client import BertClient

from config import Config, device
cf = Config()

torch.manual_seed(125)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(125)
torch.cuda.manual_seed(125)



# Train the model, evaluating it every 10 epochs.
def train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy_tr, hierarchy_et, ground_truth_triples, epoch_start = 1):

	logger.info("Training model.")
	
	# Set up a new Bert Client, for encoding the wordpieces
	bc = BertClient()


	modelEvaluator = ModelEvaluator(model, data_loaders['dev'], word_vocab, wordpiece_vocab, hierarchy_tr, hierarchy_et, ground_truth_triples, cf)
	
	#optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE, momentum=0.9)
	optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=cf.LEARNING_RATE)#, momentum=0.9)
	model.cuda()
	print(cf.LEARNING_RATE)

	num_batches = len(data_loaders["train"])
	max_epochs = 1000
	progress_bar = ProgressBar(num_batches = num_batches, max_epochs = max_epochs, logger = logger)
	avg_loss_list = []

	# Train the model

	for epoch in range(epoch_start, max_epochs + 1):
		epoch_start_time = time.time()
		epoch_losses = []	

		for (i, (batch_x, batch_y_tr, batch_y_et, batch_z_tr, batch_z_et, _, batch_tx, _, _, _)) in enumerate(data_loaders["train"]):
		
			if len(batch_x) < cf.BATCH_SIZE:
				continue
		
			# 1. Convert wordpiece ids into wordpiece tokens
			wordpieces = batch_to_wordpieces(batch_x, wordpiece_vocab)
			wordpiece_embs  = wordpieces_to_bert_embs(wordpieces, bc)

			# 2. Create sin embeddings and concatenate them to the bert embeddings


			wordpiece_embs = wordpiece_embs.to(device)
			batch_y_tr = batch_y_tr.float().to(device)
			batch_y_et = batch_y_et.float().to(device)
			batch_z = batch_z_tr.float().to(device)

			# 3. Feed these vectors to our model
			
			if cf.POSITIONAL_EMB_DIM > 0:
				sin_embs = SinusoidalPositionalEmbedding(embedding_dim=cf.POSITIONAL_EMB_DIM, padding_idx = 0, left_pad = True)
				sin_embs = sin_embs( torch.ones([batch_x.size()[0], batch_x.size()[1]])).to(device)
				joined_embs = torch.cat((wordpiece_embs, sin_embs), dim=2)
			else:
				joined_embs = wordpiece_embs

			# if len(batch_x) < cf.BATCH_SIZE:
			# 	zeros = torch.zeros((cf.BATCH_SIZE - len(batch_x), joined_embs.size()[1], joined_embs.size()[2])).to(device)
			# 	joined_embs = torch.cat((joined_embs, zeros), dim=0)
			# 	print(joined_embs)
			# 	print(joined_embs.size())

			model.zero_grad()
			model.train()

			y_hat_tr, y_hat_et = model(joined_embs)

			loss = model.calculate_loss(y_hat_tr, y_hat_et, batch_x, batch_y_tr, batch_y_et, batch_z)

			# 4. Backpropagate
			loss.backward()
			optimizer.step()
			epoch_losses.append(loss)

			# 5. Draw the progress bar
			progress_bar.draw_bar(i, epoch, epoch_start_time)

		avg_loss = sum(epoch_losses) / float(len(epoch_losses))
		avg_loss_list.append(avg_loss)

		progress_bar.draw_completed_epoch(avg_loss, avg_loss_list, epoch, epoch_start_time)

		modelEvaluator.evaluate_every_n_epochs(1, epoch)




# Convert the ground truth triples csv into a list of lists.
def parse_ground_truth_triples(df):

	ground_truth_triples = []

	current_sent_index = 0
	current_triples = []

	num_dev_documents = len(load_documents(cf.DEV_FILENAME))

	ground_truth_triples_dict = { k: [] for k in range(num_dev_documents) }


	for i, row in enumerate(df.itertuples()):
		sent_index = int(getattr(row, 'index'))
		head = getattr(row, 's1').split()
		rel  = str(getattr(row, 'r')).split()
		tail = getattr(row, 's2').split()
		if sent_index not in ground_truth_triples_dict:
			ground_truth_triples_dict[sent_index] = []
		ground_truth_triples_dict[sent_index].append([head, rel, tail])

	for k in range(num_dev_documents):
		ground_truth_triples.append([])
		for t in ground_truth_triples_dict[k]:
			ground_truth_triples[-1].append(t)
	
	return ground_truth_triples

def main(opts):


	if len(opts) == 0:
		raise ValueError("Usage: train.py <dataset>")
	dataset = opts[0]
	if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
		raise ValueError("Dataset must be either cateringServices, automotiveEngineering, or bbn.")

	cf.load_config(dataset)

	logger.info("Loading files...")


	data_loaders = dutils.load_obj_from_pkl_file('data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
	word_vocab = dutils.load_obj_from_pkl_file('word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
	wordpiece_vocab = dutils.load_obj_from_pkl_file('wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
	hierarchy_tr = dutils.load_obj_from_pkl_file('hierarchy_tr', cf.ASSET_FOLDER + '/hierarchy_tr.pkl')
	hierarchy_et = dutils.load_obj_from_pkl_file('hierarchy_et', cf.ASSET_FOLDER + '/hierarchy_et.pkl')
	total_wordpieces = dutils.load_obj_from_pkl_file('total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')
	
	ground_truth_triples_df = pd.read_csv(cf.GROUND_TRUTH_TRIPLES_FILE)

	ground_truth_triples = parse_ground_truth_triples(ground_truth_triples_df)
	

	logger.info("Building model.")
	model = E2EETModel(	embedding_dim = cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
						hidden_dim = cf.HIDDEN_DIM,
						vocab_size = len(wordpiece_vocab),
						label_size_tr = len(hierarchy_tr),
						label_size_et = len(hierarchy_et),
						total_wordpieces = total_wordpieces,
						max_seq_len = cf.MAX_SENT_LEN,
						batch_size = cf.BATCH_SIZE)
	model.cuda()

	train(model, data_loaders, word_vocab, wordpiece_vocab, hierarchy_tr, hierarchy_et, ground_truth_triples)

if __name__ == "__main__":
	print('hello')
	main(sys.argv[1:])

ModuleNotFoundError: No module named 'data_utils'

In [4]:
### model.py
import torch
import torch.nn as nn
from config import device

torch.manual_seed(125)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.cuda.manual_seed(125)


class E2EETModel(nn.Module):

	def init_hidden(self):
		# Before we've done anything, we dont have any hidden state.
		# Refer to the Pytorch documentation to see exactly
		# why they have this dimensionality.
		# The axes semantics are (num_layers, minibatch_size, hidden_dim)
		#return (torch.zeros(4, self.batch_size, self.hidden_dim, device=device),
		#		torch.zeros(4, self.batch_size, self.hidden_dim, device=device))

		return (torch.zeros(4, self.batch_size, self.hidden_dim, device=device)) #GRU version

	def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size_tr, label_size_et, total_wordpieces, max_seq_len, batch_size):
		super(E2EETModel, self).__init__()

		self.embedding_dim = embedding_dim
		self.hidden_dim = hidden_dim
		self.vocab_size = vocab_size
		self.label_size_tr = label_size_tr
		self.label_size_et = label_size_et

		#self.layer_1 = nn.Linear(embedding_dim, hidden_dim)
		#self.layer_2 = nn.Linear(hidden_dim, hidden_dim)
		
		#self.dropout = nn.Dropout()
		self.hidden2tag_tr = nn.Linear(hidden_dim * 2, label_size_tr)
		self.hidden2tag_et = nn.Linear(hidden_dim * 2, label_size_et)

		self.hidden2tag2_tr = nn.Linear(label_size_tr, label_size_tr)
		self.hidden2tag2_et = nn.Linear(label_size_et, label_size_et)
		#self.dropout = nn.Dropout(p=0.1)

		self.recurrent_layer_tr = nn.GRU(self.embedding_dim, self.hidden_dim, bidirectional = True, num_layers = 2, dropout = 0.5)

		self.recurrent_layer_et = nn.GRU(self.embedding_dim, self.hidden_dim, bidirectional = True, num_layers = 2, dropout = 0.5)

		self.max_seq_len = max_seq_len
		self.batch_size = batch_size

	def forward(self, batch_x):

		self.hidden_tr = self.init_hidden()
		self.hidden_et = self.init_hidden()

		#batch_x = torch.relu(self.layer_1(batch_x))
		#batch_x = self.dropout(torch.relu(self.layer_1(batch_x)))
		#y_hat = self.hidden2tag(batch_x)


		#seq_lens = []
		#for x in batch_x:
		#	seq_lens.append(batch_x.size()[1])
		#print(batch_x.size())
		#batch_x = torch.cat((batch_x, torch.zeros((self.batch_size - batch_x.size()[0], self.max_seq_len, self.embedding_dim)).to(device)))
		#seq_lens = [100, 100, 0, 0, 0, 0, 0, 0, 0, 0]
		
		# GRU
		batch_tr = torch.nn.utils.rnn.pack_padded_sequence(batch_x, [self.max_seq_len] * self.batch_size, batch_first=True, enforce_sorted=False)
		batch_tr, self.hidden_tr = self.recurrent_layer_tr(batch_tr, self.hidden_tr)
		batch_tr, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_tr, batch_first = True)
		batch_tr = batch_tr.contiguous()
		batch_tr = batch_tr.view(-1, batch_tr.shape[2])


		batch_et = torch.nn.utils.rnn.pack_padded_sequence(batch_x, [self.max_seq_len] * self.batch_size, batch_first=True, enforce_sorted=False)
		batch_et, self.hidden_et = self.recurrent_layer_tr(batch_et, self.hidden_et)
		batch_et, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_et, batch_first = True)
		batch_et = batch_et.contiguous()
		batch_et = batch_et.view(-1, batch_et.shape[2])


		# Feed forward
		#batch = torch.relu(self.layer_1(batch_x))
		#batch = self.dropout(torch.relu(self.layer_1(batch_x)))

		#print(batch.size())

		y_hat_tr = torch.relu(self.hidden2tag_tr(batch_tr))
		y_hat_tr = self.hidden2tag2_tr(y_hat_tr)		
		y_hat_tr = y_hat_tr.view(self.batch_size, self.max_seq_len, self.label_size_tr)


		y_hat_et = torch.relu(self.hidden2tag_et(batch_et))
		y_hat_et = self.hidden2tag2_et(y_hat_et)		
		y_hat_et = y_hat_et.view(self.batch_size, self.max_seq_len, self.label_size_et)


		return y_hat_tr, y_hat_et
		


	def calculate_loss(self, y_hat_tr, y_hat_et, batch_x, batch_y_tr, batch_y_et, batch_z):
		non_padding_indexes = torch.ByteTensor((batch_x > 0))
		loss_fn = nn.BCEWithLogitsLoss()
		loss_tr = loss_fn(y_hat_tr[non_padding_indexes], batch_y_tr[non_padding_indexes])
		loss_et = loss_fn(y_hat_et[non_padding_indexes], batch_y_et[non_padding_indexes])
		return (loss_tr + loss_et) / 2


	# Predict the labels of a batch of wordpieces using a threshold of 0.5.
	def predict_labels(self, preds):
		#preds_s = torch.sigmoid(preds)		
		hits  = (preds > 0).float()
		return hits

	# Evaluate a given batch_x, predicting the labels.
	def evaluate(self, batch_x):
		preds_tr, preds_et = self.forward(batch_x)
		return (self.predict_labels(preds_tr), self.predict_labels(preds_et))



	# Evaluate a given batch_x, but convert the predictions for each wordpiece into the predictions of each token using
	# the token_idxs_to_wp_idxs map.
	def predict_token_labels(self, batch_x, token_idxs_to_wp_idxs):
		preds_tr, preds_et = self.forward(batch_x)
		labels = []
		for preds in (preds_tr, preds_et):

			avg_preds = torch.zeros(list(batch_x.shape)[0], list(batch_x.shape)[1], list(preds.shape)[2])
		
			for i, batch in enumerate(batch_x):
				for j, wp_idxs in enumerate(token_idxs_to_wp_idxs[i]):		
					avg_preds[i][j] = preds[i][wp_idxs].mean(dim=0)

			labels.append(self.predict_labels(avg_preds))
		return labels

# 5.05 vs 4.81 (e2e, filtering)

ModuleNotFoundError: No module named 'torch'

### Generating Knowledge Graphs by Employing Natural Language Processing
* https://arxiv.org/abs/2011.01103
* https://github.com/danilo-dessi/skg

In [5]:
from classes.EntityCleaner import EntityCleaner
from classes.StatisticsRefiner import StatisticsRefiner
from classes.Mapper import Mapper
from classes.Selector import Selector
from classes.CSORelationshipsBuilder import RelationsBuilder
from classes.BestLabelFinder import BestLabelFinder

from gensim.models.keyedvectors import KeyedVectors
import sys
import pandas as pd
import ast
import networkx as nx
import Levenshtein.StringMatcher as ls
import datetime
import nltk
import numpy as np
from scipy import spatial
import operator
import random
import collections


class GraphBuilder:

	def __init__(self, inputFile):
		self.inputFile = inputFile
		self.inputDataFrame = None
		self.inputTexts = None
		self.entities = None
		self.relations = None
		self.g = nx.DiGraph()
		self.validEntities = set()


	def loadData(self):
		self.inputDataFrame = pd.read_csv(self.inputFile)#.head(1000)


	def parse(self):
		self.entities = [ast.literal_eval(e) for e in self.inputDataFrame['entities_column'].tolist()]
		self.relations = [ast.literal_eval(r) for r in self.inputDataFrame['relations_column'].tolist()]
		self.inputTexts = [ast.literal_eval(t) for t in self.inputDataFrame['sentences'].tolist()]

		tmp_input_texts = []
		for paper_number in range(len(self.inputTexts)):
			paper_sentences = []
			for sentence_number in range(len(self.inputTexts[paper_number])):
				sentence = self.inputTexts[paper_number][sentence_number].lower()
				paper_sentences += [sentence]
			tmp_input_texts += [paper_sentences]
		self.inputTexts = tmp_input_texts

		newInputEntities = []
		for eList in self.entities:
			newEList = []
			for eSentence in eList:
				newESentence = []
				for e in eSentence:
					newESentence += [e.lower()]
				newEList += [newESentence]
			newInputEntities += [newEList]
		self.entities = newInputEntities

		newInputRelations = []
		for rList in self.relations:
			newRList = []
			for rSentence in rList:
				newRSentence = []
				for (s,p,o) in rSentence:
					newRSentence += [(s.lower(), p.lower(), o.lower())]
				newRList += [newRSentence]
			newInputRelations += [newRList]
		self.relations = newInputRelations


		
	def removeNoConnectedNodes(self):
		isolated_nodes = [n for n,d in self.g.degree() if d == 0]
		self.g.remove_nodes_from(isolated_nodes)


	def removeSelfEdges(self):
		self.g.remove_edges_from(self.g.selfloop_edges())


	def validate(self):
		allEntities = [] 
		for i in range(len(self.entities)):
			for eList in self.entities[i]:
				allEntities += [e for e in eList]

		allEntities = set(allEntities)	
		refiner = StatisticsRefiner(allEntities, self.entities, self.relations, 2, 10)
		self.validEntities, self.entities,  self.relations = refiner.validate()
		print('Entities after:', len(self.validEntities))


	def cleanEntities(self):
		entityCleaner = EntityCleaner(self.entities, self.relations, self.validEntities)
		entityCleaner.run()
		self.entities = entityCleaner.getEntitiesCleaned()
		self.relations = entityCleaner.getRelationsCleaned()
	 

	#BestLabelFinder module execution
	def build_triples(self):
		finder = BestLabelFinder(self.inputTexts, self.entities, self.relations)
		finder.run()
		return finder.get_triples()


	# Mapping of relations with our taxonomy using Mapper
	def get_mapped_triples(self, triples):
		m = Mapper(triples)
		m.run()
		return m.get_triples()

	def map_on_definitive_triples(self, triples):
		m = Mapper(triples)
		m.map_on_definitive_predicates()
		return m.get_triples()



	def save_pandas(self, triples, destination):
		columns_order = ['s', 'p', 'o', 'source', 'support', 'abstracts']
		data = [{'s' : s, 'p' : p, 'o' : o, 'source' : source, 'support' : support, 'abstracts' : list(set(abstracts))} for (s,p,o, source, support, abstracts) in triples]
		df = pd.DataFrame(data, columns=columns_order)
		df = df[columns_order]
		df.to_csv(destination, sep=';')

	def save_kg(self, triples, destination):
		columns_order = ['s', 'p', 'o']
		data = [{'s' : s, 'p' : p, 'o' : o} for (s,p,o) in triples]
		df = pd.DataFrame(data, columns=columns_order)
		df = df[columns_order]
		df.to_csv(destination, sep=';')



	def build_g(self, selected_triples):

		id_gen = 0
		entity2id = {}
		id2entity = {}

		# a single id to each entity
		for (s,p,o, source, support, abstracts) in selected_triples:
			if s not in entity2id:
				entity2id[s] = id_gen
				id2entity[id_gen] = s
				self.g.add_node(id_gen, label=s)
				id_gen += 1
			if o not in entity2id:
				entity2id[o] = id_gen
				id2entity[id_gen] = o
				self.g.add_node(id_gen, label=o)
				id_gen += 1

		# graph generation
		for (s,p,o, source, support, abstracts) in selected_triples:
			idS = entity2id[s]
			idO = entity2id[o]
			self.g.add_edge(idS, idO, label=p, support=support, source=source)

		
	def pipeline(self):

		print('# LOAD AND PARSE DATA')
		print(str(datetime.datetime.now()))
		self.loadData()
		self.parse()
		print()


		print('# ENTITIES VALIDATION')
		print(str(datetime.datetime.now()))
		self.validate()
		print()

		print('# ENTITIES CLEANING')
		print(str(datetime.datetime.now()))
		self.cleanEntities()


		print('# TRIPLES GENERATION')
		print(str(datetime.datetime.now()))
		triples = self.build_triples()
		print('Number of triples:', len(triples))


		print('# TRIPLES MAPPING')
		print(str(datetime.datetime.now()))
		triples = self.get_mapped_triples(triples)
		print('Number of triples:', len(triples))
		self.save_pandas(triples, 'out/all_triples.csv')


		
		print('# TRIPLES SELECTION')
		print(str(datetime.datetime.now()))
		s = Selector(triples)
		s.run()
		selected_triples = s.get_selected_triples()
		discarded_triples = s.get_discarded_triples()
		print('Number of selected triples:', len(selected_triples))
		selected_triples = self.map_on_definitive_triples(selected_triples)
		discarded_triples = self.map_on_definitive_triples(discarded_triples)
		self.save_pandas(selected_triples, 'out/selected_triples.csv')
		self.save_pandas(discarded_triples, 'out/discarded_triples.csv')

		print('GRAPH BUILDING')
		print(str(datetime.datetime.now()))
		self.build_g(selected_triples)
		rb = RelationsBuilder(self.g)
		rb.run()

		new_triples_from_CSO = rb.get_triples()


		# select only subjet, predicte, and object information for the kg
		kg_triples = [(s,p,o) for (s,p,o,support, source, abstracts) in selected_triples]
		kg_triples += new_triples_from_CSO
		kg_triples = set(kg_triples)
		print('### KG Triples:', len(kg_triples), '###')
		self.save_kg(kg_triples, 'triples.csv')

		#self.removeNoConnectedNodes()
		#self.removeSelfEdges()
		#nx.write_graphml(self.g, 'kg.graphml')

		


if __name__ == '__main__':
	builder = GraphBuilder('csv_e_r_full.csv')
	builder.pipeline()












ModuleNotFoundError: No module named 'classes'

### T2KG: An End-to-End System for Creating Knowledge Graph from Unstructured Text

* https://files.stample.com/browserUpload/9e312f19-d85a-48e5-8b96-73bf59d96c98

* 

### Multi-Task Identification of Entities, Relations, and Coreference for Scientific Knowledge Graph Construction

* https://paperswithcode.com/paper/multi-task-identification-of-entities

* 


In [6]:
import json
import pdb
from operator import itemgetter
def ReadJson(senfn, refn, nerfn , docs):
    with open(senfn) as f:
        docs_sent = [json.loads(jsonline) for jsonline in f.readlines()]
    with open(refn) as f:
        docs_re = [json.loads(jsonline) for jsonline in f.readlines()]
    with open(nerfn) as f:
        docs_ner = [json.loads(jsonline) for jsonline in f.readlines()]
    for i in range(len(docs_sent)):
        if docs_sent[i]['doc_key'] != docs_re[i]['doc_key'] or docs_sent[i]['doc_key'] != docs_re[i]['doc_key']:
            pdb.set_trace()
        else:
            year = docs_sent[i]['doc_key'].split('_')[1]
            venue = docs_sent[i]['doc_key'].split('_')[0]
            docs[docs_sent[i]['doc_key']] = {'ner': docs_ner[i]['ner'], 'relation': docs_re[i]['relation'], 'coref':docs_re[i]['coref'], 'sentences': docs_sent[i]['sentences'], 'year': year, 'venue':venue}
            PropgateHyponym(docs[docs_sent[i]['doc_key']])
            Map2doc(docs[docs_sent[i]['doc_key']])


def ReadJsonACL(senfn, docs):
    with open(senfn) as f:
        docs_sent = [json.loads(jsonline) for jsonline in f.readlines()]

    for i in range(len(docs_sent)):
            year = docs_sent[i]['doc_key'].split('_')[0][1:]
            if year.startswith('0') or year.startswith('1'):
                year = '20' + year
            elif year.startswith('9') or year.startswith('8') or year.startswith('7'):
                year = '19' + year
            
            if len(year) > 4:pdb.set_trace()
            venue = docs_sent[i]['doc_key'].split('_')[0]
            try:
                year =int(year)
            except:
                print docs_sent[i]['doc_key']
                return
            docs[docs_sent[i]['doc_key']] = {'ner': docs_sent[i]['ner'], 'relation': docs_sent[i]['relations'], 'coref':[], 'sentences': docs_sent[i]['sentences'], 'year': year, 'venue':'ACL'}

            Map2doc(docs[docs_sent[i]['doc_key']])

def ReadJsonTitle(senfn, nerfn , docs):
    with open(senfn) as f:
        docs_sent = [json.loads(jsonline) for jsonline in f.readlines()]

    with open(nerfn) as f:
        docs_ner = [json.loads(jsonline) for jsonline in f.readlines()]
    for i in range(len(docs_sent)):
        if docs_sent[i]['doc_key'] != docs_ner[i]['doc_key']:
            pdb.set_trace()
        else:
            year = docs_sent[i]['doc_key'].split('_')[1]
            venue = docs_sent[i]['doc_key'].split('_')[0]
            relations = CreateRelation(docs_ner[i]['ner'])
            docs[docs_sent[i]['doc_key']] = {'ner': docs_ner[i]['ner'],'relation': relations, 'sentences': docs_sent[i]['sentences'], 'coref':[], 'year': year, 'venue':venue}

            # PropgateHyponym(docs[docs_sent[i]['doc_key']])
            Map2doc(docs[docs_sent[i]['doc_key']])

def CreateRelation(sents):
    all_rel = []
    for sent in sents:
        nerdir = {}
        for ner in sent:
            if ner[2] not in nerdir:
                nerdir[ner[2]] = []
            nerdir[ner[2]].append(ner[:2])
        rels = []
        if 'Task' in nerdir and 'Method' in nerdir:
            for span1 in nerdir['Task']:
                for span2 in nerdir['Method']:
                    rel = span2 + span1 + ['USED-FOR']
                    rels.append(rel)
        all_rel.append(rels)
    return all_rel
            
def PropgateHyponym(doc):
    for i, sent in enumerate(doc['relation']):
        hyp_dir = {}
        new_rels = []
        rel_span_sets = set()
        for relation in sent:
            if relation[-1] == 'HYPONYM-OF':

                if tuple(relation[2:4]) in hyp_dir:
                    hyp_dir[tuple(relation[2:4])].append(tuple(relation[:2]))
                else:
                    hyp_dir[tuple(relation[2:4])] = [tuple(relation[:2])]
            rel_span_sets.add(tuple(relation[:4]))
            rel_span_sets.add(tuple(relation[2:4] + relation[:2]))
        if hyp_dir:
            for relation in sent:
                if relation[-1] == 'HYPONYM-OF':continue
                span1 = tuple(relation[:2])
                span2 = tuple(relation[2:4])
                rel = relation[-1]
                if span1 in hyp_dir:
                    for new_span in hyp_dir[span1]:
                        new_rel_span = list(new_span) + list(span2)
                        if tuple(new_rel_span) not in rel_span_sets:
                            new_rels.append(new_rel_span + [rel])
                if span2 in hyp_dir:
                    for new_span in hyp_dir[span2]:
                        new_rel_span = list(span1) + list(new_span)
                        if tuple(new_rel_span) not in rel_span_sets:
                            new_rels.append(new_rel_span + [rel])
            if new_rels:
                doc['relation'][i] += new_rels
def Map2doc(doc):
    flat_sentences = []
    flat_sentences_id = []
    flat_token_id = []
    map_token_id = []
    i = 0
    flat_ners = []
    flat_relations = []
    for idx, sent in enumerate(doc['sentences']):
        flat_sentences += sent
        sentids = []
        for word in sent:
            sentids.append(i)
            i += 1
        map_token_id.append(sentids)
        for ner in doc['ner'][idx]:
            start = sentids[ner[0]]
            end = sentids[ner[1]]
            flat_ners.append([start,end,ner[2]])
        for relation in doc['relation'][idx]:
            start1 = sentids[relation[0]]
            end1 = sentids[relation[1]]
            start2 = sentids[relation[2]]
            end2 = sentids[relation[3]]
            flat_relations.append([start1,end1,start2,end2,relation[-1]])
    doc['sentences'] = flat_sentences
    doc['relation'] = flat_relations
    doc['ner'] = flat_ners
    BuildKG(doc)

def BuildKG(doc):
    NERdir = {}
    RELdir = {}
    for ner in doc['ner']:
        phrase = ' '.join(doc['sentences'][ner[0]:(ner[1]+1)])
        if phrase == 'system' or phrase == 'systems':
            ner[2] = 'Generic'
        
        NERdir[(ner[0],ner[1])] = [ner[2], phrase]
    for relation in doc['relation']:
        start1 = relation[0]
        end1 = relation[1]
        start2 = relation[2]
        end2 = relation[3]
        ntype1 = 'None'
        ntype2 = 'None'
        phrase1 = ' '.join(doc['sentences'][start1:(end1+1)])
        phrase2 = ' '.join(doc['sentences'][start2:(end2+1)])
        if (start1, end1) in NERdir:
            ntype1 = NERdir[(start1, end1)][0]
        else:
            for ner in NERdir:
                if (start1 > ner[0] and start1 < ner[1]) or (end1 > ner[0] and end1 < ner[1]):
                    ntype1 = NERdir[ner][0]+'_partial'
                break
        if (start2, end2) in NERdir:
            ntype2 = NERdir[(start2, end2)][0]
        else:
            for ner in NERdir:
                if (start2 > ner[0] and start2 < ner[1]) or (end2 > ner[0] and end2 < ner[1]):
                    ntype2 = NERdir[ner][0]+'_partial'
                break
        RELdir[(start1, end1,start2, end2)] = [relation[-1],(ntype1,ntype2),(phrase1, phrase2)]
    

            

    doc['NERdir'] = NERdir
    doc['RELdir'] = RELdir

    
def sort_dict(dictionary):
    sorted_dct = sorted(dictionary.items(), key=itemgetter(1),reverse=True)
    return sorted_dct


def ReadTopLst(venue_type, topnum):
    top_dir = {}
    top_dir_len = {}
    top_dir_count = {}
    types = ['Method','Task']

    for ntype in types:
        fn = './NER_analy/' + venue_type + '_' + ntype + '.rank'
        i = 0
        for line in open(fn):
            if i > topnum:break
            phrase, count = line.rstrip().split('\t')
            newphrase = phrase.replace('-','').replace(' ','')
            i += 1
            if newphrase not in top_dir_count or count > top_dir_count[newphrase][0]:
                top_dir_len[phrase] = len(newphrase)
                top_dir[phrase] = newphrase
                top_dir_count[newphrase] = [count, phrase]
            else:
                continue
    top_dir_len = sort_dict(top_dir_len)
    toplst = []
    for token in top_dir_len:
        phrase = token[0]
        toplst.append([top_dir[phrase], phrase])
    return toplst
        
def ReadTopLsts(venue_types, topnum, acronym_dir):
    top_dir = {}
    top_dir_len = {}
    top_dir_count = {}
    types = ['Method','Task']
    for venue_type in venue_types:
        for ntype in types:
            fn = './NER_analy/' + venue_type + '_' + ntype + '.rank'
            i = 0
            for line in open(fn):
                if i > topnum:break
                phrase, count = line.rstrip().split('\t')
                words = []
                for word in phrase.split():
                    if word.isupper() and word in acronym_dir and len(phrase.split()) > 1:
                        words.append(acronym_dir[word])
                    else:
                        words.append(word)
                phrase = ' '.join(words)
                newphrase = phrase.replace('-','').replace(' ','')
                i += 1
                if newphrase not in top_dir_count or count > top_dir_count[newphrase][0]:
                    top_dir_len[phrase] = len(newphrase)
                    top_dir[phrase] = newphrase
                    top_dir_count[newphrase] = [count, phrase]
                else:
                    continue
    top_dir_len = sort_dict(top_dir_len)
    toplst = []
    for token in top_dir_len:
        phrase = token[0]
        toplst.append([top_dir[phrase], phrase])
    return toplst
        
        
        
            
def NormalizedLst(docs, aspects='None', aspect_values = 'None'):
    # aspects = ['year','venue'], aspect_values = [set('1988','2000'),set('EMNLP','ACL')]
    ner_rankdir = {}
    venue_set = set()
    for doc_key in docs:
        doc = docs[doc_key]
        key = False
        if aspects != 'None':
            for i in range(len(aspects)):
                aspect = aspects[i]
                aspect_value = aspect_values[i]
                if doc[aspect] not in aspect_value:
                    key = True
                    break
        if key:continue
        venue_set.add(doc['year'] + '_' + doc['venue'])
        for span in doc['NERdir']:
            phrase = doc['NERdir'][span]
            if phrase[0] == 'Generic':continue
            if phrase[1].endswith('system') or phrase[1].endswith('systems'):
                phrase[0] = 'Task'
            if phrase[0] not in ner_rankdir:
                ner_rankdir[phrase[0]] = {}
            if phrase[1] not in ner_rankdir[phrase[0]]:
                ner_rankdir[phrase[0]][phrase[1]] = 0
            ner_rankdir[phrase[0]][phrase[1]] += 1
    ner_rankdir = CombineDir(ner_rankdir)
    acronym_dir = GetAcronymDirNoType(ner_rankdir)
    rankdir = FilterNERNotype(ner_rankdir,acronym_dir)
    return rankdir, acronym_dir
    # for ner_type in ner_rankdir:
    #     ner_rankdir[ner_type] = sort_dict(ner_rankdir[ner_type])


def NormalizePhrase(phrase, rankdir, acronym_dir, toplst):
    replacewords = set(['model','approach','method','algorithm','technique','module', 'application','models','approachs','methods','algorithms','techniques','modules', 'applications', 'problem','problems','task','tasks', 'system', 'systems', 'score', 'scores','framework','frameworks', 'design', 'designs', 'formulation'])
    words = []
    phrase = phrase.split()
    for word in phrase:
        if word in acronym_dir and word not in set(['-LRB-','-RRB-']):
            word = acronym_dir[word]
        words.append(word)
    phrase = ' '.join(words)
    if '-LRB-' in phrase and '-RRB-' in phrase:
    # if '-LRB-' in phrase and phrase.endswith('-RRB-'):
        full = phrase.split('-LRB-')[0]
        full = Lower(full)
        
        if full in acronym_dir:
            full = acronym_dir[full]
        fullnorm = full.replace('-','').replace(' ','')
        for norm in toplst:
            if norm[0] in fullnorm:
                return norm[1]
        lastword = full.split()[-1]
        if lastword in replacewords:
            full = ' '.join(full.split()[:-1])
            if full in acronym_dir:
                full = acronym_dir[full]

                
        if full in rankdir:
            return full
        if full.endswith('s'):
            if full[:-1] in rankdir:
                return full[:-1]
        if full.endswith('es'):
            if full[:-2] in rankdir:
                return full[:-2]
        return full
    else:
        full = Lower(phrase)
        if full in acronym_dir:
            full = acronym_dir[full]
        fullnorm = full.replace('-','').replace(' ','')
        for norm in toplst:
            if norm[0] in fullnorm:
                return norm[1]
                                            
        lastword = full.split()[-1]
        # if lastword in replacewords:
        #     newfull = ' '.join(full.split()[:-1])
        #     if newfull in rankdir:
        #         return newfull
        if lastword in replacewords:
            full = ' '.join(full.split()[:-1])

        if full.endswith('s'):
            if full[:-1] in rankdir:
                
                return full[:-1]
        if full.endswith('es'):
            if full[:-2] in rankdir:
                return full[:-2]
        if full in rankdir:
            return full
        return full
        # acronym = Lower(acronym)
    
    
    
def topNER(docs, aspects='None', aspect_values = 'None'):
    # aspects = ['year','venue'], aspect_values = [set('1988','2000'),set('EMNLP','ACL')]
    ner_rankdir = {}
    venue_set = set()
    for doc_key in docs:
        doc = docs[doc_key]
        key = False
        if aspects != 'None':
            for i in range(len(aspects)):
                aspect = aspects[i]
                aspect_value = aspect_values[i]
                if doc[aspect] not in aspect_value:
                    key = True
                    break
        if key:continue
        venue_set.add(doc['year'] + '_' + doc['venue'])
        for span in doc['NERdir']:
            phrase = doc['NERdir'][span]
            if phrase[0] == 'Generic':continue
            phrase[1] = Lower(phrase[1])
            if phrase[1].endswith('system') or phrase[1].endswith('systems'):
                phrase[0] = 'Task'
            words = phrase[1].split(' ')
            if phrase[0] not in ner_rankdir:
                ner_rankdir[phrase[0]] = {}
            if phrase[1] not in ner_rankdir[phrase[0]]:
                ner_rankdir[phrase[0]][phrase[1]] = 0
            ner_rankdir[phrase[0]][phrase[1]] += 1
    acronym_dir = GetAcronymDir(ner_rankdir)
    ner_rankdir = FilterNER(ner_rankdir,acronym_dir)

    for ner_type in ner_rankdir:
        ner_rankdir[ner_type] = sort_dict(ner_rankdir[ner_type])

    print venue_set
    return ner_rankdir

def CombineDir(dct):
    new_dct = {}
    for aspect in dct:
        for phrase in dct[aspect]:
            if phrase in new_dct:
                new_dct[phrase] += dct[aspect][phrase]
            else:
                new_dct[phrase] = dct[aspect][phrase]
    return new_dct
def GetAcronymDir(ner_rankdir):
    acronym_counts = {}
    for ner_type in ner_rankdir:
        for key in ner_rankdir[ner_type]:
            # if '-LRB-' in key and key.endswith('-RRB-'):
            if '-LRB-' in key and '-RRB-' in key:
                full = key.split('-LRB-')[0]
                acronym = key.split('-LRB-')[1].split('-RRB-')[0]
                if len(full) < len(acronym):
                    full,acronym = acronym,full
                full = Lower(full)
                acronym = Lower(acronym)
                if acronym not in acronym_counts:
                    acronym_counts[acronym] = {full:1}
                else:
                    if full in acronym_counts[acronym]:
                        acronym_counts[acronym][full] += 1
                    else:
                        acronym_counts[acronym][full] = 1
    new_acronym_counts = {}
    for acronym in acronym_counts:
            # acronym_counts[acronym] = sort_dict(acronym_counts[acronym])[0][0]
        acronym_counts[acronym] = MergePlural(acronym_counts[acronym])
        sorted_result = sort_dict(acronym_counts[acronym])[0]
        if sorted_result[1] < 2:
            continue
        else:
            new_acronym_counts[acronym] = sorted_result[0]

    return new_acronym_counts

def GetAcronymDirNoType(ner_rankdir):
    acronym_counts = {}

    for key in ner_rankdir:
            if '-LRB-' in key and '-RRB-' in key:
                full = key.split('-LRB-')[0]
                acronym = key.split('-LRB-')[1].split('-RRB-')[0]
                if len(full) < len(acronym):
                    full,acronym = acronym,full
                full = Lower(full)
                acronym = Lower(acronym)
                if acronym not in acronym_counts:
                    acronym_counts[acronym] = {full:1}
                else:
                    if full in acronym_counts[acronym]:
                        acronym_counts[acronym][full] += 1
                    else:
                        acronym_counts[acronym][full] = 1
    new_acronym_counts = {}
    for acronym in acronym_counts:
            # acronym_counts[acronym] = sort_dict(acronym_counts[acronym])[0][0]
        acronym_counts[acronym] = MergePlural(acronym_counts[acronym])
        sorted_result = sort_dict(acronym_counts[acronym])[0]
        if sorted_result[1] < 2:
            continue
        else:
            new_acronym_counts[acronym] = sorted_result[0]

    return new_acronym_counts
                
def MergePlural(mix_dict):
    merged_dict = {}
    plural_mappings = {}
    for key in mix_dict:
        if key + 's' in mix_dict:
            plural_mappings[key+'s'] = key
            continue
        if key + 'es' in mix_dict:
            plural_mappings[key+'es'] = key
            continue
    for key in mix_dict:
        if key in plural_mappings:
            new_key = plural_mappings[key]
            if new_key in merged_dict:
                merged_dict[new_key] += mix_dict[key]
            else:
                merged_dict[new_key] = mix_dict[key]
        else:
            new_key = key
            if new_key in merged_dict:
                merged_dict[new_key] += mix_dict[key]
            else:
                merged_dict[new_key] = mix_dict[key]
            
    return merged_dict


def FilterNER(ner_rankdir, acronym_dir):
    mappings_all = {}
    acro_mappings = {}
    plural_mappings = {}
    for ner_type in ner_rankdir:
        for key1 in ner_rankdir[ner_type]:
            # if '-LRB-' in key1 and key1.endswith('-RRB-'):
            if '-LRB-' in key1 and '-RRB-' in key1:
                full = key1.split('-LRB-')[0]
                acronym = key1.split('-LRB-')[1].split('-RRB-')[0]
                mappings_all[key1] = Lower(full)
                if len(full) < len(acronym):
                    full,acronym = acronym,full
                acro_mappings[full] = acronym
    new_ner_rankdir = {}
    # replace all bracket phrases with their full name
    for ner_type in ner_rankdir:
        new_ner_rankdir[ner_type] = {}
        for key1 in ner_rankdir[ner_type]:
            if key1 in mappings_all:
                if mappings_all[key1] in new_ner_rankdir[ner_type]:
                    new_ner_rankdir[ner_type][mappings_all[key1]] += ner_rankdir[ner_type][key1]
                else:
                    if key1 in ner_rankdir[ner_type]:
                        new_ner_rankdir[ner_type][mappings_all[key1]] = ner_rankdir[ner_type][key1]
                    
            else:
                if key1 in new_ner_rankdir[ner_type]:
                    new_ner_rankdir[ner_type][key1] += ner_rankdir[ner_type][key1]
                else:
                    if key1 in ner_rankdir[ner_type]:
                        new_ner_rankdir[ner_type][key1] = ner_rankdir[ner_type][key1]
                    

    for ner_type in new_ner_rankdir:
        for key1 in new_ner_rankdir[ner_type]:
            if key1 + 's' in new_ner_rankdir[ner_type]:
                plural_mappings[key1+'s'] = key1
                continue
            if key1 + 'es' in new_ner_rankdir[ner_type]:
                plural_mappings[key1+'es'] = key1
                continue


    ner_rankdir_final = {}
    for ner_type in new_ner_rankdir:
        ner_rankdir_final[ner_type] = {}
        for key in new_ner_rankdir[ner_type]:
            new_key = Lower(key)
            if new_key in plural_mappings:
                single_term = plural_mappings[new_key]
                single_term = single_term
                if single_term in acronym_dir:
                    single_term = acronym_dir[single_term]
                if single_term in ner_rankdir_final[ner_type]:
                        ner_rankdir_final[ner_type][single_term] += new_ner_rankdir[ner_type][key]
                        
                else:
                        ner_rankdir_final[ner_type][single_term] = new_ner_rankdir[ner_type][key]

            else:
                lower_key = Lower(key)
                if lower_key in acronym_dir:
                    lower_key = acronym_dir[lower_key]
                    if lower_key in plural_mappings:
                        lower_key = plural_mappings[lower_key]
                    
                if lower_key in ner_rankdir_final[ner_type]:
                    ner_rankdir_final[ner_type][lower_key] += new_ner_rankdir[ner_type][key]
                else:
                    ner_rankdir_final[ner_type][lower_key] = new_ner_rankdir[ner_type][key]
                    
    # pdb.set_trace()
    rankdir = {}
    replacewords = set(['model','approach','method','algorithm','technique','module', 'application','models','approachs','methods','algorithms','techniques','modules', 'applications', 'problem','problems','task','tasks', 'system', 'systems', 'score', 'scores', 'framework','frameworks','design', 'designs'])
    for ner_type in ner_rankdir_final:
        rankdir[ner_type] = {}
        for key in ner_rankdir_final[ner_type]:
            if not key:continue
            words = key.split()
            if words[-1] in replacewords:
                if len(words) == 1:continue 
                new_phrase = ' '.join(words[:-1])
                if new_phrase in acronym_dir:
                    new_phrase = acronym_dir[new_phrase]
                # if new_phrase not in rankdir[ner_type]:
                #     rankdir[ner_type][new_phrase] = 0
                # rankdir[ner_type][new_phrase] += ner_rankdir_final[ner_type][key]
                if new_phrase in ner_rankdir_final[ner_type]:
                    if new_phrase not in rankdir[ner_type]:
                        rankdir[ner_type][new_phrase] = 0
                    rankdir[ner_type][new_phrase] += ner_rankdir_final[ner_type][key]
                else:
                    if key not in rankdir[ner_type]:
                        rankdir[ner_type][key] = 0
                    rankdir[ner_type][key] += ner_rankdir_final[ner_type][key]
                        
            else:
                if key not in rankdir[ner_type]:
                    rankdir[ner_type][key] = 0
                rankdir[ner_type][key] += ner_rankdir_final[ner_type][key]

    return rankdir

def FilterNERNotype(ner_rankdir, acronym_dir):
    mappings_all = {}
    acro_mappings = {}
    plural_mappings = {}
    for key1 in ner_rankdir:
            if '-LRB-' in key1 and '-RRB-' in key1:
                full = key1.split('-LRB-')[0]
                acronym = key1.split('-LRB-')[1].split('-RRB-')[0]
                
                mappings_all[key1] = Lower(full)
                
    new_ner_rankdir = {}
    # replace all bracket phrases with their full name

    for key1 in ner_rankdir:
            if key1 in mappings_all:
                if mappings_all[key1] in new_ner_rankdir:
                    new_ner_rankdir[mappings_all[key1]] += ner_rankdir[key1]
                else:
                    if key1 in ner_rankdir:
                        new_ner_rankdir[mappings_all[key1]] = ner_rankdir[key1]
                    
            else:
                if key1 in new_ner_rankdir:
                    new_ner_rankdir[key1] += ner_rankdir[key1]
                else:
                    if key1 in ner_rankdir:
                        new_ner_rankdir[key1] = ner_rankdir[key1]
                    


    for key1 in new_ner_rankdir:
            if key1 + 's' in new_ner_rankdir:
                plural_mappings[key1+'s'] = key1
            if key1 + 'es' in new_ner_rankdir:
                plural_mappings[key1+'es'] = key1



    ner_rankdir_final = {}
    for key in new_ner_rankdir:
            new_key = Lower(key)
            if new_key in plural_mappings:
                single_term = plural_mappings[new_key]
                single_term = Lower(single_term)
                if single_term in acronym_dir:
                    single_term = acronym_dir[single_term]
                if single_term in ner_rankdir_final:
                        ner_rankdir_final[single_term] += new_ner_rankdir[key]
                        
                else:
                        ner_rankdir_final[single_term] = new_ner_rankdir[key]
            else:
                lower_key = new_key
                if lower_key in acronym_dir:
                    lower_key = acronym_dir[lower_key]
                    if lower_key in plural_mappings:
                        lower_key = plural_mappings[lower_key]
                if lower_key in ner_rankdir_final:
                    ner_rankdir_final[lower_key] += new_ner_rankdir[key]
                else:
                    ner_rankdir_final[lower_key] = new_ner_rankdir[key]
                    

    return ner_rankdir_final

def Lower(string):
    words = string.split()
    new_string = []
    for word in words:
        if not word:continue
        # if not word.isupper():
        #     new_string.append(word.lower())
        # else:
        #     new_string.append(word)
        if word[:1].isupper() and word[1:].islower():
            new_string.append(word.lower())
        else:
            new_string.append(word)
    return ' '.join(new_string)
            
def CountMissingNER(docs):
    allnum = 0
    overlapnum = 0
    nonnum = 0
    ner_labels = ["Task", "Generic", "Metric", "Material", "OtherScientificTerm", "Method"]
    for doc in docs:
        for rel in docs[doc]['RELdir']:
            rel = docs[doc]['RELdir'][rel]
            if 'None' == rel[1][0]:
                nonnum += 1
                # print rel[2][0]
            elif 'partial' in rel[1][0]:
                overlapnum += 1
            elif rel[1][0] not in ner_labels:
                print rel[1][0]
            if 'None' in rel[1][1]:
                nonnum += 1
            elif 'partial' in rel[1][1]:
                overlapnum += 1
            elif rel[1][1] not in ner_labels:
                print rel[1][1]
            # if rel[1][0] in ner_labels:
            #     if rel[1][0] == 'Task':
            #         print rel[2][0]
            allnum += 2
    print allnum
    print overlapnum
    print nonnum
    print float(nonnum)/allnum
    print float(overlapnum)/allnum

def PrintK(dictionary, aspect, k , name):
    strings = []
    k = min(k,len(dictionary[aspect]))
    for i in range(k):
        strings.append(dictionary[aspect][i][0] + '\t' + str(dictionary[aspect][i][1]))
    fid = open('./NER_analy/' + name + '_'+ aspect + '.rank','w')
    fid.write('\n'.join(strings).encode('utf-8'))
    fid.close()
    

def VoteRelationType(rel_dir_phrase):
    phrase_rel_dir = {}
    phrase_ner_dir = {}
    phrase_count = {}
    for rel in rel_dir_phrase:
        for aspect in rel_dir_phrase[rel]:

            for phrase in rel_dir_phrase[rel][aspect]:
                if phrase in phrase_count:
                    phrase_count[phrase] += rel_dir_phrase[rel][aspect][phrase]
                else:
                    phrase_count[phrase] = rel_dir_phrase[rel][aspect][phrase]
                if phrase not in phrase_rel_dir:
                    phrase_rel_dir[phrase] = {}
                if rel not in phrase_rel_dir[phrase]:
                    phrase_rel_dir[phrase][rel] = 0
                if phrase not in phrase_ner_dir:
                    phrase_ner_dir[phrase] = {}
                if aspect not in phrase_ner_dir[phrase]:
                    phrase_ner_dir[phrase][aspect] = 0
                phrase_rel_dir[phrase][rel] += rel_dir_phrase[rel][aspect][phrase]
                phrase_ner_dir[phrase][aspect] += rel_dir_phrase[rel][aspect][phrase]
                    

    new_dict = {}
    for phrase in phrase_count:
        phrase_rel_dir[phrase] = sort_dict(phrase_rel_dir[phrase])
        phrase_ner_dir[phrase] = sort_dict(phrase_ner_dir[phrase])
        rel = phrase_rel_dir[phrase][0][0]
        aspect = phrase_ner_dir[phrase][0][0]
        if len(phrase_rel_dir[phrase])> 1:
            if rel == 'CONJUNCTION':
                rel = phrase_rel_dir[phrase][1][0]
        if len(phrase_ner_dir[phrase])> 1:
            if aspect == 'None' or aspect == 'OtherScientificTerm':
                aspect = phrase_ner_dir[phrase][1][0]
            if aspect == 'None' or aspect == 'OtherScientificTerm':
                if len(phrase_ner_dir[phrase])> 2:
                    aspect = phrase_ner_dir[phrase][2][0]
                
        # for token in phrase_dir[phrase]:
        #     if ('USED-FOR', 'Task') == token[0]:
        #         (rel,aspect) = ('USED-FOR', 'Task')
        #     elif ('USED-FOR_Reverse', 'Method') == token[0]:
        #         (rel,aspect) = ('USED-FOR_Reverse', 'Method')
                

        if rel not in new_dict:
            new_dict[rel] = {}
        if aspect not in new_dict[rel]:
            new_dict[rel][aspect] = {}
        new_dict[rel][aspect][phrase] = phrase_count[phrase]

    return new_dict
        
    
    
    
# docs = {}
# for i in range(16):
#     senfn = '/homes/luanyi/pubanal/data/AI2/json/'+str(i)+'.json'
#     nerfn = '/homes/luanyi/pubanal/data/AI2/automatic_graph/results/ner_outputs/'+str(i)+'.output.json'
#     refn = '/homes/luanyi/pubanal/data/AI2/automatic_graph/results/re_outputs/'+str(i)+'.output.json'
#     ReadJson(senfn, refn, nerfn, docs)
    
# venue_sets = [set(['ICASSP','INTERSPEECH']), set(['AAAI','IJCAI']), set(['ACL','EMNLP','IJCNLP']), set(['ECCV','CVPR','ICCV']), set(['NIPS','ICML'])]
# speech = set(['ICASSP','INTERSPEECH'])
# AI = set(['AAAI','IJCAI'])
# NLP = set(['ACL','EMNLP','IJCNLP'])
# CV = set(['ECCV','CVPR','ICCV'])
# ML = set(['NIPS','ICML'])
# # speech = topNER(docs, ['venue'], [venue_sets[0]])
# # AI = topNER(docs, ['venue'], [venue_sets[1]])
# # NLP = topNER(docs, ['venue'], [venue_sets[2]])
# # CV = topNER(docs, ['venue'], [venue_sets[3]])
# # ML = topNER(docs, ['venue'], [venue_sets[4]])
# b00 = set([str(i) for i in range(1990,2001)])
# p00_to_05 = set([str(i) for i in range(2001,2006)])
# p06_to_10 = set([str(i) for i in range(2006,2011)])
# p10_to_17 = set([str(i) for i in range(2011,2017)])
# # p1 = topNER(docs, ['year'], [b00])
# # p2 = topNER(docs, ['year'], [p00_to_05])
# # p3 = topNER(docs, ['year'], [p06_to_10])
# # p4 = topNER(docs, ['year'], [p10_to_17])
# # p1_speech = topNER(docs, ['year','venue'], [b00,venue_sets[0]])
# # p2_speech = topNER(docs, ['year','venue'], [p00_to_05, venue_sets[0]])
# # p3_speech = topNER(docs, ['year','venue'], [p06_to_10, venue_sets[0]])
# p4_speech = topNER(docs, ['year','venue'], [p10_to_17, venue_sets[0]])
# # p1_ai = topNER(docs, ['year','venue'], [b00,venue_sets[1]])
# # p2_ai = topNER(docs, ['year','venue'], [p00_to_05, venue_sets[1]])
# # p3_ai = topNER(docs, ['year','venue'], [p06_to_10, venue_sets[1]])
# # p4_ai = topNER(docs, ['year','venue'], [p10_to_17, venue_sets[1]])
# # p1_NLP = topNER(docs, ['year','venue'], [b00,venue_sets[2]])
# # p2_NLP = topNER(docs, ['year','venue'], [p00_to_05, venue_sets[2]])
# # p3_NLP = topNER(docs, ['year','venue'], [p06_to_10, venue_sets[2]])
# # p4_NLP = topNER(docs, ['year','venue'], [p10_to_17, venue_sets[2]])
# # p1_cv = topNER(docs, ['year','venue'], [b00,venue_sets[3]])
# # p2_cv = topNER(docs, ['year','venue'], [p00_to_05, venue_sets[3]])
# # p3_cv = topNER(docs, ['year','venue'], [p06_to_10, venue_sets[3]])
# # p4_cv = topNER(docs, ['year','venue'], [p10_to_17, venue_sets[3]])
# # p1_ml = topNER(docs, ['year','venue'], [b00,venue_sets[4]])
# # p2_ml = topNER(docs, ['year','venue'], [p00_to_05, venue_sets[4]])
# # p3_ml = topNER(docs, ['year','venue'], [p06_to_10, venue_sets[4]])
# # p4_ml = topNER(docs, ['year','venue'], [p10_to_17, venue_sets[4]])
# years = {'p1':b00, 'p2':p00_to_05, 'p3':p06_to_10, 'p4':p10_to_17}
# venues = {'speech':speech, 'AI':AI, 'NLP':NLP, 'CV':CV, 'ML':ML}
# ner_types = ["Task", "Generic", "Metric", "Material", "OtherScientificTerm", "Method"]
# # rankdir, acronym_dir = NormalizedLst(docs, ['year','venue'], [years['p4'], venues['speech']])
# # phrase = 'Gaussian mixture models'
# # NormalizePhrase(phrase, rankdir, acronym_dir)

# # pdb.set_trace()
# print 'aa'
# # topNER(docs, ['year','venue'], [years['p4'], venues['speech']])
# print 'finish'
# for year_key in years:
#     year = years[year_key]
#     print year
#     for venue_key in venues:
#         venue = venues[venue_key]
#         x = topNER(docs, ['year','venue'], [year, venue])
#         for ner_type in ner_types:
#             PrintK(x,ner_type,1000, venue_key + '_' + year_key)
        
# for year_key in years:
#     print year_key
#     year = years[year_key]
#     x = topNER(docs, ['year'], [year])
#     for ner_type in ner_types:
#         PrintK(x,ner_type,1000, year_key)
# for venue_key in venues:
#     venue = venues[venue_key]
#     x = topNER(docs, ['venue'], [venue])
#     for ner_type in ner_types:
#         PrintK(x,ner_type,1000, venue_key)
# # CountMissingNER(docs)


SyntaxError: Missing parentheses in call to 'print'. Did you mean print(docs_sent[i]['doc_key'])? (Temp/ipykernel_4652/260256666.py, line 38)