## 1. Parse the xml file
Extract entity and entity context from xml file

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import csv
import string

In [2]:
from xml.dom import minidom

NUM_TOKENS = 2
xmldoc = minidom.parse('1.1.text.xml')


def getTokens(node):
    tokens = []
    if node is not None:
        if node.nodeType == node.TEXT_NODE:
            tokens.extend(node.data.split())
        elif node.nodeType == node.ELEMENT_NODE:
            for sub_array in [item.data.split() for item in node.childNodes]:
                tokens.extend(sub_array)
    return tokens

In [3]:
def getWordsBefore(entity_node, num_words):
    # The commented line below is for reference only. Basic return implementation, not sufficient.
    # return getTokens(entity_node.previousSibling)[-num_words:]

    words_before = []
    current_node = entity_node

    # Retrieve tokens from previous sibling nodes considering entity collision edge cases
    while True:
        # Calculate remaining offset/number of words per iteration left
        offset = num_words - len(words_before)
        if offset > 0:  # Check if we need to retrieve (more) tokens
            # Assign previousSibling of the current node as current node per iteration
            current_node = current_node.previousSibling
            if current_node is not None:
                # Get last n tokens from the left
                current_tokens = getTokens(current_node)[-offset:]
                # feel free to change the below line if you dont want to apply filter (.,)
                current_tokens = [token for token in current_tokens if token not in ['.', ',']]
                # Prepend results
                current_tokens.extend(words_before)
                words_before = current_tokens
                continue  # Proceed iteration
            break  # End iteration if we reach the beginning of abstract content
        break  # End iteration if n words retrieved
    return words_before

In [4]:
def getWordsAfter(entity_node, num_words):
    # The commented line below is for reference only. Basic return implementation, not sufficient.
    # return getTokens(entity_node.nextSibling)[-num_words:]

    words_after = []
    current_node = entity_node

    # Retrieve tokens from previous sibling nodes considering entity collision edge cases
    while True:
        # Calculate remaining offset/number of words per iteration left
        offset = num_words - len(words_after)
        if offset > 0:  # Check if we need to retrieve (more) tokens
            # Assign nextSibling of the current node as current node per iteration
            current_node = current_node.nextSibling
            if current_node is not None:
                # Get first n tokens from the right
                current_tokens = getTokens(current_node)[:offset]
                # Feel free to change the below line if you dont want to apply filter (.,)
                current_tokens = [token for token in current_tokens if token not in ['.', ',']]
                # Append results
                words_after.extend(current_tokens)
                continue  # Proceed iteration
            break  # End iteration if we reach the end of abstract content
        break  # End iteration if n words retrieved
    return words_after

In [5]:
#dictionary to store the context
result = {}
absId = []
entId = []
conxt = []
for entity in xmldoc.getElementsByTagName('entity'):

    textId = entity.parentNode.parentNode.getAttribute("id")
    entityId = entity.getAttribute("id")
    absId.append(textId)
    entId.append(entityId)

    entityWords = getTokens(entity)
    wordsBefore = getWordsBefore(entity, NUM_TOKENS)
    wordsAfter = getWordsAfter(entity, NUM_TOKENS)

    contextTokens = wordsBefore + entityWords + wordsAfter
    contextString = " ".join(contextTokens)
    conxt.append(contextString.lower())

    #print(textId, entityId, contextString)
    #constract a dataframe with the abstract Id, entity Id and the context k=2
    entity_context= pd.DataFrame({'absID': absId, 'entityID': entId, 'context':conxt})
    
   
    # Build result object (disctionary)

    if not result.get(textId):
        result[textId] = {}

    result[textId][entityId] = contextString

entity_context.head(5)

Unnamed: 0,absID,entityID,context
0,H01-1001,H01-1001.1,oral communication is ubiquitous
1,H01-1001,H01-1001.2,development of storage media and networks one ...
2,H01-1001,H01-1001.3,store a conversation for documentation.
3,H01-1001,H01-1001.4,in a large database traditional information
4,H01-1001,H01-1001.5,database traditional information retrieval tec...


The above dataframe consists of the abstract Id, entitity Id and the entitiy context with a window of 2, like collecting 2 words before and 2 word after the entity. 

 ## 2. Get context embedding
 Context is of type string. We need to tokenize it so that we can get emmedings for each word in the context.
 There are 5259 entities in the given abstracts. 

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import itertools

In [34]:
#tokenize tokens of the entitiy context
context_string = entity_context['context']
context_tokens = [word_tokenize(i) for i in context_string]
print(context_tokens)
#type(context_tokens[0])
len(context_tokens)

[['oral', 'communication', 'is', 'ubiquitous'], ['development', 'of', 'storage', 'media', 'and', 'networks', 'one', 'could'], ['store', 'a', 'conversation', 'for', 'documentation', '.'], ['in', 'a', 'large', 'database', 'traditional', 'information'], ['database', 'traditional', 'information', 'retrieval', 'techniques', 'use', 'a'], ['use', 'a', 'histogram', 'of', 'keywords'], ['histogram', 'of', 'keywords', 'as', 'the'], ['as', 'the', 'document', 'representation', 'but', 'oral'], ['representation', 'but', 'oral', 'communication', 'may', 'offer'], ['offer', 'additional', 'indices', 'such', 'as'], ['an', 'alternative', 'index', 'could', 'be'], ['of', 'the', 'automatic', 'detection', 'of', 'those'], ['of', 'larger', 'database', 'and', 'detect'], ['a', 'large', 'database', 'of', 'tv'], ['database', 'of', 'tv', 'shows', 'emotions', 'and'], ['tv', 'shows', 'emotions', 'and', 'other'], ['and', 'other', 'indices', 'such', 'as'], ['as', 'the', 'dominance', 'distribution', 'of', 'speakers', 'mig

5259

At this point context is represented as a list of lists. A list which has 5259 lists within. 
The length of the list is the numeber of tokens which makes up the context. 
Now for each token we will take the embeddinig of the token from the file that contains vocabulary words and embeddings of the words. 

In [84]:
#load the pickled file with token embeddings 
import pickle
embeddings = pd.read_pickle('GloVe.pkl')

In [85]:
#print(embeddings)
type(embeddings['Vector'][0][0])
len(embeddings)

4297

In [77]:
context_vector = {}

for i in range(len(context_tokens)):
    tokens_vector = []
    for token in context_tokens[i]:
        #context_vector[i] = {}
        for j in range(len(embeddings)):
            if token == embeddings['Word'][j]: 
                vector = embeddings['Vector'][j]  
                tokens_vector.append(vector)
                context_vector[i] = tokens_vector
                #print(token, embeddings['Vector'][j])     

In [78]:
#print(context_vector)
len(context_vector)

5256

In [79]:
#convert the dictionary into a dataframe
context_df = pd.DataFrame(context_vector.items(), columns=['id', 'context_vector'])
context_df.head()
#type(context_df['context_vector'][0][0][0])

Unnamed: 0,id,context_vector
0,0,"[[-0.40088, 0.38661, -0.53597, -0.081736, 0.37..."
1,1,"[[-0.10137, -0.27174, -0.31147, -0.75354, 0.05..."
2,2,"[[-0.52579, 0.3754, -0.41452, -0.48975, 0.0334..."
3,3,"[[-0.44399, 0.12817, -0.25247, -0.18582, -0.16..."
4,4,"[[-1.0001, 0.45166, 0.38212, -0.098643, 0.1492..."


In [80]:
#represent each entity with its id and contex
ent_Id = entity_context['entityID']
context_vector = context_df['context_vector']
entity_context_df = pd.DataFrame({'entity_Id': ent_Id, 'context_vector':context_vector })
entity_context_df.head()

Unnamed: 0,entity_Id,context_vector
0,H01-1001.1,"[[-0.40088, 0.38661, -0.53597, -0.081736, 0.37..."
1,H01-1001.2,"[[-0.10137, -0.27174, -0.31147, -0.75354, 0.05..."
2,H01-1001.3,"[[-0.52579, 0.3754, -0.41452, -0.48975, 0.0334..."
3,H01-1001.4,"[[-0.44399, 0.12817, -0.25247, -0.18582, -0.16..."
4,H01-1001.5,"[[-1.0001, 0.45166, 0.38212, -0.098643, 0.1492..."


Each entitty is now represented by its id and the vector embeddings of the context words.
We pickle context embeddings for later use. 

In [81]:
entity_context_df.to_pickle("context.pkl")