## 1. Start parsing abstracts

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import csv
import string

In [2]:
#extracting text from xml files
from xml.dom import minidom

xmldoc = minidom.parse('1.1.text.xml')
def getTokens(node):
    tokens = []
    if node is not None:
        if node.nodeType == node.TEXT_NODE:
            tokens.extend(node.data.split())
        elif node.nodeType == node.ELEMENT_NODE:
            for sub_array in [item.data.split() for item in node.childNodes]:
                tokens.extend(sub_array)
    return tokens

In [3]:
#extracting all enities 
all_entities = []
for entity in xmldoc.getElementsByTagName('entity'):
    entityWords = getTokens(entity)
    all_entities.append(entityWords)

#print(all_entities)
len(all_entities)

5259

In [4]:
def geTitle(nodelist):
    titles = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            titles.append(node.data)
    return ''.join(titles)

In [5]:
#extract all titles. 
all_titles = []
for title in xmldoc.getElementsByTagName('title'):
    title_text = geTitle(title.childNodes)
    all_titles.append(title_text.lower())
#print(all_titles)  
len(all_titles)

350

In [8]:
def getAbstractText(abstractNode):
    # Extract abstract text
    abstractElements = []
    for item in abstractNode.childNodes:
        # If plain text node
        if item.nodeType == item.TEXT_NODE:
            abstractElements.append(item.data)
        # If xml tag node
        elif item.nodeType == item.ELEMENT_NODE:
            for sub_array in [sub_item.data for sub_item in item.childNodes]:
                abstractElements.append(sub_array)
    return ''.join(abstractElements)


all_abstracts = []
for abstract in xmldoc.getElementsByTagName('abstract'):
    abstractText = getAbstractText(abstract)
    all_abstracts.append(abstractText)

#print(all_abstracts)
len(all_abstracts)
type(all_abstracts)

list

In [65]:
#350 titles + 350 abstracts
all_data = all_titles + all_abstracts

print(all_data[0])
#len(all_data)
#type(all_data)

activity detection for information access to oral communication


## 2. Text preprocessing


In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import itertools

In [14]:
#tokenize text
tokens = [word_tokenize(i) for i in all_data]
tokens = list(itertools.chain.from_iterable(tokens))

#print(tokens)
len(tokens)

43385

In [70]:
#remove, dublicated words and punctuation
#stop_words = set(stopwords.words('english')) 
punctuation = ['.', ',', '//', ':', ';', ')', '(', '%', '-']
#filtered_txt = [w for w in tokens if not w in stop_words]  
filtered_txt = [] 

#for w in tokens: 
#    if w not in stop_words and w not in punctuation: 
#        filtered_txt.append(w)
  
for w in tokens: 
    if w not in punctuation: 
        filtered_txt.append(w.lower())

#remove dublicates
filtered_tokens = list(set(filtered_txt))        
#print(filtered_tokens)
len(filtered_tokens)

['but', 'apt', 'reported', 'lexsys', 'groups', 'slavonic', 'marcus', 'lemma-', 'clustering', 'bootstrap', 'valuable', 'preferred', 'unordered', "'95", 'cf', 'bio-medical', 'weaknesses', 'abductive', 'walked', 'pairing', 'respectful', 'pos', 'date', 'facilitates', 'firmly', 'sentence-', 'demonstrations', 'filter', 'principle', 'p', 'convert', 'body', 'weighted', 'imitation', 'erroneous', 'imposed', '-based', 'memorize', 'l', 'tractability', 'reduplication', 'connectives', 'infomagnets', 'functional', 'powerful', 'communities', 'independence', 'data', 'spontaneous', 'counts', 'unix', 'inapplicable', 'educational', 'ambiguity', 'practice', 'resultant', 'anchoring', 'theoretical', 'compression', 'indeed', 'retrieves', 'particularly', 'corrector', 'syntactic', 'approaches', 'modifier', 'correlations', 'try', 'downstream', 'spatial', 'array-based', 'decrease', 'topical', 'heads', 'quality', 'category', 'french-dutch', 'practically', 'gen', 'salient', 'error', 'interfaces', 'converse', 'softw

4946

After pre-processing our corpus consists of 4946 tokens. 

In [73]:
#write vocabulary into a csv file
with open('vocab.txt', 'w') as f:
    for item in filtered_tokens:
        f.write("%s\n" % item)

In [74]:
#convert tokens of the vocabulary into a dataframe.
df_vocabulary = pd.DataFrame(filtered_tokens)
df_vocabulary.columns = ['Word']
#print(df_vocabulary)
df_vocabulary.head(5)
len(df_vocabulary)

4946

# 3. Fetching Word Embeddings

In [75]:
#get Global vectors and store them in the embeddings_dict 
embeddings_dict = {}
f=open('glove.6B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], "float32")
    embeddings_dict[word] = vector

In [19]:
print(len(embeddings_dict))

400000


In [20]:
#Convert dictionary into a DataFrame
#put an id for each word
embeddings_df = pd.DataFrame(list(embeddings_dict.items()), columns=['Word', 'Vector'])
term_id = range(len(embeddings_dict))
embeddings_df.insert(0, "Id", term_id, True) 
embeddings_df.head(5)

Unnamed: 0,Id,Word,Vector
0,0,the,"[0.04656, 0.21318, -0.0074364, -0.45854, -0.03..."
1,1,",","[-0.25539, -0.25723, 0.13169, -0.042688, 0.218..."
2,2,.,"[-0.12559, 0.01363, 0.10306, -0.10123, 0.09812..."
3,3,of,"[-0.076947, -0.021211, 0.21271, -0.72232, -0.1..."
4,4,to,"[-0.25756, -0.057132, -0.6719, -0.38082, -0.36..."


In [76]:
#get glove word embeddings for the words in our vocabulary. 
# constract a dictionary which contains words in our vocabulary and their respective embedding from GloVe. 
vocab_dictionary = {}
vocab_words = []
test_words = []
vector_embeddings = []
for i in range (len(df_vocabulary)):
    if df_vocabulary['Word'][i] in embeddings_dict:
        vocab_words.append(df_vocabulary['Word'][i])
        vector_embeddings.append(embeddings_dict[df_vocabulary['Word'][i]])

#print(vocab_words)
#print(vector_embeddings)
len(vocab_words)
#type(vector_embeddings)

4297

In [77]:
#word_id = range(len(df_vocabulary))
word_vector_final= pd.DataFrame({'Word': vocab_words, 'Embedding Vector': vector_embeddings})     
word_vector_final.index = range(len(word_vector_final)) 
display(word_vector_final.head(5))
len(word_vector_final)

Unnamed: 0,Word,Embedding Vector
0,but,"[-0.0093601, 0.22789, -0.10275, 0.0010893, 0.2..."
1,apt,"[-0.41618, -0.37872, 0.4119, 0.015433, 0.62914..."
2,reported,"[-0.21093, 0.51757, 0.042662, 0.013656, -0.577..."
3,groups,"[-0.12724, 0.27224, -0.055019, -0.0030393, -0...."
4,slavonic,"[-0.39907, -0.76034, -0.17752, 0.22556, -0.110..."


4297

In [78]:
#merge the words in your vocabulary file with the ID and embeddings from glove. 
#get word embedding of the words in the vocabulary by simply joininig the dataframes on word 
vocabulary = pd.merge(df_vocabulary, embeddings_df, on='Word')
len(vocabulary)

4297

In [79]:
#vocabulary['Vector'][0]
len(vocabulary['Vector'][0])

300

From all the tokens in the corpus we have 4297 for which we have found an embedding from global vectos 

In [81]:
vocabulary = vocabulary[['Id','Word', 'Vector']]
display(vocabulary.head(10))

Unnamed: 0,Id,Word,Vector
0,34,but,"[-0.0093601, 0.22789, -0.10275, 0.0010893, 0.2..."
1,19219,apt,"[-0.41618, -0.37872, 0.4119, 0.015433, 0.62914..."
2,293,reported,"[-0.21093, 0.51757, 0.042662, 0.013656, -0.577..."
3,503,groups,"[-0.12724, 0.27224, -0.055019, -0.0030393, -0...."
4,48913,slavonic,"[-0.39907, -0.76034, -0.17752, 0.22556, -0.110..."
5,7070,marcus,"[0.52047, -0.44494, -0.2596, -0.14926, 0.53999..."
6,45209,clustering,"[-0.49152, 0.83904, 0.48919, -0.53234, -0.0898..."
7,79856,bootstrap,"[-0.54975, 0.13891, 0.58945, -0.24693, 0.30243..."
8,4640,valuable,"[0.34956, 0.24624, 0.20949, -0.15911, 0.3061, ..."
9,4968,preferred,"[-0.20157, 0.31404, 0.42666, -0.17512, 0.39641..."


Pickle the vocabulary together with word emeddings for later use

In [82]:
import pickle
vocabulary.to_pickle("GloVe.pkl")

In [83]:
vocab = vocabulary.drop(columns=['Vector'])
display(vocab.head(5))

Unnamed: 0,Id,Word
0,34,but
1,19219,apt
2,293,reported
3,503,groups
4,48913,slavonic


In [84]:
#store vocabulary into a csv
vocab.to_csv('vocab.csv', index=False) 