In [17]:
# Reads the .xml files and creates a .csv containing only the needed information.
# Uses the AwardTitle and AbstractNarration elements to build the dataset.

import os, glob
import csv

# Get the data from the .xml tags
def filterData(data, tag):
    """Return the text between the first ``<tag>`` and the ``</tag>`` that follows it.

    Args:
        data: Raw XML document as a string.
        tag: Element name without angle brackets, e.g. ``"AwardTitle"``.

    Returns:
        The enclosed text, or an empty string when the opening tag is absent
        or has no closing tag after it.
    """
    startTag = "<%s>" % (tag)
    endTag = "</%s>" % (tag)

    start = data.find(startTag)
    if start == -1:
        # Without this guard, find()'s -1 would be turned into a bogus offset
        # and an arbitrary slice of the document would be returned.
        return ""
    start += len(startTag)

    # Look for the closing tag only after the opening one.
    end = data.find(endTag, start)
    if end == -1:
        return ""
    return data[start:end]

#Clean the data removing unnecessary tag
def cleanData(data):
    """Remove the ``<![CDATA[`` / ``]]>`` markers of the first CDATA section in ``data``.

    Text before and after the section is kept unchanged, so the result is
    correct even when the section is surrounded by whitespace or other text.

    Args:
        data: Text extracted from an XML element.

    Returns:
        ``data`` without the CDATA markers; ``data`` unchanged when it holds
        no CDATA section.
    """
    startTag = "<![CDATA["
    endTag = "]]>"

    start = data.find(startTag)
    if start == -1:
        return data

    inner_start = start + len(startTag)
    end = data.find(endTag, inner_start)
    if end == -1:
        # Unterminated section: drop only the opening marker.
        return data[:start] + data[inner_start:]
    return data[:start] + data[inner_start:end] + data[end + len(endTag):]

# Self-closing forms that mark an award without a title or narration.
emptyTags = ["<AwardTitle/>", "<AbstractNarration/>"]

# newline='' is what the csv module expects for files it writes to; the explicit
# UTF-8 encoding matches the encoding used to read the XML files below.
# The with-block closes the CSV even if an XML file fails to load.
with open('abstract_narrations.csv', 'w', newline='', encoding='UTF8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['title', 'narration'])

    # sorted() makes the row order independent of the file-system listing order.
    for filename in sorted(glob.glob('files/*.xml')):
        with open(filename, 'r', encoding='UTF8') as f:  # open in read-only mode
            data = f.read()

        # Skip awards whose title or narration element is self-closed.
        if any(x in data for x in emptyTags):
            continue

        title = cleanData(filterData(data, "AwardTitle"))
        narration = cleanData(filterData(data, "AbstractNarration"))

        # Also skip elements that are present but empty (e.g. <AwardTitle></AwardTitle>),
        # which the self-closing check above does not catch.
        if not title.strip() or not narration.strip():
            continue

        writer.writerow([title, narration])

In [None]:
#
# If the file abstract_narrations.csv already exists, the cell above does not need to be run.
#

In [69]:
import pandas as pd
# Both columns are free text: keep_default_na=False stops pandas from turning
# empty fields or literal values such as "NA"/"null" into float NaN.
df = pd.read_csv('abstract_narrations.csv', keep_default_na=False)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Download the NLTK resources requested by this notebook.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Tokenizer built from the regular expression \w+.
tokenizer = RegexpTokenizer(r'\w+')

In [72]:
# Normalise the narration text and split it into tokens.
# NOTE: this overwrites df['narration'] (strings -> token lists), so it is meant
# to run once per load of the CSV; reload df before running it again.
# Missing narrations become empty strings so that tokenizing yields [] instead
# of failing on a non-string value.
narration_text = df['narration'].fillna('').str.lower()
# The escaped <br/> marker is replaced by a space (not '') so the words on
# either side of a line break are not fused into one token; regex=False makes
# the literal match explicit.
narration_text = narration_text.str.replace('&lt;br/&gt;', ' ', regex=False)
df['narration'] = narration_text.apply(tokenizer.tokenize)

In [73]:
df.head()

Unnamed: 0,title,narration
0,Adaptive dynamic coordination of damping contr...,"[in, the, last, decades, global, environmental..."
1,RAPID: On-mask Chemical Modulation of Respirat...,"[non, technical, abstract, spread, of, infecti..."
2,Collaborative Research: Biomass burning smoke ...,"[microbes, are, found, in, all, environments, ..."
3,SBIR Phase I: AK-423: A broad-spectrum antivi...,"[the, broader, impact, commercial, potential, ..."
4,The Nature of Coupled Heat and Mass Transport ...,"[the, goal, of, this, project, is, to, underst..."


In [74]:
all_stopwords = stopwords.words('english')
# Membership is tested once per token of every narration, so use a set
# (constant-time lookup) rather than scanning the list each time.
stopword_set = frozenset(all_stopwords)
df['narration'] = df['narration'].apply(
    lambda tokens: [word for word in tokens if word not in stopword_set]
)

In [75]:
from sklearn.model_selection import train_test_split
# A fixed random_state makes the 80/20 split reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [76]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [77]:
import collections
vocabulary_size = 25000  # default cap on the vocabulary size, 'UNK' included


def build_dataset(dataset, vocab_size=None):
    """Encode the narration tokens of ``dataset`` as integer ids.

    The ``vocab_size - 1`` most frequent tokens receive ids 1, 2, ... in order
    of decreasing frequency; id 0 is reserved for ``'UNK'`` and is used for
    every token outside that set.

    Args:
        dataset: Object whose ``['narration']`` entry provides ``tolist()``
            returning a list of token lists (e.g. a DataFrame column).
        vocab_size: Maximum number of dictionary entries, ``'UNK'`` included.
            Defaults to the module-level ``vocabulary_size``.

    Returns:
        A tuple ``(data, words, dictionary, reverse_dictionary)``:
        ``data`` is the list of ids for all tokens in order, ``words`` is the
        list of ``[token, count]`` entries starting with ``['UNK', unk_count]``,
        ``dictionary`` maps token -> id and ``reverse_dictionary`` maps
        id -> token.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size

    narration = flatten(dataset['narration'].tolist())

    # Slot 0 is the placeholder for unknown tokens; its count is filled in below.
    words = [['UNK', -1]]
    words.extend(collections.Counter(narration).most_common(vocab_size - 1))

    # Dictionary: token -> id, assigned in order of appearance in `words`.
    dictionary = {}
    for word, _ in words:
        dictionary[word] = len(dictionary)

    data = []
    unk_count = 0
    for word in narration:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)

    words[0][1] = unk_count

    reverse_dictionary = {index: word for word, index in dictionary.items()}

    # A corpus with fewer distinct tokens than the cap legitimately produces a
    # smaller dictionary, so only the upper bound is an invariant.
    assert len(dictionary) <= vocab_size

    return data, words, dictionary, reverse_dictionary
    

In [78]:
# Build the integer-encoded corpus and the vocabulary mappings from the `test` split.
# NOTE(review): the vocabulary is derived from `test` rather than `train` — confirm this is intended.
data, words, dictionary, reverse_dictionary = build_dataset(test)