In [None]:
# Reads the .xml files and create a .csv with only the needed information 
# Uses the AwardTitle and the AbstractNarration information to create the dataset

import os, glob
import csv

# Get the data from the .xml tags
def filterData(data, tag):
    startTag = "<%s>" % (tag)
    endTag = "</%s>" % (tag)
    start = data.find(startTag) + len(startTag)
    end = data.find(endTag)
    return data[start:end]

#Clean the data removing unnecessary tag
def cleanData(data):
    startTag = "<![CDATA["
    endTag = "]]>"
    if startTag in data:
        data = data[len(startTag):-len(endTag)]
    return data

csvfile = open('abstract_narrations.csv', 'w')
writer = csv.writer(csvfile)
writer.writerow(['title', 'narration'])

emptyTags = ["<AwardTitle/>", "<AbstractNarration/>"]

for filename in glob.glob('files/*.xml'):
    with open(os.path.join(os.getcwd(), filename), 'r', encoding='UTF8') as f: # open in readonly mode
        data = f.read()
        if all(x not in data for x in emptyTags):
            
            title = filterData(data,"AwardTitle")
            title = cleanData(title)
            
            narration = filterData(data,"AbstractNarration")
            narration = cleanData(narration)
            
            writer.writerow([title, narration])
csvfile.close()

In [None]:
#
# Case the file abstract_narrations.csv already exists the code above is not necessary
#

In [1]:
import pandas as pd
df = pd.read_csv('abstract_narrations.csv')

In [2]:
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cristine.scheibler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/cristine.scheibler/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df['narration'] = df['narration'].str.lower()
df['narration'] = df['narration'].str.replace('&lt;br/&gt;', '')
df['narration'] = df['narration'].apply(tokenizer.tokenize)

In [4]:
df.head()

Unnamed: 0,title,narration
0,Adaptive dynamic coordination of damping contr...,"[in, the, last, decades, global, environmental..."
1,RAPID: On-mask Chemical Modulation of Respirat...,"[non, technical, abstract, spread, of, infecti..."
2,Collaborative Research: Biomass burning smoke ...,"[microbes, are, found, in, all, environments, ..."
3,SBIR Phase I: AK-423: A broad-spectrum antivi...,"[the, broader, impact, commercial, potential, ..."
4,The Nature of Coupled Heat and Mass Transport ...,"[the, goal, of, this, project, is, to, underst..."


In [5]:
all_stopwords = stopwords.words('english')
df['narration'] = df['narration'].apply(lambda x: [word for word in x if not word in all_stopwords])

In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.05)

In [7]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [8]:
import collections
vocabulary_size = 25000

def build_dataset(dataset):
    narration = dataset['narration'].tolist()
    narration = flatten(narration)

    words = [['UNK', -1]]
    words.extend(collections.Counter(narration).most_common(vocabulary_size - 1))

    # Dicionário
    dictionary = dict()
    for word, _ in words:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0

    for word in narration:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)

    words[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    assert len(dictionary) == vocabulary_size

    return data, words, dictionary, reverse_dictionary
    
def build_dataset_with_existing_dictionary(narration, dictionary):
    data = list()
    for word in narration:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # 'UNK'
        data.append(index)
    return data


In [None]:
data, words, dictionary, reverse_dictionary = build_dataset(train)
test_data = {}

for index, row in test.iterrows():
    test_data[row['title']] = build_dataset_with_existing_dictionary(row['narration'],dictionary)

In [14]:
sample = next(iter(test_data))
print('\nPalavras mais comuns (+UNK)', words[:10])
print('\nAmostra: ', sample, test_data[sample])


Palavras mais comuns (+UNK) [['UNK', 37878], ('project', 39960), ('research', 35125), ('using', 20186), ('students', 19098), ('data', 18760), ('support', 18453), ('award', 15211), ('broader', 15130), ('impacts', 15062)]

Amostra:  CAREER: A Comprehensive and Lightweight Framework for Transcriptome Analysis [407, 2147, 1707, 193, 107, 125, 3552, 707, 1063, 68, 298, 414, 811, 702, 434, 193, 257, 1051, 7629, 298, 90, 707, 1063, 117, 67, 152, 289, 3099, 67, 326, 265, 193, 1113, 12143, 837, 23, 109, 215, 31, 191, 222, 438, 23, 109, 215, 159, 1131, 27, 134, 14774, 5, 816, 193, 1602, 415, 126, 171, 4214, 573, 1762, 908, 1140, 5864, 1453, 1456, 1250, 908, 2121, 1886, 908, 3457, 843, 472, 503, 2331, 146, 1092, 126, 51, 324, 5, 556, 521, 529, 179, 387, 3332, 13727, 1098, 865, 2358, 1949, 18259, 590, 335, 629, 11, 5, 816, 81, 1, 32, 11, 188, 676, 3546, 51, 1275, 707, 10413, 1063, 3, 1707, 5, 86, 817, 11, 5, 224, 1788, 797, 160, 1524, 1707, 12037, 4214, 6961, 11, 7691, 1453, 7304, 2121, 707, 1063