In [2]:
#read the data
import pandas as pd
df = pd.read_csv('abstract_narrations.csv')

In [None]:
import nltk
import random
import numpy as np
import tensorflow as tf
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [4]:
df['narration'] = df['narration'].str.lower()
df['narration'] = df['narration'].str.replace('&lt;br/&gt;', '')
df['narration'] = df['narration'].apply(tokenizer.tokenize)

In [5]:
df.head()

Unnamed: 0,title,narration
0,Adaptive dynamic coordination of damping contr...,"[in, the, last, decades, global, environmental..."
1,RAPID: On-mask Chemical Modulation of Respirat...,"[non, technical, abstract, spread, of, infecti..."
2,Collaborative Research: Biomass burning smoke ...,"[microbes, are, found, in, all, environments, ..."
3,SBIR Phase I: AK-423: A broad-spectrum antivi...,"[the, broader, impact, commercial, potential, ..."
4,The Nature of Coupled Heat and Mass Transport ...,"[the, goal, of, this, project, is, to, underst..."


In [6]:
all_stopwords = stopwords.words('english')
df['narration'] = df['narration'].apply(lambda x: [word for word in x if not word in all_stopwords])

In [7]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [8]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [9]:
import collections
vocabulary_size = 25000

def build_dataset(dataset):
    narration = dataset['narration'].tolist()
    narration = flatten(narration)

    words = [['UNK', -1]]
    words.extend(collections.Counter(narration).most_common(vocabulary_size - 1))

    # Dicionário
    dictionary = dict()
    for word, _ in words:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0

    for word in narration:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)

    words[0][1] = unk_count

    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    assert len(dictionary) == vocabulary_size

    return data, words, dictionary, reverse_dictionary
    
def build_dataset_with_existing_dictionary(narration, dictionary):
    data = list()
    for word in narration:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # 'UNK'
        data.append(index)
    return data


In [10]:
data, words, dictionary, reverse_dictionary = build_dataset(train)
test_data = {}

for index, row in test.iterrows():
    test_data[row['title']] = build_dataset_with_existing_dictionary(row['narration'],dictionary)

In [11]:
sample = next(iter(test_data))
print('\nPalavras mais comuns (+UNK)', words[:10])
print('\nAmostra: ', sample, test_data[sample])


Palavras mais comuns (+UNK) [['UNK', 30107], ('project', 33823), ('research', 29486), ('using', 17009), ('students', 15941), ('data', 15879), ('support', 15560), ('award', 12825), ('broader', 12743), ('impacts', 12682)]

Amostra:  Reactions of Arynes and Other Reactive Intermediates [6, 171, 628, 16182, 33, 442, 210, 1499, 0, 9330, 54, 3670, 2743, 16756, 728, 11, 2706, 2082, 4803, 278, 0, 146, 11, 797, 51, 2224, 4803, 14424, 309, 456, 1064, 3103, 463, 8346, 2700, 363, 11, 171, 129, 797, 51, 261, 270, 628, 456, 1064, 64, 846, 171, 240, 10411, 210, 1460, 11, 2082, 0, 129, 1593, 171, 2421, 109, 5281, 179, 101, 352, 706, 464, 378, 24, 71, 129, 2706, 342, 612, 64, 515, 2, 797, 162, 16052, 11331, 495, 312, 5142, 140, 1580, 29, 1070, 48, 706, 2207, 1956, 1174, 5601, 652, 147, 1377, 127, 1012, 261, 211, 24, 95, 52, 4, 2, 1, 165, 564, 2370, 4, 942, 105, 16756, 1764, 16756, 175, 33, 24, 95, 4, 2743, 360, 31, 1036, 557, 55, 33, 0, 24, 95, 0, 9330, 4, 32, 0, 24754, 24755, 1174, 28, 0, 210, 781, 1

In [16]:
def define_batchs(data, batch_size, window_size):
    index = 0
    span = 2 * window_size + 1
    batch = np.ndarray(shape=(batch_size,span-1), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    buffer = collections.deque(maxlen=span)

    for _ in range(span):
        buffer.append(data[index])
        index = (index + 1) % len(data)

    num_samples = 2*window_size

    for i in range(batch_size // num_samples):
        k=0
        for j in list(range(window_size))+list(range(window_size+1,2*window_size+1)):
            batch[i * num_samples + k] = buffer[window_size]
            labels[i * num_samples + k, 0] = buffer[j]
            k += 1

        buffer.append(data[index])
        index = (index + 1) % len(data)

    assert batch.shape[0]==batch_size and batch.shape[1]== span-1
    return batch, labels

In [21]:
def define_tests_batchs(data, batch_size):
    index = 0
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)

    for bi in range(batch_size):
        batch[bi] = data[index]
        index = (index + 1) % len(data)

    return batch

In [18]:
index = 0
batch, labels = define_batchs(data, batch_size=8, window_size=2)
print('\nCom window_size = %d:' % (2))
print('    batch:', [[reverse_dictionary[bii] for bii in bi] for bi in batch])
print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


Com window_size = 2:
    batch: [['study', 'study', 'study', 'study'], ['study', 'study', 'study', 'study'], ['study', 'study', 'study', 'study'], ['study', 'study', 'study', 'study'], ['interacting', 'interacting', 'interacting', 'interacting'], ['interacting', 'interacting', 'interacting', 'interacting'], ['interacting', 'interacting', 'interacting', 'interacting'], ['interacting', 'interacting', 'interacting', 'interacting']]
    labels: ['nontechnical', 'summarythe', 'interacting', 'electrons', 'summarythe', 'study', 'electrons', 'crucial']


In [23]:
index = 0
test_batch = define_tests_batchs(test_data[list(test_data.keys())[0]], batch_size=8)
print('\nCom window_size = %d:' % (2))
print('    labels:', [reverse_dictionary[li] for li in test_batch.reshape(8)])


Com window_size = 2:
    labels: ['support', 'chemical', 'synthesis', 'syn', 'program', 'division', 'chemistry', 'professor']


In [26]:
# setting the hiperparamethers

batch_size = 128
embedding_size = 128
window_size = 4

# validation 
valid_size = 16
valid_window = 50
valid_examples = np.array(random.sample(range(valid_window), valid_size))
valid_examples = np.append(valid_examples,random.sample(range(1000, 1000+valid_window), valid_size),axis=0)

num_sampled = 32

In [None]:
# setting the placeholders

tf.compat.v1.reset_default_graph()
train_dataset = tf.compat.v1.placeholder(tf.int32, shape=[batch_size,2*window_size])
train_labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
test_labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size],name='test_dataset')

In [None]:
# setting embedding variables
embeddings = tf.Variable(tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0,dtype=tf.float32))

softmax_weights = tf.Variable(tf.random.truncated_normal([vocabulary_size, embedding_size],
                     stddev=1.0 / math.sqrt(embedding_size),dtype=tf.float32))

softmax_biases = tf.Variable(tf.zeros([vocabulary_size],dtype=tf.float32))