# Code

## Data import

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import random
import requests

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Activation, Dense, Dropout
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import RMSprop

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from pybtex.database import parse_file

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, GPT2LMHeadModel, GPT2Tokenizer

In [56]:
bib_data = parse_file('data/test_dataset.bib')

In [57]:
list(bib_data.entries.keys())

['wassa-2021-approaches',
 'xiang-etal-2021-toxccin',
 'kerz-etal-2021-language',
 'lindow-etal-2021-partisanship',
 'akula-garibay-2021-explainable',
 'troiano-etal-2021-emotion',
 'dayanik-pado-2021-disentangling',
 'lamprinidis-etal-2021-universal',
 'bianchi-etal-2021-feel']

In [61]:
os.remove('data.txt')

In [62]:
for k in bib_data.entries.keys():
    try:
        f = open('data.txt', 'a')
        f.write(bib_data.entries[k].fields['abstract'])
        f.close()
        print(k)
    except KeyError:
        pass

xiang-etal-2021-toxccin
kerz-etal-2021-language
lindow-etal-2021-partisanship
akula-garibay-2021-explainable
troiano-etal-2021-emotion
dayanik-pado-2021-disentangling
lamprinidis-etal-2021-universal
bianchi-etal-2021-feel


In [9]:
text = open("data.txt").read()

## Baseline #1: char-level LSTM

In [10]:
# eliminate stop words
def tokenize_input(input):
    # lowercase
    input = input.lower()

    # use tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # end result in final
    final = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(final)

In [11]:
processed = tokenize_input(text)

In [13]:
chars = sorted(list(set(processed)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [14]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [15]:
input_len = len(processed)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 5647
Total vocab: 36


In [16]:
seq_length = 100
x_data = []
y_data = []

In [17]:
# loop through inputs
for i in range(0, input_len - seq_length, 1):
    
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [18]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 5547


In [19]:
X = np.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [20]:
y = utils.to_categorical(y_data)

In [21]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [22]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [23]:
# saving weights
filepath = "lstm_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [24]:
# fit model
model.fit(X, y, epochs=10, batch_size=256, callbacks=desired_callbacks)

Train on 5547 samples
Epoch 1/10
Epoch 00001: loss improved from inf to 3.07730, saving model to model_weights_saved.hdf5
Epoch 2/10
Epoch 00002: loss improved from 3.07730 to 2.96878, saving model to model_weights_saved.hdf5
Epoch 3/10
Epoch 00003: loss improved from 2.96878 to 2.95555, saving model to model_weights_saved.hdf5
Epoch 4/10
Epoch 00004: loss improved from 2.95555 to 2.94841, saving model to model_weights_saved.hdf5
Epoch 5/10
Epoch 00005: loss improved from 2.94841 to 2.93708, saving model to model_weights_saved.hdf5
Epoch 6/10
Epoch 00006: loss improved from 2.93708 to 2.93647, saving model to model_weights_saved.hdf5
Epoch 7/10
Epoch 00007: loss improved from 2.93647 to 2.92930, saving model to model_weights_saved.hdf5
Epoch 8/10
Epoch 00008: loss improved from 2.92930 to 2.92780, saving model to model_weights_saved.hdf5
Epoch 9/10
Epoch 00009: loss did not improve from 2.92780
Epoch 10/10
Epoch 00010: loss improved from 2.92780 to 2.92664, saving model to model_weight

<tensorflow.python.keras.callbacks.History at 0x16f33c57fd0>

In [25]:
filename = "lstm_weights.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [30]:
# random seed initialization
start = np.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Generated Abstract: \n")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Generated Abstract: 

" easures liwc style measures show recurrent neural network classifier trained exclusively within text "


## Baseline #2: char-level GRU

In [150]:
# Storing all the unique characters present in the text
vocabulary = sorted(list(set(text)))

# Creating dictionaries to map each character to an index
char_to_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_to_char = dict((i, c) for i, c in enumerate(vocabulary))

In [151]:
max_length = 100
steps = 5
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])
      
# Hot encoding each character into a boolean vector
  
# Initializing a matrix of boolean vectors with each column representing
# the hot encoded representation of the character
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype = np.bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype = np.bool)
  
# Placing the value 1 at the appropriate position for each vector
# to complete the hot-encoding process
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
    y[i, char_to_indices[next_chars[i]]] = 1

In [152]:
model = Sequential()
  
# Defining the cell type
model.add(GRU(128, input_shape =(max_length, len(vocabulary))))
  
# Defining the densely connected Neural Network layer
model.add(Dense(len(vocabulary)))
  
# Defining the activation function for the cell
model.add(Activation('softmax'))

# Defining the optimizing function
optimizer = RMSprop(lr = 0.01)
  
# Configuring the model for training
model.compile(loss ='categorical_crossentropy')

In [153]:
# Helper function to sample an index from a probability array
def sample_index(preds, temperature = 1.0):
# temperature determines the freedom the function has when generating text
  
    # Converting the predictions vector into a numpy array
    preds = np.asarray(preds).astype('float64')
  
    # Normalizing the predicitons array
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
  
    # The main sampling step. Creates an array of probablities signifying
    # the probability of each character to be the next character in the 
    # generated text
    probas = np.random.multinomial(1, preds, 1)
  
    # Returning the character with maximum probability to be the next character
    # in the generated text
    return np.argmax(probas)

In [154]:
# Defining a helper function to save the model after each epoch
# in which the loss decreases
filepath = "gru_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor ='loss', 
                             save_best_only = True, 
                             mode ='min')

In [155]:
# Defining a helper function to reduce the learning rate each time the learning plateaus
reduce_alpha = ReduceLROnPlateau(monitor ='loss', factor = 0.2,
							patience = 1, min_lr = 0.001)
callbacks = [checkpoint, reduce_alpha]

In [156]:
# Training the GRU model
model.fit(X, y, batch_size = 128, epochs = 50, callbacks = callbacks)

Train on 1454 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x2509a43f898>

In [157]:
def generate_text(length, diversity):
	# Get random starting text
	start_index = random.randint(0, len(text) - max_length - 1)

	# Defining the generated text
	generated = ''
	sentence = text[start_index: start_index + max_length]
	generated += sentence

	# Generating new text of given length
	for i in range(length):

			# Initializing the predicition vector
			x_pred = np.zeros((1, max_length, len(vocabulary)))
			for t, char in enumerate(sentence):
				x_pred[0, t, char_to_indices[char]] = 1.

			# Making the predicitons
			preds = model.predict(x_pred, verbose = 0)[0]

			# Getting the index of the next most probable index
			next_index = sample_index(preds, diversity)

			# Getting the most probable next character using the mapping built
			next_char = indices_to_char[next_index]

			# Generating new text
			generated += next_char
			sentence = sentence[1:] + next_char
	return generated

print(generate_text(500, 0.2))

ble for TED talks in a multi-label classification task and (2) to determine what types of features derficas an tal angess on the pres an the ereress on the pres ans angess on the pres ans andeal ange ses anne the pres ange seresich of the pres anne pres anntantent tont the ereress on the pres ans ange seres an that ans ange pred ans angesich ored anntict on the pres ange pres anne pred at onthe feress to turessal se furess the preas on the pres ans ange serfire ine pred the ereress on theress on the pres an enthe ferress on the pres angeress on the pres ans andeltat on the pres ange ing ares a


## Baseline #3: Standard GPT-2

In [20]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [23]:
inputs = tokenizer.encode('In this paper we present', 
                          max_length=1024, 
                          truncation=True,
                          return_tensors='pt')

In [24]:
outputs = model.generate(inputs, max_length=200, do_sample=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [25]:
to_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [26]:
outputs = model.generate(inputs, 
                         max_length=200, 
                         do_sample=True)

tokenizer.decode(outputs[0], skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'In this paper we present a system of neural networks to analyze the role of specific brain regions in various tasks, including perceptual recognition and task performance. We conclude that the use of neural networks may have unexpected evolutionary potential for early human ancestors.'

# Model

# Evaluation