# Code

## Data import

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import random
import requests
import gensim
import string

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Activation, Dense, Dropout, Embedding
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import RMSprop

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.test.utils import datapath

from pybtex.database import parse_file

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, GPT2LMHeadModel, GPT2Tokenizer



In [2]:
bib_data = parse_file('data/test_dataset.bib')

In [3]:
list(bib_data.entries.keys())

['wassa-2021-approaches',
 'xiang-etal-2021-toxccin',
 'kerz-etal-2021-language',
 'lindow-etal-2021-partisanship',
 'akula-garibay-2021-explainable',
 'troiano-etal-2021-emotion',
 'dayanik-pado-2021-disentangling',
 'lamprinidis-etal-2021-universal',
 'bianchi-etal-2021-feel']

In [4]:
os.remove('data.txt')

In [5]:
for k in bib_data.entries.keys():
    try:
        f = open('data.txt', 'a')
        f.write(bib_data.entries[k].fields['abstract'])
        f.close()
        print(k)
    except KeyError:
        pass

xiang-etal-2021-toxccin
kerz-etal-2021-language
lindow-etal-2021-partisanship
akula-garibay-2021-explainable
troiano-etal-2021-emotion
dayanik-pado-2021-disentangling
lamprinidis-etal-2021-universal
bianchi-etal-2021-feel


In [1]:
text = open("data.txt").read()

<class 'str'>


## Baseline #1: char-level LSTM

In [7]:
# eliminate stop words
def tokenize_input(input):
    # lowercase
    input = input.lower()

    # use tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # end result in final
    final = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(final)

In [8]:
processed = tokenize_input(text)

In [9]:
processed

'despite recent successes transformer based models terms effectiveness variety tasks decisions often remain opaque humans explanations particularly important tasks like offensive language toxicity detection social media manual appeal process often place dispute automatically flagged content work propose technique improve interpretability models based simple powerful assumption post least toxic toxic span incorporate assumption transformer models scoring post based maximum toxicity spans augmenting training process identify correct spans find approach effective produce explanations exceed quality provided logistic regression analysis often regarded highly interpretable model according human study aim paper twofold 1 automatically predict ratings assigned viewers 14 categories available ted talks multi label classification task 2 determine types features drive classification accuracy categories focus features language usage five groups pertaining syntactic complexity lexical richness reg

In [10]:
chars = sorted(list(set(processed)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [11]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [12]:
input_len = len(processed)
vocab_len = len(chars)
print ("Total chars:", input_len)
print ("Total vocab:", vocab_len)

Total chars: 5647
Total vocab: 36


In [13]:
seq_length = 100
x_data = []
y_data = []

In [14]:
# loop through inputs
for i in range(0, input_len - seq_length, 1):
    
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [15]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 5547


In [16]:
X = np.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [17]:
y = utils.to_categorical(y_data)

In [18]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [19]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
# saving weights
filepath = "lstm_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [21]:
# fit model
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Train on 5547 samples
Epoch 1/20
Epoch 00001: loss improved from inf to 3.10250, saving model to lstm_weights.hdf5
Epoch 2/20
Epoch 00002: loss improved from 3.10250 to 2.97373, saving model to lstm_weights.hdf5
Epoch 3/20
Epoch 00003: loss improved from 2.97373 to 2.95766, saving model to lstm_weights.hdf5
Epoch 4/20
Epoch 00004: loss improved from 2.95766 to 2.94320, saving model to lstm_weights.hdf5
Epoch 5/20
Epoch 00005: loss improved from 2.94320 to 2.94045, saving model to lstm_weights.hdf5
Epoch 6/20
Epoch 00006: loss improved from 2.94045 to 2.93801, saving model to lstm_weights.hdf5
Epoch 7/20
Epoch 00007: loss improved from 2.93801 to 2.93517, saving model to lstm_weights.hdf5
Epoch 8/20
Epoch 00008: loss improved from 2.93517 to 2.93384, saving model to lstm_weights.hdf5
Epoch 9/20
Epoch 00009: loss improved from 2.93384 to 2.93119, saving model to lstm_weights.hdf5
Epoch 10/20
Epoch 00010: loss improved from 2.93119 to 2.92339, saving model to lstm_weights.hdf5
Epoch 11/20

<tensorflow.python.keras.callbacks.History at 0x1c40bf22f60>

In [22]:
filename = "lstm_weights.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [23]:
# random seed initialization
start = np.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Generated Abstract: \n")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Generated Abstract: 

" im paper twofold 1 automatically predict ratings assigned viewers 14 categories available ted talks  "


## Baseline #2: char-level GRU

In [24]:
# Storing all the unique characters present in the text
vocabulary = sorted(list(set(text)))

# Creating dictionaries to map each character to an index
char_to_indices = dict((c, i) for i, c in enumerate(vocabulary))
indices_to_char = dict((i, c) for i, c in enumerate(vocabulary))

In [25]:
max_length = 100
steps = 5
sentences = []
next_chars = []
for i in range(0, len(text) - max_length, steps):
    sentences.append(text[i: i + max_length])
    next_chars.append(text[i + max_length])
      
# Hot encoding each character into a boolean vector
  
# Initializing a matrix of boolean vectors with each column representing
# the hot encoded representation of the character
X = np.zeros((len(sentences), max_length, len(vocabulary)), dtype = np.bool)
y = np.zeros((len(sentences), len(vocabulary)), dtype = np.bool)
  
# Placing the value 1 at the appropriate position for each vector
# to complete the hot-encoding process
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_indices[char]] = 1
    y[i, char_to_indices[next_chars[i]]] = 1

In [26]:
model = Sequential()
  
# Defining the cell type
model.add(GRU(128, input_shape =(max_length, len(vocabulary))))
  
# Defining the densely connected Neural Network layer
model.add(Dense(len(vocabulary)))
  
# Defining the activation function for the cell
model.add(Activation('softmax'))

# Defining the optimizing function
optimizer = RMSprop(lr = 0.01)
  
# Configuring the model for training
model.compile(loss ='categorical_crossentropy')

In [27]:
# Helper function to sample an index from a probability array
def sample_index(preds, temperature = 1.0):
# temperature determines the freedom the function has when generating text
  
    # Converting the predictions vector into a numpy array
    preds = np.asarray(preds).astype('float64')
  
    # Normalizing the predicitons array
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
  
    # The main sampling step. Creates an array of probablities signifying
    # the probability of each character to be the next character in the 
    # generated text
    probas = np.random.multinomial(1, preds, 1)
  
    # Returning the character with maximum probability to be the next character
    # in the generated text
    return np.argmax(probas)

In [28]:
# Defining a helper function to save the model after each epoch
# in which the loss decreases
filepath = "gru_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor ='loss', 
                             save_best_only = True, 
                             mode ='min')

In [29]:
# Defining a helper function to reduce the learning rate each time the learning plateaus
reduce_alpha = ReduceLROnPlateau(monitor ='loss', factor = 0.2,
							patience = 1, min_lr = 0.001)
callbacks = [checkpoint, reduce_alpha]

In [30]:
# Training the GRU model
model.fit(X, y, batch_size = 128, epochs = 50, callbacks = callbacks)

Train on 1454 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1c40d1a6898>

In [32]:
def generate_text(length, diversity):
	# Get random starting text
	start_index = random.randint(0, len(text) - max_length - 1)

	# Defining the generated text
	generated = ''
	sentence = text[start_index: start_index + max_length]
	generated += sentence

	# Generating new text of given length
	for i in range(length):

			# Initializing the predicition vector
			x_pred = np.zeros((1, max_length, len(vocabulary)))
			for t, char in enumerate(sentence):
				x_pred[0, t, char_to_indices[char]] = 1.

			# Making the predicitons
			preds = model.predict(x_pred, verbose = 0)[0]

			# Getting the index of the next most probable index
			next_index = sample_index(preds, diversity)

			# Getting the most probable next character using the mapping built
			next_char = indices_to_char[next_index]

			# Generating new text
            generated += next_char
			sentence = sentence[1:] + next_char
	return generated

print(generate_text(500, 0.2))

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 28)

## Baseline #3: Standard GPT-2

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
inputs = tokenizer.encode('In this paper we present', 
                          max_length=1024, 
                          truncation=True,
                          return_tensors='pt')

In [None]:
outputs = model.generate(inputs, max_length=200, do_sample=True)

In [None]:
to_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
outputs = model.generate(inputs, 
                         max_length=200, 
                         do_sample=True)

tokenizer.decode(outputs[0], skip_special_tokens=True)

## Word-level LSTM + pre-trained word2vec 

In [None]:
text_path = "data/arxiv_abstracts.txt"

In [None]:
max_sentence_len = 40

with open(text_path) as file_:
    docs = file_.readlines()

sentences = [[word for word in doc.lower().translate(string.punctuation).split()[:max_sentence_len]] for doc in docs]

In [None]:
w2v_abs_model = gensim.models.Word2Vec(sentences, vector_size=100, min_count=1, window=5, epochs=5)
w2v_abs_model.save("word2vec_arxiv_abstracts.model")

In [None]:
vocab_size, emdedding_size = w2v_abs_model.wv.vectors.shape
print(vocab_size,emdedding_size)

In [None]:
example_vector = w2v_abs_model.wv['computer']
print(example_vector)

In [None]:
example_similar = w2v_abs_model.wv.most_similar('computer', topn=10) 
print(example_similar)

In [None]:
def word_to_id(word):
    return w2v_abs_model.wv.key_to_index[word]

def id_to_word(id):
    return w2v_abs_model.wv.index_to_key[id]

In [None]:
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)

for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = word_to_id(word)
    train_y[i] = word_to_id(sentence[-1])

print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

In [None]:
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=100, weights=[w2v_abs_model.wv.vectors]))
model.add(LSTM(256, input_shape=(vocab_size, 100), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
model.fit(train_x, train_y,
          batch_size=128,
          epochs=20)

In [None]:
def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)

    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
  
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
  
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def generate_next(text, num_generated=20):
    word_ids = [word_to_id(word) for word in text.lower().split()]
    
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_ids))
        id = sample(prediction[-1], temperature=0.5)
        word_ids.append(id)
    
    return ' '.join(id_to_word(id) for id in word_ids)

In [None]:
random_word = random.choice(w2v_abs_model.wv.index_to_key)

generate_next("Here we present")

## LSTM + custom-trained word2vec

## LSTM + temporal embeddings

In [None]:
compass_path = "data/twec_test/compass.txt"

In [None]:
from temp import TWEC

# siter is the number of iterations of the compass, diter is the number of iterations of each slice
aligner = TWEC(vector_size=30, siter=10, diter=10, workers=4)

In [None]:
max_sentence_len = 40

with open(compass_path) as file_:
    docs = file_.readlines()

sentences = [[word for word in doc.lower().translate(string.punctuation).split()[:max_sentence_len]] for doc in docs]

In [None]:
aligner.train_compass(compass_path, overwrite=False)

In [None]:
slice_one = aligner.train_slice("data/twec_test/arxiv_14.txt", save=True) 
slice_two = aligner.train_slice("data/twec_test/arxiv_9.txt", save=True)

In [None]:
model1 = gensim.models.Word2Vec.load("model/arxiv_14.model")
model2 = gensim.models.Word2Vec.load("model/arxiv_9.model")

In [None]:
vocab_size, emdedding_size = model2.wv.vectors.shape
print(vocab_size,emdedding_size)

In [None]:
example_vector = model2.wv['computer']
print(example_vector)

In [None]:
example_similar = model2.wv.most_similar('computer', topn=10) 
print(example_similar)

In [None]:
def word_to_id(word):
    return model2.wv.key_to_index[word]

def id_to_word(id):
    return model2.wv.index_to_key[id]

In [None]:
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)

In [None]:
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = word_to_id(word)
    train_y[i] = word_to_id(sentence[-1])

print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

In [None]:
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=30, weights=[model2.wv.vectors]))
model.add(LSTM(256, input_shape=(vocab_size, 30), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
model.fit(train_x, train_y,
          batch_size=128,
          epochs=10)

In [None]:
random_word = random.choice(model2.wv.index_to_key)

generate_next("This paper", 30)

## Evaluation