In [2]:
import os
import re
import requests
import zipfile
import pandas as pd
from functools import reduce

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import nltk
from nltk.corpus import stopwords

# typing
from typing import List, Callable, Dict

# Data pipeline
The goal of this section is to convert initial textual input into a numerical format that is compatible with our models.

## Data Loading
Download the dataset and save it to file system.
The dataset is composed of three CSV files: train, validation and test. Each file is loaded directly into its own dataframe.

In [None]:
def save_response_content(response, destination, chunk_size=32768):
    """
    Streams the body of an HTTP response to a file on disk.

    Parameters
    ----------
    response :
        Object exposing ``iter_content(chunk_size)`` that yields byte chunks
        (e.g. a ``requests.Response`` obtained with ``stream=True``).
    destination : str
        Path of the file to write. It is opened in binary write mode, so any
        existing file is overwritten.
    chunk_size : int, optional
        Number of bytes requested per chunk. Defaults to 32768.
    """
    with open(destination, "wb") as f:
        for chunk in response.iter_content(chunk_size):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

def download_data(data_path):
    """
    Downloads the FEVER data archive and extracts it into ``data_path``.

    The archive is stored as ``<data_path>/fever_data.zip``. When that file
    already exists, both the download and the extraction are skipped.

    Parameters
    ----------
    data_path : str
        Directory where the archive is saved and extracted. It is created
        when missing.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                   params={'id': toy_data_url_id},
                                   stream=True)
            # Fail early instead of saving an HTTP error page as the archive.
            response.raise_for_status()
            # The body is streamed, so it must be consumed while the session
            # (and its connection) is still open.
            save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

# Fetch the archive (if not already present) and unpack it under 'dataset'.
download_data(data_path='dataset')

Load the files into different data frames.

In [None]:
# Explicit column names, so that every column (including the first one) is named
column_names = ['Row', 'Claim', 'Evidence', 'ID', 'Label']

# Options shared by the three splits: the header row of each file is
# replaced by `column_names`
csv_options = dict(names=column_names, header=0)

# One dataframe per split
df_train = pd.read_csv('dataset/train_pairs.csv', **csv_options)
df_val = pd.read_csv('dataset/val_pairs.csv', **csv_options)
df_test = pd.read_csv('dataset/test_pairs.csv', **csv_options)


In [None]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,Row,Claim,Evidence,ID,Label
0,0,Chris Hemsworth appeared in A Perfect Getaway.,2\tHemsworth has also appeared in the science ...,3,SUPPORTS
1,1,Roald Dahl is a writer.,0\tRoald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈ...,7,SUPPORTS
2,2,Roald Dahl is a governor.,0\tRoald Dahl -LRB- -LSB- langpronˈroʊ.əld _ ˈ...,8,REFUTES
3,3,Ireland has relatively low-lying mountains.,10\tThe island 's geography comprises relative...,9,SUPPORTS
4,4,Ireland does not have relatively low-lying mou...,10\tThe island 's geography comprises relative...,10,REFUTES


## Data Pre-processing
Perform text cleaning and tokenization operations.
Start by creating some utility methods that will be used for cleaning the text.

In [None]:
# Patterns are written as raw strings so that regex escapes such as \[ , \d
# and \s are not interpreted as (invalid) Python string escapes.

# Special characters to remove: /(){}[]|@,;
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')

# Accepted symbols:
# - numbers between 0-9
# - all lower cased letters
# - whitespace, #, + _
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# - ^ beginning of a string
# - \d any digit
# - \s whitespaces and tabs
BEGINNING_IDS_RE = re.compile(r'^\d*\s*')

# Stopwords are very common words that carry little useful information for
# most text analysis procedures, so they are removed from the dataset.
# The NLTK corpus is downloaded on demand when it is not available locally.
try:
    _english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    _english_stopwords = stopwords.words('english')

STOPWORDS = set(_english_stopwords)

def replace_special_characters(text: str) -> str:
    """
    Substitutes every character matched by REPLACE_BY_SPACE_RE
    (e.g. parentheses and brackets) with a single space.
    """

    spaced_text = re.sub(REPLACE_BY_SPACE_RE, ' ', text)
    return spaced_text

def lower(text: str) -> str:
    """
    Transforms given text to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new york city'
    """

    return text.lower()

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Deletes every character that GOOD_SYMBOLS_RE does not accept
    (see the regular expression for the allowed symbols).
    """

    filtered_text = re.sub(GOOD_SYMBOLS_RE, '', text)
    return filtered_text

def remove_stopwords(text: str) -> str:
    """
    Drops the words listed in STOPWORDS from the text.

    Parameters
    ----------
    text : str
        The text to process

    Returns
    -------
    text : str
        The remaining words, joined by single spaces.
    """
    kept_words = []
    for word in text.split():
        if word and word not in STOPWORDS:
            kept_words.append(word)

    return ' '.join(kept_words)

def strip_text(text: str) -> str:
    """
    Trims leading and trailing whitespace (newlines included) from text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """

    trimmed_text = text.strip()
    return trimmed_text

def replace_ids(text: str) -> str:
    """
    Strips the leading digits and the whitespace that follows them
    (the part matched by BEGINNING_IDS_RE) from the start of the text.

    Parameters
    ----------
    text : str
        The text to process

    Returns
    -------
    text : str
        The text without the leading id.
    """
    text_without_id = re.sub(BEGINNING_IDS_RE, '', text)
    return text_without_id

# Steps applied, in order, to every piece of text.
GENERIC_PREPROCESSING_PIPELINE = [
                                  lower,
                                  replace_special_characters,
                                  filter_out_uncommon_symbols,
                                  remove_stopwords,
                                  strip_text
                                  ]

# Evidences additionally get their leading id removed first.
# A NEW list is built on purpose: aliasing the generic list and calling
# `insert` on it would also add `replace_ids` to the generic pipeline.
EVIDENCES_PREPROCESSING_PIPELINE = [replace_ids] + GENERIC_PREPROCESSING_PIPELINE
    

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Runs the text through each pre-processing function, one after the other.
    The functions are applied in list order, so the order matters.
    When no list is given, GENERIC_PREPROCESSING_PIPELINE is used.
    """
    if filter_methods is None:
        filter_methods = GENERIC_PREPROCESSING_PIPELINE

    processed_text = text
    for step in filter_methods:
        processed_text = step(processed_text)

    return processed_text

Now we are ready to pre-process the dataset.

In [None]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a pre-processed copy of the dataset.

    The 'Evidence' column goes through EVIDENCES_PREPROCESSING_PIPELINE and
    the 'Claim' column through the default pipeline of `text_prepare`.
    The input dataframe is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with (at least) the 'Evidence' and 'Claim' text columns.

    Returns
    -------
    pd.DataFrame
        A new dataframe whose 'Evidence' and 'Claim' columns are replaced by
        their pre-processed version.
    """
    # Work on a copy so the caller's dataframe is not modified in place.
    processed_df = df.copy()

    # Replace each sentence with its pre-processed version
    processed_df['Evidence'] = processed_df['Evidence'].apply(
        lambda txt: text_prepare(txt, filter_methods=EVIDENCES_PREPROCESSING_PIPELINE)
        )
    processed_df['Claim'] = processed_df['Claim'].apply(lambda txt: text_prepare(txt))

    return processed_df

# Pre-process the three splits, in the order train, validation, test.
df_train, df_val, df_test = map(preprocess_dataset, (df_train, df_val, df_test))

In [None]:
# Preview the first rows of the training split after pre-processing.
df_train.head()

Unnamed: 0,Row,Claim,Evidence,ID,Label
0,0,chris hemsworth appeared perfect getaway,hemsworth also appeared science fiction action...,3,SUPPORTS
1,1,roald dahl writer,roald dahl lrb lsb langpronrold _ dl rsb lsb u...,7,SUPPORTS
2,2,roald dahl governor,roald dahl lrb lsb langpronrold _ dl rsb lsb u...,8,REFUTES
3,3,ireland relatively lowlying mountains,island geography comprises relatively lowlying...,9,SUPPORTS
4,4,ireland relatively lowlying mountains,island geography comprises relatively lowlying...,10,REFUTES


# Models

## Constants and utilities

In [5]:
# Sample values
BATCH_SIZE = 32
EMBEDDING_SIZE = 64
MAX_TOKENS = 100
EPOCHS = 5
VOCABULARY_SIZE = 1000

RNN_UNITS = 128
DENSE_UNITS = 256
MLP_LAYERS = 3
DENSE_CLASSIFICATION_LAYERS = 3

# Architecture switches
EMBEDDING_MODE = "Simple"  # one of "Simple", "GloVe static", "GloVe dynamic"
SENTENCE_EMBEDDING_MODE = "RNN last"  # one of "RNN last", "RNN mean", "Bag of vectors", "MLP"
RNN_MODEL = "GRU"  # one of "GRU", "LSTM"
MERGE_MODE = "Concatenate"  # one of "Concatenate", "Sum", "Mean"

# Model common compile information
model_compile_info = dict(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# NOTE(review): patience (10) is larger than EPOCHS (5), so with these sample
# values early stopping cannot trigger before training ends - confirm intent.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                               patience=10,
                                               restore_best_weights=True)

# Model common training information
training_info = dict(
    verbose=1,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
)

# Model common prediction information
prediction_info = dict(
    batch_size=BATCH_SIZE,
    verbose=1,
)

## Inputs

In [6]:
# The inputs carry integer token ids (one per position, padded to MAX_TOKENS).
# The embedding dimension is added later by the token embedding layer, so it
# must not be part of the input shape.
claims_input = keras.Input(shape=(MAX_TOKENS,), dtype="int32", name="claims")
evidences_input = keras.Input(shape=(MAX_TOKENS,), dtype="int32", name="evidences")

## Embedding

In [9]:
def token_embedding_layer(vocab_size,
                    embedding_size,
                    max_length,
                    layer_name,
                    pre_trained_weights=None,
                    train=True):
  """
  Builds the Keras Embedding layer used to embed token ids.

  Parameters
  ----------
  vocab_size : int
      Passed as `input_dim` of the Embedding layer.
  embedding_size : int
      Passed as `output_dim` of the Embedding layer.
  max_length : int
      Passed as `input_length` of the Embedding layer.
  layer_name : str
      Name given to the layer.
  pre_trained_weights : optional
      Embedding matrix used as initial weights. When None (default) the
      layer keeps the default Keras initialization.
  train : bool
      Passed as `trainable` of the Embedding layer.

  Returns
  -------
  layers.Embedding
      The configured layer, created with `mask_zero=True`.
  """
  embedding_params = {
      "input_dim": vocab_size,
      "output_dim": embedding_size,
      "input_length": max_length,
      "mask_zero": True,
      "trainable": train,
      "name": layer_name,
  }

  # `is not None` (identity check) is required here: comparing a weight
  # matrix with `== None` is element-wise and cannot be used as a condition.
  if pre_trained_weights is not None:
    embedding_params["weights"] = [pre_trained_weights]

  return layers.Embedding(**embedding_params)

#TODO: implement the other embedding approaches

## Sentence embedding

In [14]:
class RNN_Sentence_Embedding(keras.layers.Layer):
  """
  Sentence embedding computed with a bidirectional RNN (GRU or LSTM).

  Input: batched token embeddings, (batch, max_tokens, embedding_size).
  Output: one vector per sentence, (batch, 2 * rnn_size):
  - mode "RNN last": final forward and backward hidden states, concatenated;
  - mode "RNN mean": mean over the token axis of the RNN outputs, ignoring
    the positions masked out by the incoming Keras mask (if any).
  """

  def __init__(self, rnn_size, layer_name, mode, rnn_model):
    """
    Parameters
    ----------
    rnn_size : int
        Units of each RNN direction.
    layer_name : str
        Name given to this layer.
    mode : str
        "RNN last" or "RNN mean".
    rnn_model : str
        "GRU" or "LSTM".

    Raises
    ------
    ValueError
        If `mode` or `rnn_model` is not one of the supported values.
    """
    # The layer name is read-only after construction, so it has to be
    # handed to the base class constructor.
    super(RNN_Sentence_Embedding, self).__init__(name=layer_name)

    if mode not in ("RNN last", "RNN mean"):
      raise ValueError("Invalid Mode. Use 'RNN Last' or 'RNN Mean'")

    self.mode = mode
    self.rnn_model = rnn_model

    rnn_params = {"units": rnn_size,
                  "return_sequences": True,
                  "return_state": True,
                  "activation": "tanh"}

    if rnn_model=="GRU":
      layer = layers.GRU(**rnn_params)
    elif rnn_model=="LSTM":
      layer = layers.LSTM(**rnn_params)
    else:
      raise ValueError("Invalid RNN model. Use 'GRU' or 'LSTM'")

    self.rnn = layers.Bidirectional(layer)

  def compute_mask(self, inputs, mask=None):
    # The token axis is reduced away, so no mask is propagated downstream.
    return None

  def call(self, inputs, mask=None):
    rnn_outputs = self.rnn(inputs, mask=mask)

    # Bidirectional with return_state=True returns the sequence output
    # followed by the forward states and then the backward states:
    #   GRU : [output, forward_h, backward_h]
    #   LSTM: [output, forward_h, forward_c, backward_h, backward_c]
    whole_seq_output = rnn_outputs[0]
    states = rnn_outputs[1:]

    if self.mode=="RNN last":
      forward_h = states[0]
      backward_h = states[len(states) // 2]
      return tf.concat([forward_h, backward_h], axis=-1)

    # mode == "RNN mean": axis=1 is the max_tokens axis (axis=0 is the batch)
    if mask is None:
      return tf.reduce_mean(whole_seq_output, axis=1)

    # Masked mean, so that padded positions do not contribute.
    weights = tf.cast(mask, whole_seq_output.dtype)[..., tf.newaxis]
    summed = tf.reduce_sum(whole_seq_output * weights, axis=1)
    counts = tf.maximum(tf.reduce_sum(weights, axis=1), 1.0)  # avoid 0-division
    return summed / counts

class Bag_Of_Vectors_Sentence_Embedding(keras.layers.Layer):
  """
  Sentence embedding computed as the mean of the token embeddings.

  Input: batched token embeddings, (batch, max_tokens, embedding_size).
  Output: (batch, embedding_size). Positions masked out by the incoming
  Keras mask (if any) are excluded from the mean.
  """

  def __init__(self, layer_name):
    # The layer name is read-only after construction, so it has to be
    # handed to the base class constructor.
    super(Bag_Of_Vectors_Sentence_Embedding, self).__init__(name=layer_name)

  def compute_mask(self, inputs, mask=None):
    # The token axis is reduced away, so no mask is propagated downstream.
    return None

  def call(self, inputs, mask=None):
    # axis=1 is the max_tokens axis (axis=0 is the batch)
    if mask is None:
      return tf.reduce_mean(inputs, axis=1)

    # Masked mean, so that padded positions do not contribute.
    weights = tf.cast(mask, inputs.dtype)[..., tf.newaxis]
    summed = tf.reduce_sum(inputs * weights, axis=1)
    counts = tf.maximum(tf.reduce_sum(weights, axis=1), 1.0)  # avoid 0-division
    return summed / counts

class MLP_Sentence_Embedding(keras.layers.Layer):
  """
  Sentence embedding computed by an MLP over the flattened token embeddings.

  Input: batched token embeddings, (batch, max_tokens, embedding_size).
  The tokens of each sentence are flattened into one vector, passed through
  `n_layers - 1` intermediate Dense layers and a final Dense layer with
  `embedding_size` units.
  Output: (batch, embedding_size).
  """

  def __init__(self, dense_units, intermediate_dense_activation, last_dense_activation, n_layers, layer_name):
    """
    Parameters
    ----------
    dense_units : int
        Units of each intermediate Dense layer.
    intermediate_dense_activation : str
        Activation of the intermediate Dense layers.
    last_dense_activation : str
        Activation of the last Dense layer.
    n_layers : int
        Total number of Dense layers (intermediate ones plus the last one).
    layer_name : str
        Name given to this layer.
    """
    # The layer name is read-only after construction, so it has to be
    # handed to the base class constructor.
    super(MLP_Sentence_Embedding,self).__init__(name=layer_name)

    self.n_layers = n_layers
    self.last_dense_activation = last_dense_activation

    # One distinct Dense layer per step: a single Dense layer cannot be
    # re-applied to its own output unless input and output sizes coincide.
    self.intermediate_dense_layers = [
        layers.Dense(units=dense_units,
                     activation=intermediate_dense_activation)
        for _ in range(n_layers-1)
    ]
    self.last_dense_layer = None
    self.flat_size = None

  def build(self, input_shape):
    max_tokens = input_shape[-2]
    embedding_size = input_shape[-1]
    if max_tokens is None or embedding_size is None:
      raise ValueError("MLP sentence embedding needs a fixed "
                       "(max_tokens, embedding_size) input shape.")

    self.flat_size = int(max_tokens) * int(embedding_size)
    # Created once here (its size depends on the input shape), so that the
    # weights are tracked and reused across calls.
    self.last_dense_layer = layers.Dense(units=int(embedding_size),
                                         activation=self.last_dense_activation)
    super(MLP_Sentence_Embedding, self).build(input_shape)

  def compute_mask(self, inputs, mask=None):
    # The token axis is flattened away, so no mask is propagated downstream.
    return None

  def call(self, inputs):
    # Keep the batch axis and flatten tokens x embedding into one vector.
    x = tf.reshape(inputs, [-1, self.flat_size])

    for dense_layer in self.intermediate_dense_layers:
      x = dense_layer(x)

    return self.last_dense_layer(x)

def sentence_embedding_layer(mode, rnn_units, rnn_model, dense_units, mlp_layers, layer_name):
  """
  Builds the sentence embedding layer selected by `mode`.

  Parameters
  ----------
  mode : str
      "RNN last", "RNN mean", "Bag of vectors" or "MLP".
  rnn_units : int
      Units of the RNN (RNN modes only).
  rnn_model : str
      RNN cell type (RNN modes only).
  dense_units : int
      Units of the intermediate Dense layers (MLP mode only).
  mlp_layers : int
      Number of Dense layers (MLP mode only).
  layer_name : str
      Name given to the created layer.

  Returns
  -------
  The created sentence embedding layer.

  Raises
  ------
  ValueError
      If `mode` is not one of the supported values.
  """
  if mode in ("RNN last", "RNN mean"):
    return RNN_Sentence_Embedding(rnn_units,layer_name,mode,rnn_model)
  if mode=="Bag of vectors":
    return Bag_Of_Vectors_Sentence_Embedding(layer_name)
  if mode=="MLP":
    # ReLU in the intermediate layers, tanh in the last one
    return MLP_Sentence_Embedding(dense_units,"relu","tanh",mlp_layers,layer_name)

  # Only exception instances/classes can be raised; a bare string cannot.
  raise ValueError("Invalid Mode.")

## Merge inputs

In [12]:
def concatenate(layer_name):
  """Creates a Keras `Concatenate` layer with the given name."""
  merging_layer = layers.Concatenate(name=layer_name)
  return merging_layer

def sum(layer_name):
  """
  Creates a Keras `Add` layer with the given name.

  Note: this module-level name shadows the built-in `sum`.
  """
  merging_layer = layers.Add(name=layer_name)
  return merging_layer

def mean(layer_name):
  """Creates a Keras `Average` layer with the given name."""
  merging_layer = layers.Average(name=layer_name)
  return merging_layer

def merge_layer(merge_mode, layer_name):
  """
  Builds the layer used to merge the claim and evidence sentence embeddings.

  Parameters
  ----------
  merge_mode : str
      "Concatenate", "Sum" or "Mean".
  layer_name : str
      Name given to the created layer.

  Returns
  -------
  The merge layer created by `concatenate`, `sum` or `mean`.

  Raises
  ------
  ValueError
      If `merge_mode` is not one of the supported values.
  """
  if merge_mode=="Concatenate":
    # Call the factory: returning the function itself would hand the caller
    # a plain function instead of a layer.
    return concatenate(layer_name)
  elif merge_mode=="Sum":
    return sum(layer_name)
  elif merge_mode=="Mean":
    return mean(layer_name)
  else:
    # Only exception instances/classes can be raised; a bare string cannot.
    raise ValueError("Invalid merge mode.")

## Classification

In [13]:
def dense_classification_layer(dense_units, activation_function, layer_name, last, n_classes=2):
  """
  Builds one Dense layer of the classification head.

  Parameters
  ----------
  dense_units : int
      Units of the layer (ignored when `last` is True).
  activation_function : str
      Activation of the layer (ignored when `last` is True).
  layer_name : str
      Name given to the layer.
  last : bool
      When True, the output layer is built: `n_classes` units with softmax.
  n_classes : int, optional
      Number of output classes, used only when `last` is True. Defaults to 2.
      A softmax needs one unit per class: with a single unit it always
      outputs 1.0.

  Returns
  -------
  layers.Dense
      The configured layer.
  """
  if last:
    return layers.Dense(units=n_classes,
                        activation="softmax",
                        name=layer_name)

  return layers.Dense(units=dense_units,
                      activation=activation_function,
                      name=layer_name)

#TODO: add other classification architectures

## Build model

In [None]:
# Building layers
# Layer names use underscores: names containing spaces are not valid
# TensorFlow scope names.
layer_embedded_tokens = token_embedding_layer(VOCABULARY_SIZE,
                                         EMBEDDING_SIZE,
                                         MAX_TOKENS,
                                         "token_embedding")

layer_embedded_sentences = sentence_embedding_layer(SENTENCE_EMBEDDING_MODE,
                                                           RNN_UNITS,
                                                           RNN_MODEL,
                                                           DENSE_UNITS,
                                                           MLP_LAYERS,
                                                           "sentences_embedding")

layer_merge = merge_layer(MERGE_MODE, "merge")

# One distinct Dense layer per intermediate step: re-applying a single Dense
# layer to its own output fails as soon as input and output sizes differ.
layers_classification = [
    dense_classification_layer(DENSE_UNITS,
                               "relu",
                               "intermediate_classification_{}".format(i),
                               False)
    for i in range(DENSE_CLASSIFICATION_LAYERS-1)
]
layer_output = dense_classification_layer(None,
                                          None,
                                          "output_classification",
                                          True)

# Building model
# The token and sentence embedding layers are shared by claims and evidences.
claims_tokens_embedded = layer_embedded_tokens(claims_input)
evidences_tokens_embedded = layer_embedded_tokens(evidences_input)

claims_sentences_embedded = layer_embedded_sentences(claims_tokens_embedded)
evidences_sentences_embedded = layer_embedded_sentences(evidences_tokens_embedded)

# Keras merge layers take a single list of tensors
classification_input = layer_merge([claims_sentences_embedded,
                                    evidences_sentences_embedded])

for layer_classification in layers_classification:
  classification_input = layer_classification(classification_input)

classification_output = layer_output(classification_input)

# Instantiate an end-to-end model
model = keras.Model(
    inputs=[claims_input, evidences_input],
    outputs=[classification_output]
)