In [None]:
import os
import re
import requests
import zipfile
import pandas as pd
from functools import reduce, partial

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize,WhitespaceTokenizer

# typing
from typing import List, Callable, Dict

# Data pipeline
The goal of this section is to convert initial textual input into a numerical format that is compatible with our models.

## Data Loading
Download the dataset and save it to file system.
The dataset consists of three CSV files: train, validation and test. Each file is loaded directly into its own dataframe.

In [None]:
def save_response_content(response, destination):
    """
    Write the body of a streamed HTTP response to a file.

    Parameters
    ----------
    response :
        Object exposing ``iter_content(chunk_size)`` (e.g. a ``requests``
        response obtained with ``stream=True``).
    destination : str
        Path of the output file; it is opened in binary write mode and
        overwritten if it already exists.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # Falsy chunks (keep-alive placeholders) are dropped before writing.
        for block in filter(None, response.iter_content(chunk_size)):
            out_file.write(block)

def download_data(data_path):
    """
    Download the FEVER data splits archive and extract it into ``data_path``.

    The archive is saved as ``<data_path>/fever_data.zip``. If that file
    already exists, nothing is downloaded or extracted.

    Parameters
    ----------
    data_path : str
        Folder that will contain the archive and the extracted files.
        It is created when missing.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    toy_data_path = os.path.join(data_path, 'fever_data.zip')
    toy_data_url_id = "1wArZhF9_SHW17WKNGeLmX-QTYw9Zscl1"
    toy_url = "https://docs.google.com/uc?export=download"

    os.makedirs(data_path, exist_ok=True)

    if not os.path.exists(toy_data_path):
        print("Downloading FEVER data splits...")
        with requests.Session() as current_session:
            response = current_session.get(toy_url,
                                           params={'id': toy_data_url_id},
                                           stream=True)
            # Fail early instead of saving an HTML error page as the archive
            response.raise_for_status()
            # The body is streamed lazily, so it must be consumed while the
            # session (and its connection) is still open.
            save_response_content(response, toy_data_path)
        print("Download completed!")

        print("Extracting dataset...")
        with zipfile.ZipFile(toy_data_path) as loaded_zip:
            loaded_zip.extractall(data_path)
        print("Extraction completed!")

# Fetch and unpack the data into the local 'dataset' folder
# (download_data does nothing when the archive is already present)
download_data('dataset')

Load the files into different data frames.

In [None]:
# Be sure all columns have a name
column_names = ['Row', 'Claim', 'Evidence', 'ID', 'Label']

# Load the datasets.
# header=0 combined with names= makes pandas discard the header row found in
# each file and use column_names instead.
df_train = pd.read_csv ('dataset/train_pairs.csv', names=column_names, header=0)
df_val = pd.read_csv ('dataset/val_pairs.csv', names=column_names, header=0)
df_test = pd.read_csv ('dataset/test_pairs.csv', names=column_names, header=0)


In [None]:
# Preview the first rows of the training split
df_train.head()

## Data Pre-processing
Perform text cleaning and tokenization operations.
Start by creating some utility methods that will be used for cleaning the text.

In [None]:
# Special characters to replace with a space: /(){}[]|@,;
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')

# Accepted symbols (everything else is removed):
# - numbers between 0-9
# - all lower cased letters
# - whitespace, #, + _
GOOD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# - ^ beginning of a string
# - \d any digit
# - \s whitespaces and tabs
BEGINNING_IDS_RE = re.compile(r'^\d*\s*')

# Runs of two or more whitespace characters (spaces, tabs, newlines).
# Python patterns take no /.../g delimiters: written JavaScript-style, the
# expression would look for a literal '/' and '/g' and never match.
EXTRA_WHITE_SPACE_RE = re.compile(r'\s\s+')

# Bracket placeholder tokens to strip from the text (each one is delimited
# by a dash on both sides)
TAGS_TO_REMOVE = ['-LCB-', '-RCB-', '-LSB-', '-RSB-', '-RRB-', '-LRB-']


# Stopwords are very common words that carry little useful information for
# most text analysis procedures. The set is prepared here for
# remove_stopwords; the corpus is downloaded only if it is not available yet.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOPWORDS = set(stopwords.words('english'))


# NLTK resources requested on every run of this cell
nltk.download('punkt') # necessary for being able to tokenize
nltk.download('wordnet') 
nltk.download('averaged_perceptron_tagger')

# Shared instances: the lemmatizer is used by lemsent.
# NOTE(review): within this part of the notebook `tokenizer` is referenced
# only by commented-out code - confirm whether it is still needed.
lemmatizer = WordNetLemmatizer()
tokenizer = WhitespaceTokenizer()

from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """
    Return the WordNet part-of-speech constant to use when lemmatizing a word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is considered: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV

    # "N" and every unmapped tag are treated as nouns
    return wordnet.NOUN

def replace_special_characters(text: str) -> str:
    """
    Replaces special characters, such as parentheses,
    with a space character (see REPLACE_BY_SPACE_RE).
    """

    return REPLACE_BY_SPACE_RE.sub(' ', text)

def lower(text: str) -> str:
    """
    Transforms given text to lower case.
    Example:
    Input: 'I really like New York city'
    Output: 'i really like new york city'
    """

    return text.lower()

def filter_out_uncommon_symbols(text: str) -> str:
    """
    Deletes every character matched by GOOD_SYMBOLS_RE, i.e. every
    character that does not belong to the accepted symbols defined by
    that regular expression.
    """
    cleaned = GOOD_SYMBOLS_RE.sub('', text)
    return cleaned

def remove_stopwords(text: str) -> str:
    """
    Drop the words contained in STOPWORDS from a text.

    Parameters
    ----------
    text : str
        The text to process; it is split on whitespace.

    Returns
    -------
    text : str
        The remaining words joined by single spaces.
    """
    kept_words = []
    for word in text.split():
        if word and word not in STOPWORDS:
            kept_words.append(word)

    return ' '.join(kept_words)

def strip_text(text: str) -> str:
    """
    Trims leading and trailing whitespace (newlines and carriage returns
    included) from the text.
    Example:
    Input: '  This assignment is cool\n'
    Output: 'This assignment is cool'
    """
    trimmed = text.strip()
    return trimmed

def replace_ids(text: str) -> str:
    """
    Strip what BEGINNING_IDS_RE matches at the start of the text: a leading
    run of digits (the id) and the whitespace that follows it.

    Parameters
    ----------
    text : str
        The text to process

    Returns
    -------
    text : str
        The text without the leading id.
    """
    without_id = BEGINNING_IDS_RE.sub('', text)
    return without_id

def lemsent(sentence):
    """
    Lemmatize every token of a sentence.

    The sentence is split with ``nltk.word_tokenize``; each token is
    lemmatized using the part of speech returned by ``get_wordnet_pos``.

    Parameters
    ----------
    sentence : str
        The text to process.

    Returns
    -------
    text : str
        The lemmas joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(sentence):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))

    return " ".join(lemmas)

def remove_pos_tags(text: str) -> str:
  """
  Remove every occurrence of the entries of TAGS_TO_REMOVE from the text.

  Each entry is handed to ``re.sub`` as a pattern, one after the other,
  and its matches are replaced by the empty string.
  """
  return reduce(lambda cleaned, tag: re.sub(tag, '', cleaned),
                TAGS_TO_REMOVE,
                text)

def remove_wikipedia_tags(text: str) -> str:
  """
  Keep only the part of the text that precedes the first '.' followed by
  a tab; the text is returned unchanged when that separator is absent.
  """
  head, _separator, _tail = text.partition('.\t')
  return head

# Cleaning steps applied, in this order, to claims (and, after the id
# removal below, to evidences). Stop-word removal is deliberately disabled.
GENERIC_PREPROCESSING_PIPELINE = [
    remove_wikipedia_tags,
    remove_pos_tags,
    lower,
    replace_special_characters,
    filter_out_uncommon_symbols,
    #remove_stopwords,
    strip_text,
    lemsent,
]

# Evidences first get their leading id removed, then go through the generic
# steps. A NEW list is built on purpose: assigning the generic list and
# calling insert() on it would modify the generic pipeline as well, so claims
# would also lose any leading digits (and re-running the insert would stack
# replace_ids multiple times).
EVIDENCES_PREPROCESSING_PIPELINE = [replace_ids] + GENERIC_PREPROCESSING_PIPELINE
    

def text_prepare(text: str,
                 filter_methods: List[Callable[[str], str]] = None) -> str:
    """
    Runs the text through a sequence of pre-processing functions, feeding
    the output of each function to the next one (so the order matters).

    When ``filter_methods`` is None, GENERIC_PREPROCESSING_PIPELINE is used.
    """
    if filter_methods is None:
        filter_methods = GENERIC_PREPROCESSING_PIPELINE

    processed = text
    for step in filter_methods:
        processed = step(processed)

    return processed

Now we are ready to pre-process the dataset.

In [None]:
def preprocess_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the 'Evidence' and 'Claim' columns with their pre-processed text.

    Evidences go through EVIDENCES_PREPROCESSING_PIPELINE, claims through the
    default pipeline of ``text_prepare``. The columns are overwritten on the
    dataframe that is passed in, which is also the returned object.
    """
    def clean_evidence(txt):
        return text_prepare(txt, filter_methods=EVIDENCES_PREPROCESSING_PIPELINE)

    def clean_claim(txt):
        return text_prepare(txt)

    df['Evidence'] = df['Evidence'].apply(clean_evidence)
    df['Claim'] = df['Claim'].apply(clean_claim)

    return df

# Clean the text columns of every split.
# NOTE: preprocess_dataset overwrites the columns of the frame it receives,
# so re-running this cell pre-processes already cleaned text again.
df_train = preprocess_dataset(df_train)
df_val = preprocess_dataset(df_val)
df_test = preprocess_dataset(df_test)

In [None]:
# TODO: document that this class-balance bias was tested but is not used
# Number of training pairs per label
label_count = df_train['Label'].value_counts()
pos = label_count['SUPPORTS']
neg = label_count['REFUTES']
print(pos, neg)

# Natural log of the SUPPORTS/REFUTES ratio
initial_bias = np.log([pos/neg])
print(initial_bias)

## Tokenization and vocabularies
Here the texts of all the sets are tokenized against a single shared vocabulary, which also determines the vocabulary size. Furthermore, the maximum length of a token sequence is defined and the labels are extracted from each set.

In [None]:
STARTING_TOKEN = 1 # First integer id assigned to a word (0 is already used as padding value)
QUANTILE = 0.99 # Quantile of the per-text token counts used as maximum sequence length

def get_tokenizer(corpus: List[str],
                  starting_dict=None)->Dict[str,int]:
  '''
  Build a word -> integer dictionary from a corpus, optionally extending
  an existing one.

  Words are obtained by splitting each text on whitespace. Every unseen word
  receives the id ``len(dictionary) + STARTING_TOKEN`` at insertion time.

  Parameters:
  -----------
  corpus: List[str]
    Texts scanned for new words.
  starting_dict: Dict[str,int]
    Existing dictionary to extend (optional). It is copied, never modified.

  Returns:
  --------
  words_to_tokens: Dict[str,int]
    The new dictionary, or the extended copy of starting_dict.
  '''

  # Work on a copy so the caller's dictionary is left untouched
  words_to_tokens = starting_dict.copy() if starting_dict is not None else {}

  for text in corpus:
    for word in text.split():
      # setdefault only stores the id when the word is not known yet
      words_to_tokens.setdefault(word, len(words_to_tokens)+STARTING_TOKEN)

  return words_to_tokens

def tokenize(word: str,
             words_to_tokens: Dict[str,int])->int:
  '''
  Look up the integer id of a word.

  Parameters:
  -----------
  word: str
    Word to convert
  words_to_tokens: Dict[str,int]
    Tokenization dictionary

  Returns:
  -------
  int:
    Id stored for the word. A KeyError is raised for unknown words.
  '''
  token_id = words_to_tokens[word]
  return token_id

def detokenize(token:int,
               words_to_tokens: Dict[str,int])->str:
  '''
  Reverse lookup: find the word associated to an integer id.

  Parameters:
  -----------
  token: int
    Id of the word
  words_to_tokens: Dict[str,int]
    Tokenization dictionary

  Returns:
  -------
  str:
    First word (in dictionary order) whose id equals the token.
    A ValueError is raised when no word has that id.
  '''
  vocabulary = list(words_to_tokens.items())
  ids = [token_id for _word, token_id in vocabulary]

  # list.index raises ValueError when the id is unknown
  word, _token_id = vocabulary[ids.index(token)]

  return word

def tokenize_string(string: str,
                    words_to_tokens: Dict[str,int],
                    max_length: int)->List[int]:

  '''
  Get the tokenized sequence of a string of separated tokens (document/sentence).

  Parameters:
  -----------
  string: str
    String of whitespace separated tokens (document or sentence)
  words_to_tokens: Dict[str,int]
    Tokenization dictionary
  max_length: int
    Length of the returned sequence

  Returns:
  --------
  List[int]:
    The ids of the tokens of the input string, in order.
    The list is right-padded with 0 if it is shorter than max_length.
    The list is truncated if it is longer than max_length.

  Raises:
  -------
  KeyError:
    If a token is missing from the tokenization dictionary.
  '''

  tokenized_sequence = [tokenize(token, words_to_tokens) for token in string.split()]

  # Truncate first (no-op for short sequences), then pad up to max_length
  tokenized_sequence = tokenized_sequence[:max_length]
  missing = max_length - len(tokenized_sequence)

  # Plain list padding keeps the ids as Python ints even for an empty input;
  # padding an empty list through numpy yields float zeros, which would turn
  # the whole stacked dataset into a float array.
  return tokenized_sequence + [0] * missing

def label_to_binary(label):
  '''
  Convert a textual label into its binary class.

  Parameters:
  -----------
  label: str
    Either "SUPPORTS" or "REFUTES"

  Returns:
  --------
  int:
    1 for "SUPPORTS", 0 for "REFUTES"

  Raises:
  -------
  ValueError:
    If the label is neither of the two accepted values.
  '''
  if label=="SUPPORTS":
    return 1
  elif label=="REFUTES":
    return 0
  else:
    # Raising a bare string is itself a TypeError in Python 3
    raise ValueError(f"Invalid label: {label!r}. Expected 'SUPPORTS' or 'REFUTES'.")

#Texts of every split as plain lists
train_text_claim, train_text_evidence = df_train["Claim"].tolist(), df_train["Evidence"].tolist()
val_text_claim, val_text_evidence = df_val["Claim"].tolist(), df_val["Evidence"].tolist()
test_text_claim, test_text_evidence = df_test["Claim"].tolist(), df_test["Evidence"].tolist()

#Textual labels of every split
train_labels, val_labels, test_labels = (
    df_train["Label"].tolist(),
    df_val["Label"].tolist(),
    df_test["Label"].tolist(),
)

#Token dictionary built over the texts of all the splits
corpus = [
    text
    for texts in (train_text_claim, train_text_evidence,
                  val_text_claim, val_text_evidence,
                  test_text_claim, test_text_evidence)
    for text in texts
]
tokens_dictionary = get_tokenizer(corpus)

#Vocabulary
tokens_vocabulary = tokens_dictionary.keys()

#Vocab size (the extra STARTING_TOKEN accounts for the padding value)
vocabulary_size = len(tokens_vocabulary)+STARTING_TOKEN

#Max length of a token sequence: QUANTILE of the number of tokens per text
n_tokens = list(map(lambda doc: len(doc.split()), corpus))
max_length = int(np.quantile(n_tokens,QUANTILE))

def _encode_texts(texts):
  '''Tokenize each text to a fixed-length id sequence and stack them in an array.'''
  return np.array([tokenize_string(text, tokens_dictionary, max_length) for text in texts])

#Tokenized sets
train_claims_tokenized = _encode_texts(train_text_claim)
train_evidences_tokenized = _encode_texts(train_text_evidence)

val_claims_tokenized = _encode_texts(val_text_claim)
val_evidences_tokenized = _encode_texts(val_text_evidence)

test_claims_tokenized = _encode_texts(test_text_claim)
test_evidences_tokenized = _encode_texts(test_text_evidence)

#Binary labels
train_labels_tokenized = np.array([label_to_binary(label) for label in train_labels])
val_labels_tokenized = np.array([label_to_binary(label) for label in val_labels])
test_labels_tokenized = np.array([label_to_binary(label) for label in test_labels])

# Models

## Constants and utilities

In [None]:
#Sample values
BATCH_SIZE = 32
EMBEDDING_SIZE = 64
EPOCHS = 50

DENSE_UNITS = 64
MLP_LAYERS = 2 #Number of dense layers of the MLP sentence embedding (a dropout layer follows each of them when dropout is set)
DENSE_CLASSIFICATION_LAYERS = 3 #Number of dense layers of the classifier, final sigmoid layer included (dropout layers are added in between when dropout is set)

L2_RATE = 0.01
DROPOUT_RATE = 0.4
LEARNING_RATE = 1e-3

# NOTE(review): in the code visible here TOKEN_EMBEDDING_MODE is passed down to
# build_layers but never read there - confirm whether GloVe modes are handled elsewhere.
TOKEN_EMBEDDING_MODE = "Simple" #This must be one between "Simple", "GloVe static" and "GloVe dynamic"
SENTENCE_EMBEDDING_MODE = "RNN mean" #This must be one between "RNN last", "RNN mean", "Bag of vectors", "MLP"
RNN_MODEL = "LSTM" #This must be one between "GRU" and "LSTM"
MERGE_MODE = "Concatenate" #This must be one between "Concatenate", "Sum" and "Mean"
APPLY_COSINE_SIMILARITY = True #Simple extension task
CLAIM_VERIFICATION_EVALUATION = True #Evaluates using the claim verification evaluation method

# Compilation settings shared by the models built in this notebook
OPTIMIZER = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
LOSS = tf.keras.losses.BinaryCrossentropy()
METRICS = [tf.keras.metrics.BinaryAccuracy()]

# Model common training information (keyword arguments forwarded to model.fit)
training_info = {
    'verbose': 1,
    'epochs': EPOCHS,
    'batch_size': BATCH_SIZE,
    # Stop when val_loss has not improved for 10 epochs and restore the best weights
    'callbacks': [keras.callbacks.EarlyStopping(monitor='val_loss', 
                                                patience=10,
                                                restore_best_weights=True)]
}

# Model common prediction information (keyword arguments forwarded to model.predict)
prediction_info = {
    'batch_size': BATCH_SIZE,
    'verbose': 1
}

## Inputs

In [None]:
# One integer id sequence of length max_length per sample.
# The shape must be a tuple: (max_length) without the trailing comma is just
# an int, which not every Keras version accepts as a shape.
claims_input = keras.Input(shape=(max_length,), name="claims")
evidences_input = keras.Input(shape=(max_length,), name="evidences")

## Token embedding

In [None]:
def token_embedding_layer(vocab_size,
                    embedding_size,
                    max_length,
                    layer_name,
                    pre_trained_weights=None,
                    train=True):
  '''
  Create the Embedding layer that maps token ids to vectors.

  Parameters:
  -----------
  vocab_size:
    Passed as input_dim of the Embedding layer
  embedding_size:
    Passed as output_dim of the Embedding layer
  max_length:
    Passed as input_length of the Embedding layer
  layer_name:
    Name of the layer
  pre_trained_weights:
    Optional embedding matrix used as initial weights
  train:
    Whether the embedding weights are trainable

  Returns:
  --------
  The configured Embedding layer (id 0 is masked, mask_zero=True).
  '''
  embedding_kwargs = dict(
      input_dim=vocab_size,
      output_dim=embedding_size,
      input_length=max_length,
      mask_zero=True,
      trainable=train,
      name=layer_name,
  )

  # The weights argument is only supplied when a matrix has been given
  if pre_trained_weights is not None:
    embedding_kwargs["weights"] = [pre_trained_weights]

  return layers.Embedding(**embedding_kwargs)

## Sentence embedding

In [None]:
class RNN_Sentence_Embedding(keras.layers.Layer):
  '''
  Sentence embedding computed by a bidirectional RNN over token embeddings.

  The forward and backward directions are averaged (merge_mode="ave").
  Depending on ``mode`` the sentence vector is:
  - "RNN last": the average of the final hidden states of the two directions;
  - "RNN mean": the masked average of the outputs of all the time steps.

  Parameters:
  -----------
  rnn_size:
    Number of units of the recurrent layer
  layer_name:
    Name of the layer
  mode:
    "RNN last" or "RNN mean"
  rnn_model:
    "GRU" or "LSTM"
  dropout, l2:
    Currently ignored, see the TODO in __init__

  Raises:
  -------
  ValueError:
    If rnn_model or mode is not one of the accepted values.
  '''
  def __init__(self, rnn_size, layer_name, mode, rnn_model,dropout,l2):
    super(RNN_Sentence_Embedding, self).__init__(name=layer_name)

    self.mode = mode

    # TODO: TEMP! The received dropout and l2 values are discarded here, so
    # the recurrent layer is always built without dropout and regularization.
    dropout = None
    l2 = None

    dropout_value = 0 if dropout is None else dropout

    # NOTE: the value is passed to the layer as `dropout` (see rnn_params),
    # not as `recurrent_dropout`
    print(f'recurrent_dropout: {dropout_value}')

    rnn_params= {"units":rnn_size,
                 "return_sequences":True,
                 "return_state":True,
                 "activation":"tanh",
                 "kernel_regularizer": l2,
                 "dropout": dropout_value
                 }

    if rnn_model=="GRU":
      layer = layers.GRU(**rnn_params)
    elif rnn_model=="LSTM":
      layer = layers.LSTM(**rnn_params)
    else:
      # Raising a bare string is itself a TypeError in Python 3
      raise ValueError("Invalid RNN model. Use 'GRU' or 'LSTM'")

    self.rnn = layers.Bidirectional(layer,merge_mode="ave")
    print(f'Adding bidirectional rnn with {rnn_size} units')

    if self.mode=="RNN last":
      self.average_layer = layers.Average()
      print(f'Adding average layer')
    elif self.mode=="RNN mean":
      self.average_layer = layers.GlobalAveragePooling1D()
      print(f'Adding global average pooling layer')
    else:
      raise ValueError("Invalid Mode. Use 'RNN last' or 'RNN mean'")

  def compute_mask(self, inputs, mask=None):
    # No mask is reported for the output of this layer
    return None

  def call(self, inputs, mask=None):

    # With return_state=True the bidirectional wrapper returns the sequence
    # output followed by the forward states and then the backward states:
    # 2 + 2 states for LSTM (h, c per direction), 1 + 1 for GRU (h only).
    rnn_outputs = self.rnn(inputs,mask=mask)
    whole_seq_output = rnn_outputs[0]
    states = rnn_outputs[1:]

    if self.mode=="RNN last":
      # The hidden state is the first state of each direction; indexing by
      # half the state count works for both GRU and LSTM.
      forward_h = states[0]
      backward_h = states[len(states)//2]
      return self.average_layer([forward_h, backward_h])
    elif self.mode=="RNN mean":
      return self.average_layer(whole_seq_output,mask=mask)

class Bag_Of_Vectors_Sentence_Embedding(keras.layers.Layer):
  '''
  Sentence embedding obtained by averaging the token embeddings of a
  sentence with a GlobalAveragePooling1D layer; the incoming mask is
  forwarded to the pooling layer.
  '''
  def __init__(self, layer_name):
    super().__init__(name=layer_name)

    self.average_pooling_layer = layers.GlobalAveragePooling1D()
    print(f'Adding global average pooling layer')

  def compute_mask(self, inputs, mask=None):
    # No mask is reported for the output of this layer
    return None

  def call(self, inputs, mask=None):
    pooled = self.average_pooling_layer(inputs,mask=mask)
    return pooled

class MLP_Sentence_Embedding(keras.layers.Layer):
  '''
  Sentence embedding computed by an MLP over the flattened token embeddings.

  The (max_tokens, embedding_size) input of each sample is reshaped to a
  single vector and passed through ``n_layers`` tanh dense layers; when
  ``dropout`` is given, a dropout layer follows every dense layer.

  Parameters:
  -----------
  max_dense_units:
    Number of units of the first dense layer (halved at each following layer)
  n_layers:
    Number of dense layers
  layer_name:
    Name of the layer
  max_tokens:
    Number of tokens per sentence
  embedding_size:
    Size of a token embedding; also the size of the produced sentence embedding
  dropout:
    Dropout rate, or None for no dropout layers
  l2:
    Kernel regularizer of the dense layers, or None
  '''
  def __init__(self, 
               max_dense_units,
               n_layers, 
               layer_name, 
               max_tokens, 
               embedding_size, 
               dropout=None,
               l2=None):

    super(MLP_Sentence_Embedding,self).__init__(name=layer_name)

    self.layer_list = []

    # Flatten each sentence into one vector of max_tokens*embedding_size values
    reshape_layer = layers.Reshape((max_tokens*embedding_size,))
    self.layer_list.append(reshape_layer)

    for i in range(n_layers):
      n_units = self.number_of_units(i, n_layers, max_dense_units, embedding_size)
      dense_layer = layers.Dense(
          units=n_units,
          activation='tanh',
          kernel_regularizer=l2
          )
      self.layer_list.append(dense_layer)
      print(f'Adding dense layer with {n_units} units')

      if dropout is not None:
        dropout_layer = layers.Dropout(dropout)
        self.layer_list.append(dropout_layer)
        print(f'Adding dropout layer')

  def compute_mask(self, inputs, mask=None):
    # No mask is reported for the output of this layer
    return None

  def number_of_units(self, 
                      layer_number: int, 
                      max_layer_number: int,
                      max_dense_units: int,
                      embedding_size: int):
    '''
    Number of units of the dense layer at position ``layer_number`` (0-based).

    The last layer always has ``embedding_size`` units. The other layers have
    ``max_dense_units`` halved ``layer_number`` times, never going below
    ``embedding_size``.
    '''
    if layer_number == max_layer_number - 1:
      return embedding_size

    # Floor division keeps the unit count an integer (true division would
    # hand a float to the Dense layer)
    return max(max_dense_units // 2**layer_number, embedding_size)

  def call(self, x, mask=None):

    for layer in self.layer_list:
      x = layer(x)

    return x

def sentence_embedding_layer(mode, 
                             rnn_model, 
                             dense_units, 
                             mlp_layers, 
                             layer_name, 
                             max_tokens, 
                             embedding_size, 
                             dropout, 
                             l2):
  '''
  Create the sentence embedding layer selected by ``mode``.

  - "RNN last" / "RNN mean": RNN_Sentence_Embedding
  - "Bag of vectors": Bag_Of_Vectors_Sentence_Embedding
  - "MLP": MLP_Sentence_Embedding

  Any other mode raises an Exception.
  '''
  print('Sentence Embedding layers creation started')

  if mode in ("RNN last", "RNN mean"):
    layer = RNN_Sentence_Embedding(embedding_size,
                                   layer_name,
                                   mode,
                                   rnn_model,
                                   dropout,
                                   l2)
  elif mode=="Bag of vectors":
    layer = Bag_Of_Vectors_Sentence_Embedding(layer_name)
  elif mode=="MLP":
    layer = MLP_Sentence_Embedding(dense_units,
                                   mlp_layers,
                                   layer_name,
                                   max_tokens,
                                   embedding_size,
                                   dropout,
                                   l2)
  else:
    raise Exception("Invalid Mode.")

  print('Sentence Embedding layers creation completed')

  return layer

## Merge inputs

In [None]:
def concatenate(layer_name):
  '''Create a Concatenate layer with the given name.'''
  merged = layers.Concatenate(name=layer_name)
  return merged

# NOTE(review): this module-level name shadows Python's builtin sum() for all
# the code executed after this cell.
def sum(layer_name):
  '''Create an Add layer with the given name.'''
  merged = layers.Add(name=layer_name)
  return merged

def mean(layer_name):
  '''Create an Average layer with the given name.'''
  merged = layers.Average(name=layer_name)
  return merged

def merge_layer(merge_mode, layer_name):
  '''
  Create the layer that merges the claim and evidence sentence embeddings.

  merge_mode must be "Concatenate", "Sum" or "Mean"; any other value raises
  an Exception.
  '''
  builders = {
      "Concatenate": concatenate,
      "Sum": sum,
      "Mean": mean,
  }

  if merge_mode not in builders:
    raise Exception("Invalid merge mode.")

  return builders[merge_mode](layer_name)

## Classification

In [None]:
def dense_classification_layer(dense_units, activation_function, layer_name, last, l2=None):
  '''
  Create one dense layer of the classifier.

  When ``last`` is true the layer is the output layer: a single unit with
  sigmoid activation (dense_units, activation_function and l2 are ignored).
  Otherwise it is a dense layer with the requested units, activation and
  kernel regularizer.
  '''
  if last:
    return layers.Dense(1,
                        activation="sigmoid",
                        name=layer_name)

  return layers.Dense(units=dense_units,
                      activation=activation_function,
                      kernel_regularizer=l2,
                      name=layer_name)

## Build model

### Building layers

In [None]:
def build_layers(vocab_size,
                 embedding_size,
                 max_tokens,
                 token_embedding_mode,
                 sentence_embedding_mode,
                 dense_units,
                 mlp_layers,
                 dense_classification_layers,
                 rnn_model,
                 dropout=None,
                 l2=None):
  '''
  Create all the layers of the model without connecting them.

  Parameters:
  -----------
  vocab_size, embedding_size, max_tokens:
    Forwarded to token_embedding_layer
  token_embedding_mode:
    Currently not used by this function
  sentence_embedding_mode, rnn_model, mlp_layers:
    Forwarded to sentence_embedding_layer
  dense_units:
    Units of the first classification layer; halved at each following one
  dense_classification_layers:
    Number of dense classification layers, final sigmoid layer included
  dropout:
    Dropout rate, or None to add no dropout layers
  l2:
    Kernel regularizer, or None

  The merge layer is selected by the global MERGE_MODE.

  Returns:
  --------
  Tuple (token embedding layer, sentence embedding layer, merge layer,
  list of classification layers in application order).
  '''

  layer_embedded_tokens = token_embedding_layer(vocab_size,
                                                embedding_size,
                                                max_tokens,
                                                "token_embedding")

  layer_embedded_sentences = sentence_embedding_layer(sentence_embedding_mode,
                                                      rnn_model,
                                                      dense_units,
                                                      mlp_layers,
                                                      "sentences_embedding",
                                                      max_tokens,
                                                      embedding_size,
                                                      dropout,
                                                      l2)

  layer_merge = merge_layer(MERGE_MODE,"merge")

  classification_layers = []

  for i in range(dense_classification_layers):
    # Floor division keeps the unit count an integer (true division would
    # hand a float to the Dense layer); at least one unit is always kept.
    units = max(1, dense_units // 2**i)
    is_last_layer = i == (dense_classification_layers - 1)

    if is_last_layer:
      layer_name = "final_classification"
    else:
      layer_name = "intermediate_classification_"+str(i+1)

    classification_layers.append(
        dense_classification_layer(units,
                                   "tanh",
                                   layer_name,
                                   is_last_layer,
                                   l2)
        )

    # Dropout follows every intermediate layer, never the output layer
    if dropout is not None and not is_last_layer:
      classification_layers.append(layers.Dropout(dropout))

  return (layer_embedded_tokens, layer_embedded_sentences, layer_merge, classification_layers)

### End-to-end model

In [None]:
def build_model(built_layers,
                claims_input,
                evidences_input,
                dense_classification_layers):
  '''
  Connect the layers into an end-to-end Keras model.

  Claims and evidences share the same token embedding and sentence embedding
  layers. The two sentence embeddings are merged and, when the global
  APPLY_COSINE_SIMILARITY is true, their cosine similarity is concatenated to
  the merged vector before the classification layers.

  Parameters:
  -----------
  built_layers:
    Tuple returned by build_layers
  claims_input, evidences_input:
    Keras inputs of the model
  dense_classification_layers:
    Currently not used by this function

  Returns:
  --------
  The (not compiled) keras.Model with inputs [claims, evidences].
  '''

  layer_embedded_tokens, layer_embedded_sentences, layer_merge, classification_layers = built_layers

  claims_tokens_embedded = layer_embedded_tokens(claims_input)
  evidences_tokens_embedded = layer_embedded_tokens(evidences_input)

  claims_sentences_embedded = layer_embedded_sentences(claims_tokens_embedded)
  evidences_sentences_embedded = layer_embedded_sentences(evidences_tokens_embedded)

  classification_input = layer_merge([claims_sentences_embedded,evidences_sentences_embedded])

  # Default: classify the merged vector as is. Without this assignment the
  # variable would be undefined when the cosine similarity step is disabled.
  classification_output = classification_input

  #Cosine similarity extension step:
  if APPLY_COSINE_SIMILARITY:
    layer_cosine_similarity = layers.Dot(axes=(1), normalize=True,name="cosine_similarity") #Normalize=True will compute the cosine similarity (see documentation)
    layer_concatenation = layers.Concatenate(name="cosine_similarity_concat")
    cosine_similarity = layer_cosine_similarity([claims_sentences_embedded,evidences_sentences_embedded])
    classification_output = layer_concatenation([classification_input,cosine_similarity])

  for layer in classification_layers:
    classification_output = layer(classification_output)

  # Instantiate an end-to-end model
  model = keras.Model(
      inputs=[claims_input, evidences_input],
      outputs=[classification_output]
  )

  return model

### Build and compile model

In [None]:
def build_and_compile(vocab_size,
                      embedding_size,
                      max_tokens,
                      token_embedding_mode,
                      sentence_embedding_mode,
                      dense_units,
                      mlp_layers,
                      rnn_model,
                      claims_input,
                      evidences_input,
                      dense_classification_layers,
                      optimizer,
                      loss,
                      metrics,
                      dropout=None,
                      l2=None):
  '''
  Build the layers, connect them into a model and compile it.

  The architecture arguments are forwarded to build_layers, the inputs to
  build_model, and optimizer/loss/metrics to model.compile.

  Returns:
  --------
  The compiled keras.Model.
  '''

  built_layers = build_layers(vocab_size=vocab_size,
                              embedding_size=embedding_size,
                              max_tokens=max_tokens,
                              token_embedding_mode=token_embedding_mode,
                              sentence_embedding_mode=sentence_embedding_mode,
                              dense_units=dense_units,
                              mlp_layers=mlp_layers,
                              dense_classification_layers=dense_classification_layers,
                              rnn_model=rnn_model,
                              dropout=dropout,
                              l2=l2)

  model = build_model(built_layers=built_layers,
                      claims_input=claims_input,
                      evidences_input=evidences_input,
                      dense_classification_layers=dense_classification_layers)

  model.compile(optimizer=optimizer,
                loss=loss,
                metrics=metrics)

  return model

In [None]:
# Build and compile the model from the constants defined above.
# NOTE: for the RNN sentence embedding modes the dropout and l2 values only
# reach the classification layers, because RNN_Sentence_Embedding currently
# discards them.
model = build_and_compile(vocabulary_size,
                          EMBEDDING_SIZE,
                          max_length,
                          TOKEN_EMBEDDING_MODE,
                          SENTENCE_EMBEDDING_MODE,
                          DENSE_UNITS,
                          MLP_LAYERS,
                          RNN_MODEL,
                          claims_input,
                          evidences_input,
                          DENSE_CLASSIFICATION_LAYERS,
                          OPTIMIZER,
                          LOSS,
                          METRICS,
                          DROPOUT_RATE,# dropout: pass None to disable it
                          keras.regularizers.l2(L2_RATE))# l2: pass None to disable the regularization

#Show architecture (the diagram is also saved to the given png file)
keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True, expand_nested=True)

## Train model

In [None]:
def show_history(history: keras.callbacks.History):
    """
    Shows training history data stored by the History Keras callback.

    One figure is drawn per training metric; the matching ``val_`` series is
    added to the same figure when available. The best validation binary
    accuracy and its (0-based) epoch are printed when that metric has been
    recorded.

    :param history: History Keras callback
    """
    print(history.history)
    history_data = history.history
    print("Displaying the following history keys: ", history_data.keys())

    for key, value in history_data.items():
        if key.startswith('val'):
            continue

        fig, ax = plt.subplots(1, 1)
        ax.set_title(key)
        ax.plot(value, label='train')

        val_key = 'val_{}'.format(key)
        if val_key in history_data:
            ax.plot(history_data[val_key], label='val')
        else:
            print("Couldn't find validation values for metric: ", key)

        ax.set_ylabel(key)
        ax.set_xlabel('epoch')
        # Labels are attached to the lines, so the legend never advertises a
        # 'val' curve that has not been drawn
        ax.legend(loc='best')

    # The summary depends on a specific metric: skip it instead of failing
    # with a KeyError when the model was trained without it
    val_accuracies = history_data.get('val_binary_accuracy')
    if val_accuracies:
        best_val_epoch = np.argmax(val_accuracies)
        best_val_acc = val_accuracies[best_val_epoch]

        print(f'Best validation accuracy: {best_val_acc} obtained at epoch: {best_val_epoch}')
    else:
        print("No 'val_binary_accuracy' values found in the history: best epoch summary skipped")

    plt.show()


def train_model(model: keras.Model,
                claims_train: np.ndarray,
                evidences_train: np.ndarray,
                labels_train: np.ndarray,
                claims_val: np.ndarray,
                evidences_val: np.ndarray,
                labels_val: np.ndarray,
                training_info: dict) -> keras.Model:
    """
    Fit the model on the training pairs, validating on the validation pairs,
    then display the training history.

    :param model: compiled model whose inputs are named "claims" and "evidences"
    :param claims_train: tokenized training claims
    :param evidences_train: tokenized training evidences
    :param labels_train: training labels
    :param claims_val: tokenized validation claims
    :param evidences_val: tokenized validation evidences
    :param labels_val: validation labels
    :param training_info: extra keyword arguments forwarded to ``model.fit``
    :return: the same model instance after fitting
    """
    print("Start training! \nParameters: {}".format(training_info))

    train_inputs = {"claims": claims_train, "evidences": evidences_train}
    val_inputs = {"claims": claims_val, "evidences": evidences_val}

    history = model.fit(x=train_inputs,
                        y=labels_train,
                        validation_data=(val_inputs, labels_val),
                        **training_info)

    print("Training completed! Showing history...")

    show_history(history)

    return model

In [None]:
# Train on the training split, using the validation split for validation
# (the early stopping callback in training_info monitors val_loss)
model = train_model(model,
            train_claims_tokenized,
            train_evidences_tokenized,
            train_labels_tokenized,
            val_claims_tokenized,
            val_evidences_tokenized,
            val_labels_tokenized,
            training_info)

## Test model

In [None]:
def consecutive_claims(tokenized_claims_list):
  '''
  Count how many times each claim is repeated consecutively.

  Parameters:
  -----------
  tokenized_claims_list:
    Sequence of tokenized claims (e.g. a 2D array with one claim per row)

  Returns:
  --------
  List[int]:
    The length of each run of identical consecutive claims, in order.
    The lengths sum up to the number of claims; an empty input gives [].
  '''
  run_lengths = []
  current_claim = None

  for claim in tokenized_claims_list:
    if run_lengths and np.array_equal(current_claim, claim):
      # Same claim as the run in progress
      run_lengths[-1] += 1
    else:
      # A different claim starts a new run
      run_lengths.append(1)
      current_claim = claim

  return run_lengths

def majority_vote(votes):
  '''
  Return the most frequent value between 0 and 1 in an array of votes.
  Ties (including the case with no 0/1 votes at all) resolve to 0.
  '''
  zero_count = np.count_nonzero(votes == 0)
  one_count = np.count_nonzero(votes == 1)

  return 1 if one_count > zero_count else 0

def check_labels_consistency(labels):
  '''
  Tell whether all the labels are equal, i.e. whether their minimum
  coincides with their maximum.
  '''
  lowest, highest = np.min(labels), np.max(labels)
  return highest == lowest

def claim_verification_selection(claims_test, labels_test, test_predictions):
  '''
  Reduce pair-level predictions to one prediction per claim.

  Consecutive rows holding the same claim form a group. The label of a group
  is the label shared by its rows, its prediction is the majority vote of
  the predictions of its rows.

  Groups whose rows carry different labels are reported on standard output
  and left out of the result; otherwise they would be returned as a
  (prediction 0, label 0) pair and be scored as a correct prediction.

  Parameters:
  -----------
  claims_test:
    Tokenized claims, one row per claim/evidence pair
  labels_test:
    Binary label of each pair
  test_predictions:
    Binary prediction of each pair

  Returns:
  --------
  (reduced_predictions, reduced_labels):
    Two float arrays of the same length, one entry per consistent group.
  '''

  consecutive_claims_list = consecutive_claims(claims_test)

  reduced_predictions = []
  reduced_labels = []

  start=0
  for n_consecutives in consecutive_claims_list:
    end = start+n_consecutives

    labels = labels_test[start:end]
    if check_labels_consistency(labels):
      reduced_labels.append(labels[0])
      reduced_predictions.append(majority_vote(test_predictions[start:end]))
    else:
      print("Inconsistency found!")
      # NOTE(review): the report reads the notebook-level test texts and
      # labels, so it is only meaningful when the arguments come from the
      # test split in its original order.
      for j in range(start,end):
        print("Claim {}: {}".format(j-start+1,test_text_claim[j]))
        print("Evidence {}: {}".format(j-start+1,test_text_evidence[j]))
        print("Label {}: {}".format(j-start+1,test_labels[j]))
        print("\n")

    start = end

  return np.array(reduced_predictions, dtype=float), np.array(reduced_labels, dtype=float)

def round_to_label(prediction, threshold=0.65):
  '''
  Turn a probability into a binary label: 0 when the prediction is strictly
  below the threshold, 1 otherwise.
  '''
  return 0 if prediction<threshold else 1

def predict_data(model: keras.Model,
                 claims_test: np.ndarray,
                 evidences_test: np.ndarray,
                 prediction_info: dict) -> np.ndarray:
    """
    Run the model on claim/evidence pairs.

    :param model: model whose inputs are named "claims" and "evidences"
    :param claims_test: tokenized claims
    :param evidences_test: tokenized evidences
    :param prediction_info: extra keyword arguments forwarded to ``model.predict``
    :return: the output of ``model.predict``
    """
    print('Starting prediction: \n{}'.format(prediction_info))

    model_inputs = {"claims": claims_test, "evidences": evidences_test}
    return model.predict(x=model_inputs, **prediction_info)

def evaluate_predictions(predictions: np.ndarray,
                         y: np.ndarray,
                         metrics: List[Callable],
                         metric_names: List[str]):
    """
    Method that computes a set of metrics on the given predictions.

    Parameters
    -------
    predictions : np.ndarray
        Predicted labels, passed to each metric as y_pred
    y : np.ndarray
        True labels, passed to each metric as y_true
    metrics : List[Callable]
        Metric functions accepting the keyword arguments y_pred and y_true
    metric_names : List[str]
        Name of each metric, in the same order as metrics

    Returns
    -------
    dict
        Dictionary that maps each metric name to the computed value
    """
    assert len(metrics) == len(metric_names)

    print('Prediction evaluation started...')

    return {
        name: score_fn(y_pred=predictions, y_true=y)
        for score_fn, name in zip(metrics, metric_names)
    }

def test_model(model: keras.Model,
               claims_test: np.array,
               evidences_test: np.array,
               labels_test: np.array,
               prediction_info: dict,
               claim_verification_evaluation: bool):
    """
    Method that predicts the labels of the given data and scores them.

    Parameters
    -------
    model : keras.Model
        Model used for the prediction
    claims_test : np.array
        Claims fed to the model
    evidences_test : np.array
        Evidences fed to the model
    labels_test : np.array
        True label of each (claim, evidence) row
    prediction_info : dict
        Extra keyword arguments forwarded to model.predict
    claim_verification_evaluation : bool
        If True the metrics are computed on the per-claim predictions and
        labels returned by claim_verification_selection, otherwise on the
        row-level ones

    Returns
    -------
    metric_info : dict
        Macro, micro and weighted F1 scores
    test_predictions : np.ndarray
        Row-level predicted labels (0 or 1), never the per-claim ones
    """
    raw_scores = predict_data(model,
                              claims_test,
                              evidences_test,
                              prediction_info)

    # Flatten the model output into a 1-D array holding one score per row
    flat_scores = np.reshape(raw_scores, (raw_scores.size))

    # Threshold each score into a label (0 or 1)
    test_predictions = np.array([round_to_label(score) for score in flat_scores])

    # One F1 variant per averaging strategy, paired with its result key
    f1_variants = [
        ("macro_f1", "macro"),
        ("micro_f1", "micro"),
        ("weighted_f1", "weighted"),
    ]
    metrics = [partial(f1_score, average=average) for _, average in f1_variants]
    metric_names = [name for name, _ in f1_variants]

    # Pick what gets scored: per-claim majority votes or the raw rows
    if claim_verification_evaluation:
        scored_predictions, scored_labels = claim_verification_selection(
            claims_test, labels_test, test_predictions)
    else:
        scored_predictions, scored_labels = test_predictions, labels_test

    metric_info = evaluate_predictions(predictions=scored_predictions,
                                       y=scored_labels,
                                       metrics=metrics,
                                       metric_names=metric_names)

    return metric_info, test_predictions

In [None]:
# Evaluate `model` on the tokenized test data: `metric_info` holds the F1
# scores and `y_pred` the row-level predicted labels.
metric_info, y_pred = test_model(
    model=model,
    claims_test=test_claims_tokenized,
    evidences_test=test_evidences_tokenized,
    labels_test=test_labels_tokenized,
    prediction_info=prediction_info,
    claim_verification_evaluation=CLAIM_VERIFICATION_EVALUATION,
)

## Evaluate model

In [None]:
def show_f1_scores(metric_info):
    """
    Method that prints the macro, micro and weighted F1 scores.
    
    Parameters
    -------
    metric_info : dict
        Dictionary that contains the f1 scores under the keys
        "macro_f1", "micro_f1" and "weighted_f1"
    
    """
    print()
    print('F1 SCORES:')
    for average in ('macro', 'micro', 'weighted'):
        print(f'  {average}: {metric_info[average + "_f1"]}')
    print()

def show_classification_report(y_true, y_pred):
    """
    Method that builds the classification report and prints it.
    
    Parameters
    -------
    y_true : np.array
        Array of true labels
    y_pred : np.array
        Array of predicted labels
    """
    report = classification_report(y_true, y_pred)
    print(report)

def show_confusion_matrix(y_true, y_pred):
    """
    Method that draws the confusion matrix on a new 5x5 figure.
    
    Parameters
    -------
    y_true : np.array
        Array of true labels
    y_pred : np.array
        Array of predicted labels
    """
    _, axis = plt.subplots(figsize=(5,5))

    # Rendering options: matrix normalized with normalize='true', blue
    # colormap, two-decimal cell values and vertical x tick labels
    display_options = dict(
        normalize='true',
        cmap=plt.cm.Blues,
        values_format=".2f",
        xticks_rotation='vertical',
        ax=axis,
    )
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, **display_options)

In [None]:
# Print the F1 scores, then the classification report and the confusion
# matrix of the row-level predictions against the test labels.
show_f1_scores(metric_info=metric_info)
show_classification_report(y_true=test_labels_tokenized, y_pred=y_pred)
show_confusion_matrix(y_true=test_labels_tokenized, y_pred=y_pred)

# Discussion and Error Analysis
In this part there is a small summary of the tests we have done, and the explanation of some choices that we made.

Finally, there are some considerations about the models that we selected for the final testing, and a comparison with the results obtained on the validation set.

* how we pre-processed the data and why
* observe that the train set is unbalanced and contains some inconsistencies
* model creation and parameter tuning
* results discussion
* possible improvements


## Data pre-processing
We noticed that the original datasets were relatively dirty, and they required some cleaning before using them as inputs of the models.

First, we applied some standard cleaning operations, e.g., we removed unwanted punctuation, lowercased all the text, removed special characters, etc.
Second, we applied dataset-specific steps to remove clutter. In particular:
* we noticed that each **evidence** started with a number (most likely the ID of the evidence) followed by a tab character. We removed all of these occurrences from all the dataset evidences. Example:

 **Original Sentence**
~~~
14	The series finale aired August 28 , 2013 .
~~~
 **Processed Sentence**
~~~
The series finale aired August 28 , 2013 .
~~~
* almost all the evidences terminate with a series of tags that are not really relevant for the classification task, and we decided to remove them. Example:
 **Original Sentence**
~~~
5	It stars Ray Winstone in his first role in a costume drama .	Ray Winstone	Ray Winstone
~~~
 **Processed Sentence**
~~~
It stars Ray Winstone in his first role in a costume drama .
~~~

At the beginning, we also removed **stopwords** from the dataset.
After a lot of testing, we discovered that this was causing 2 major problems:
* **inconsistencies**: after the pre-processing, some claims had the **same description**, but **different labels**.
For example, in the original dataset we have the following claims:
~~~
SZA's music combines elements from different genres. (SUPPORT)
SZA's music does not combine elements from different genres. (REFUTES)
~~~
After the pre-processing, which also included stopword removal, the two claims became identical while keeping different labels:
~~~
szas music combine element different genre (SUPPORT)
szas music combine element different genre (REFUTES)
~~~
* **duplicates**: for the same reasons explained above, some rows were duplicated (e.g., same claim and same evidence).

Therefore, we eventually decided to drop stopword removal, and we observed a significant improvement in the validation accuracy during the training phase (~+5-10%).
