In [173]:
## Dataset selector: "large", "small", or the name of a custom dataset.
## The value is interpolated into the name of every data file loaded below.

## Any value other than "large" (custom names included) uses the MLP parameters of the small dataset.

file = "small"

In [None]:
import seaborn
import numpy as np
import pandas as pd
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score as precision, recall_score as recall
from sklearn.neural_network import MLPClassifier as MLP

In [None]:
# Class names; a label's position in this list is its integer id (see id2label / label2id).
labels = ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]

In [None]:
# Two-way lookup between integer class ids and class names; ids follow the order of `labels`.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
def load_sparse_csr(filename):
    """Load a CSR matrix from a NumPy ``.npz`` archive.

    Parameters
    ----------
    filename : str
        Path of the archive *without* the ``.npz`` extension; the extension
        is appended here.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the archive's ``data``, ``indices``, ``indptr``
        and ``shape`` entries.
    """
    # np.load hands back an NpzFile that keeps the underlying file open until
    # it is closed; the context manager releases it as soon as the arrays
    # have been read into memory.
    with np.load(filename + '.npz') as loader:
        return sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'],
        )

In [171]:
# Sparse feature matrices; load_sparse_csr appends the .npz extension itself.
train, test, retrieved = (
    load_sparse_csr(name)
    for name in (f"train_{file}", f"test_{file}", f"test_retrieved_{file}")
)

# Label tables, stored as JSON Lines (one record per line).
ytrain, ytest, yretrieved = (
    pd.read_json(path, lines=True)
    for path in (f"train_label_{file}.jsonl", f"test_label_{file}.jsonl", f"neg_label_{file}.jsonl")
)

# Class ids of the retrieved-evidence test set.
yret = yretrieved["label"]

In [172]:
# Report the number of labelled samples in each split.
print(f"Train len: {len(ytrain)}")
print(f"Test len: {len(ytest)}")
print(f"Retrieved Test len: {len(yret)}")

Train len: 32179
Test len: 16742
Retrieved Test len: 36185


In [None]:
def get_predictions(model, xtest, ytest, log_string):
    """Evaluate ``model`` on one split: print metrics and plot a confusion matrix.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    xtest : feature matrix handed to ``model.predict``
    ytest : iterable of integer class ids (keys of ``id2label``), the ground truth
    log_string : str
        Name shown in the printed header.

    Returns
    -------
    None
        The metrics dictionary is printed and the figure is shown; nothing
        is returned to the caller.
    """
    print(f"Performance for {log_string}:")
    predicted_ids = model.predict(xtest)

    # Class names are needed for the confusion matrix, whose rows and columns follow `labels`.
    true_names = [id2label[class_id] for class_id in ytest]
    predicted_names = [id2label[class_id] for class_id in predicted_ids]

    # All averaged scores use the 'weighted' average over classes.
    scores = {
        'f1': f1_score(y_true=ytest, y_pred=predicted_ids, average='weighted'),
        'precision': precision(ytest, predicted_ids, average='weighted'),
        'recall': recall(ytest, predicted_ids, average='weighted'),
        'accuracy': accuracy_score(ytest, predicted_ids),
    }
    print(scores)

    matrix = confusion_matrix(true_names, predicted_names, labels=labels)
    plt.figure(figsize=(8, 6))
    seaborn.heatmap(matrix, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap='Reds')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')

    plt.show()

In [None]:
# MLP Classifier
# Only the architecture and the batch size depend on the dataset; every other
# hyper-parameter is shared. Any `file` value other than "large" gets the
# small-dataset settings.
if file == "large":
    hidden_layer_sizes, batch_size = (40, 40), 1024
else:
    # Note the trailing comma: (40,) is a one-element tuple (one hidden layer
    # of 40 units), whereas (40) would just be the integer 40.
    hidden_layer_sizes, batch_size = (40,), 256

mlp = MLP(random_state=101, activation='relu', solver="adam", hidden_layer_sizes=hidden_layer_sizes,
          max_iter=200, batch_size=batch_size, verbose=True, early_stopping=True)
mlp.fit(train, ytrain["label"].to_numpy())

In [None]:
# Training loss and validation score per iteration, drawn on one shared axis
# placed in the first cell of a 2x2 subplot grid.
ax = plt.subplot(2, 2, 1)
ax.plot(mlp.loss_curve_, label='Training Loss', color='blue')
ax.plot(mlp.validation_scores_, label='Validation Score', color='red')
ax.set_xlabel('Iterations')
ax.set_ylabel('Loss/ValidationScore')
ax.set_title('Training and Validation Values')
ax.legend()
plt.show()

### Predictions with true evidences given in dataset (Type-I)

In [None]:
# Type-I evaluation: test set with the evidence given in the dataset.
get_predictions(
    model=mlp,
    xtest=test,
    ytest=ytest["label"].to_numpy(),
    log_string="MLP Classifier",
)

### Predictions with retrieved Evidences (Type-II)

In [None]:
# Type-II evaluation: test set with retrieved evidence.
get_predictions(
    model=mlp,
    xtest=retrieved,
    ytest=yret,
    log_string="MLP Classifier",
)

### Were wrong evidences discarded? (Robustness)

In [None]:
# Rows with neg_label == 0 are the incorrectly retrieved evidences; for the
# robustness check their expected class is "NOT ENOUGH INFO".
wrong_evidence = yretrieved.neg_label.to_numpy() == 0

# Build the target vector directly instead of overwriting a slice of a
# filtered frame, which is chained indexing and prone to SettingWithCopyWarning.
yneg = np.full(int(wrong_evidence.sum()), label2id["NOT ENOUGH INFO"])
get_predictions(mlp, retrieved[wrong_evidence], yneg, "MLP Classifier")

### Performance with correct evidences

In [None]:
# Restrict the Type-II set to pairs whose retrieved evidence is correct (neg_label == 1).
correct_evidence = yretrieved.neg_label == 1
ypos = yretrieved.loc[correct_evidence, "label"]
get_predictions(mlp, retrieved[correct_evidence.to_numpy()], ypos, "MLP Classifier")

### Performance with incorrect evidences

In [None]:
# Restrict the Type-II set to pairs whose retrieved evidence is incorrect (neg_label == 0),
# scored against the original claim labels.
incorrect_evidence = yretrieved.neg_label == 0
yneg = yretrieved.loc[incorrect_evidence, "label"]
get_predictions(mlp, retrieved[incorrect_evidence.to_numpy()], yneg, "MLP Classifier")

### FEVER Score

In [None]:
def getFEVERScore(model, test, y, y_neg):
    """Fraction of samples that have correct evidence AND a correct predicted label.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``
    test : feature matrix handed to ``model.predict``
    y : array-like of true class ids, aligned positionally with ``test``
    y_neg : array-like of evidence flags; a sample only counts when its flag is 1

    Returns
    -------
    float
        ``hits / number_of_predictions``; ``0.0`` when there are no predictions.

    Raises
    ------
    ValueError
        If predictions, ``y`` and ``y_neg`` do not have the same length.
    """
    y_pred = np.asarray(model.predict(test))
    # Convert to plain arrays so the comparison is positional: indexing a
    # pandas Series with y[i] is a *label* lookup and fails on a filtered or
    # otherwise non-default index.
    y_true = np.asarray(y)
    evidence_ok = np.asarray(y_neg) == 1

    if not (len(y_pred) == len(y_true) == len(evidence_ok)):
        raise ValueError("predictions, y and y_neg must have the same length")
    if len(y_pred) == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    hits = np.count_nonzero(evidence_ok & (y_pred == y_true))
    return hits / len(y_pred)

# Evidence flags for the whole retrieved set, then the overall FEVER score.
yneg = yretrieved.neg_label.to_numpy()
fever = getFEVERScore(model=mlp, test=retrieved, y=yret, y_neg=yneg)
print(f'FEVER Score: {fever}')

### Performance on Refuted Samples of true claim-evidence pairs of Type-II Dataset

In [None]:
## Every selected sample has the same true class (label 1, REFUTES), so accuracy equals recall here.

refuted_with_evidence = (yretrieved.label == 1) & (yretrieved.neg_label == 1)
y = yretrieved.loc[refuted_with_evidence, "label"]
get_predictions(mlp, retrieved[refuted_with_evidence], y.to_numpy(), "MLP Classifier")

### Recall of 'REFUTES' for correct+incorrect pairs in Type-II

In [None]:
# Every selected sample has the same true class (label 1, REFUTES), so accuracy equals recall here.

refuted = yretrieved.label == 1
y = yretrieved.loc[refuted, "label"]
get_predictions(mlp, retrieved[refuted], y.to_numpy(), "MLP Classifier")