# Metric plots
This notebook generates the plots for the final paper.

It loads the saved predictions of all three models (LR, DistilBERT, and CNN).
Each model is trained on the Waseem and Hovy (2016) dataset and evaluated on two test datasets:

1. Waseem and Hovy (2016): to simulate seen data.
2. SemEval (2019): to simulate unseen data.

### Metric results (all survey values)
The plots below take all values (TP, TN, FP, FN, and rejection) into account.

In [1]:
from rejector.prediction import Prediction
from rejector.values import Values
from rejector.metric import Metric

# Values assigned to each outcome (TP, TN, FP, FN, rejection) for the
# "all survey values" variant of the metric.
values = Values(
    value_TP=18.149543604085686,
    value_TN=36.31953463203463,
    value_FP=16.68669949423102,
    value_FN=28.08375563063063,
    value_rejection=4.82167904290429,
)

# Pickled prediction files, listed model by model with the Waseem and Hovy
# test set first and the SemEval test set second. The order matters: the
# resulting metrics are unpacked positionally below.
# NOTE(review): file names look like <model>-<train set>-<test set> — confirm.
prediction_files = [
    "input/lr-waseem-waseem.p",
    "input/lr-waseem-semeval.p",
    "input/distilbert-waseem-waseem.p",
    "input/distilbert-waseem-semeval.p",
    "input/cnn-waseem-waseem.p",
    "input/cnn-waseem-semeval.p",
]

# Load each prediction file and wrap it in a Metric that uses `values`.
loaded_metrics = []
for prediction_file in prediction_files:
    predictions = Prediction.load(prediction_file, gold_class="Hate")
    loaded_metrics.append(Metric(values, predictions))

# Bind every metric to its own name (same order as `prediction_files`).
(
    metric_lr_waseem_waseem,
    metric_lr_waseem_semeval,
    metric_distilbert_waseeem_waseem,
    metric_distilbert_waseeem_semeval,
    metric_cnn_waseeem_waseem,
    metric_cnn_waseeem_semeval,
) = loaded_metrics

# (label, metric) pairs; "seen" is the Waseem and Hovy test set,
# "unseen" is the SemEval test set.
lr_seen, lr_unseen = (
    ("LR", metric_lr_waseem_waseem),
    ("LR", metric_lr_waseem_semeval),
)
distilbert_seen, distilbert_unseen = (
    ("DistilBERT", metric_distilbert_waseeem_waseem),
    ("DistilBERT", metric_distilbert_waseeem_semeval),
)
cnn_seen, cnn_unseen = (
    ("CNN", metric_cnn_waseeem_waseem),
    ("CNN", metric_cnn_waseeem_semeval),
)

# Inputs for the plotting cells that follow.
metrics_seen = [lr_seen, distilbert_seen, cnn_seen]
metrics_unseen = [lr_unseen, distilbert_unseen, cnn_unseen]

In [None]:
# Plot the effectiveness of all models on the seen test data (all values).
# NOTE(review): the filename ends in ".pdf" although use_pdf=False — confirm
# which output format is intended.
Metric.plot_multiple_effectiveness(
    metrics=metrics_seen,
    filename="metric-all-values-seen-data.pdf",
    use_pdf=False,
)

In [None]:
# Plot the effectiveness of all models on the unseen test data (all values).
# NOTE(review): the filename ends in ".pdf" although use_pdf=False — confirm
# which output format is intended.
Metric.plot_multiple_effectiveness(
    metrics=metrics_unseen,
    filename="metric-all-values-unseen-data.pdf",
    use_pdf=False,
)

### Metric results (TP and TN set to 0)
The plots below use the same FP, FN, and rejection values as above, but with the TP and TN values set to 0.

In [None]:
# Values for the "TP and TN set to 0" variant: correct predictions get a
# value of 0, while FP, FN and rejection carry non-zero values.
values = Values(
    value_TP=0.0,
    value_TN=0.0,
    value_FP=16.68669949423102,
    value_FN=28.08375563063063,
    value_rejection=4.82167904290429,
)

# Pickled prediction files, listed model by model with the Waseem and Hovy
# test set first and the SemEval test set second. The order matters: the
# resulting metrics are unpacked positionally below.
# NOTE(review): file names look like <model>-<train set>-<test set> — confirm.
prediction_files = [
    "input/lr-waseem-waseem.p",
    "input/lr-waseem-semeval.p",
    "input/distilbert-waseem-waseem.p",
    "input/distilbert-waseem-semeval.p",
    "input/cnn-waseem-waseem.p",
    "input/cnn-waseem-semeval.p",
]

# Rebuild every Metric so that it uses the zeroed TP/TN values.
loaded_metrics = []
for prediction_file in prediction_files:
    predictions = Prediction.load(prediction_file, gold_class="Hate")
    loaded_metrics.append(Metric(values, predictions))

# Bind every metric to its own name (same order as `prediction_files`).
(
    metric_lr_waseem_waseem,
    metric_lr_waseem_semeval,
    metric_distilbert_waseeem_waseem,
    metric_distilbert_waseeem_semeval,
    metric_cnn_waseeem_waseem,
    metric_cnn_waseeem_semeval,
) = loaded_metrics

# (label, metric) pairs; "seen" is the Waseem and Hovy test set,
# "unseen" is the SemEval test set.
lr_seen, lr_unseen = (
    ("LR", metric_lr_waseem_waseem),
    ("LR", metric_lr_waseem_semeval),
)
distilbert_seen, distilbert_unseen = (
    ("DistilBERT", metric_distilbert_waseeem_waseem),
    ("DistilBERT", metric_distilbert_waseeem_semeval),
)
cnn_seen, cnn_unseen = (
    ("CNN", metric_cnn_waseeem_waseem),
    ("CNN", metric_cnn_waseeem_semeval),
)

# Inputs for the plotting cells that follow.
metrics_seen = [lr_seen, distilbert_seen, cnn_seen]
metrics_unseen = [lr_unseen, distilbert_unseen, cnn_unseen]

In [None]:
# Plot the effectiveness of all models on the seen test data (TP/TN = 0).
# NOTE(review): the filename ends in ".pdf" although use_pdf=False — confirm
# which output format is intended.
Metric.plot_multiple_effectiveness(
    metrics=metrics_seen,
    filename="metric-tptn0-seen-data.pdf",
    use_pdf=False,
)

In [None]:
# Plot the effectiveness of all models on the unseen test data (TP/TN = 0).
# NOTE(review): the filename ends in ".pdf" although use_pdf=False — confirm
# which output format is intended.
Metric.plot_multiple_effectiveness(
    metrics=metrics_unseen,
    filename="metric-tptn0-unseen-data.pdf",
    use_pdf=False,
)