# Error Analysis

In [33]:
import pandas as pd
import spacy
import thesis_utils
from IPython.display import display, HTML
from collections import defaultdict
from tqdm import tqdm
import random

In [35]:
# Paths to the prediction TSVs (REALEC) for the two models compared below.
# NOTE(review): the first path ends in ".tsv.tsv" — confirm this matches the file name on disk.
bert_base_cased_realec = './predictions_on_processedfiles/proc_realec_bert_base-cased.tsv.tsv'
xlm_roberta_realec = './predictions_on_processedfiles/proc_realec_xlmroberta.tsv'

In [37]:
# Read both prediction files with the same reader; order matches the unpacking targets.
bert_realec_sents, xlm_roberta_realec_sents = (
    thesis_utils.read_tsv_file_and_find_sentences_with_headers(tsv_path)
    for tsv_path in (bert_base_cased_realec, xlm_roberta_realec)
)

In [39]:
def evaluate_error_types_recall_only_merlin(sentences, dataset, print_results=True):
    """Compute per-error-type recall of the 'i' label and optionally print a table.

    Despite the name, three token layouts are supported. Every token row must
    end with the gold label followed by the predicted label; the position of
    the error-type column depends on the dataset:
    'merlin' -> 6, 'fce' -> 3, 'realec' -> 2.

    Args:
        sentences: iterable of sentences, each an iterable of indexable token rows.
        dataset: one of 'merlin', 'fce' or 'realec'.
        print_results: when True, print one row per error type sorted by
            descending recall, followed by a TOTAL row.

    Returns:
        defaultdict mapping error type -> {"gold_i": int, "correct_pred_i": int},
        where "gold_i" counts tokens whose gold label is 'i' and
        "correct_pred_i" counts those of them also predicted 'i'.

    Raises:
        ValueError: if ``dataset`` is not one of the supported names. Previously
            such a value silently produced an empty table.
    """
    # Index of the error-type column in a token row, per dataset layout.
    error_type_columns = {'merlin': 6, 'fce': 3, 'realec': 2}
    if dataset not in error_type_columns:
        raise ValueError(
            f"Unsupported dataset {dataset!r}; expected one of {sorted(error_type_columns)}"
        )
    error_type_col = error_type_columns[dataset]

    error_stats = defaultdict(lambda: {"gold_i": 0, "correct_pred_i": 0})

    for sentence in sentences:
        for line in sentence:
            gold_label, pred_label = line[-2], line[-1]
            # Only gold 'i' tokens contribute to recall.
            if gold_label != 'i':
                continue
            stats = error_stats[line[error_type_col]]
            stats["gold_i"] += 1
            if pred_label == 'i':
                stats["correct_pred_i"] += 1

    if print_results:
        def recall_of(stats):
            # Guard against division by zero for an entry without gold tokens.
            return stats["correct_pred_i"] / stats["gold_i"] if stats["gold_i"] > 0 else 0.0

        # Column widths mirror the row format below (6 / 9 / 10 / 6).
        header = f"| {'Error Code':<23} | {'Gold':>6} | {'True Pos.':>9} | {'False Neg.':>10} | {'Recall':>6} |"
        rule = "|" + "-" * (len(header) - 2) + "|"
        print(header)
        print(rule)

        # sorted() is stable, so error types with equal recall keep first-seen order.
        sorted_items = sorted(
            error_stats.items(),
            key=lambda item: recall_of(item[1]),
            reverse=True,
        )

        for error_type, stats in sorted_items:
            gold_i = stats["gold_i"]
            correct_pred_i = stats["correct_pred_i"]
            false_neg = gold_i - correct_pred_i
            print(f"| {error_type:<23} | {gold_i:6d} | {correct_pred_i:9d} | {false_neg:10d} | {recall_of(stats):6.2f} |")

        total_gold = sum(stats["gold_i"] for stats in error_stats.values())
        total_correct = sum(stats["correct_pred_i"] for stats in error_stats.values())
        total_false_neg = total_gold - total_correct
        overall_recall = total_correct / total_gold if total_gold > 0 else 0.0

        print(rule)
        print(f"| {'TOTAL':<23} | {total_gold:6d} | {total_correct:9d} | {total_false_neg:10d} | {overall_recall:6.2f} |")

    return error_stats


In [40]:
# Per-error-type recall table for the BERT predictions (REALEC column layout).
bert_realec_stats = evaluate_error_types_recall_only_merlin(bert_realec_sents, dataset='realec')

| Error Code              |   Gold | True Pos. | False Neg. | Recall |
|--------------------------------------------------------------------|
| Adj_as_collective       |      1 |         1 |          0 |   1.00 |
| Adverbs                 |      1 |         1 |          0 |   1.00 |
| Vocabulary              |      1 |         1 |          0 |   1.00 |
| Countable_uncountable   |      9 |         8 |          1 |   0.89 |
| Spelling                |   1503 |      1327 |        176 |   0.88 |
| Agreement_errors        |    184 |       128 |         56 |   0.70 |
| Tense_form              |     74 |        48 |         26 |   0.65 |
| Derivation              |      8 |         5 |          3 |   0.62 |
| Capitalisation          |     77 |        48 |         29 |   0.62 |
| Category_confusion      |    122 |        76 |         46 |   0.62 |
| Numerals                |     52 |        28 |         24 |   0.54 |
| Formational_affixes     |     32 |        17 |         15 |   0.53 |
| Noun

In [43]:
# Per-error-type recall table for the XLM-RoBERTa predictions (REALEC column layout).
roberta_realec_stats = evaluate_error_types_recall_only_merlin(xlm_roberta_realec_sents, dataset='realec')

| Error Code              |   Gold | True Pos. | False Neg. | Recall |
|--------------------------------------------------------------------|
| Adj_as_collective       |      1 |         1 |          0 |   1.00 |
| Adverbs                 |      1 |         1 |          0 |   1.00 |
| Vocabulary              |      1 |         1 |          0 |   1.00 |
| Spelling                |   1503 |      1268 |        235 |   0.84 |
| Countable_uncountable   |      9 |         7 |          2 |   0.78 |
| Agreement_errors        |    184 |       124 |         60 |   0.67 |
| Numerals                |     52 |        29 |         23 |   0.56 |
| Capitalisation          |     77 |        41 |         36 |   0.53 |
| Lack_par_constr         |      6 |         3 |          3 |   0.50 |
| Adjectives              |      6 |         3 |          3 |   0.50 |
| Infinitive_constr       |      8 |         4 |          4 |   0.50 |
| suggestion              |     18 |         9 |          9 |   0.50 |
| Noun

In [45]:
def find_false_positives_and_false_negatives(list_sentences, dataset):
    """Split sentences by the prediction errors they contain and group misses by error type.

    Every token row must end with the gold label followed by the predicted
    label. A false positive (FP) is gold 'c' / predicted 'i'; a false negative
    (FN) is gold 'i' / predicted 'c'.

    Args:
        list_sentences: iterable of sentences; each sentence is a re-iterable
            sequence of indexable token rows.
        dataset: 'merlin' reads the error type from column 6, 'fce' from
            column 3; any other value uses column 2 (the layout used for
            'realec').

    Returns:
        A 5-tuple ``(only_fp, only_fn, both_fp_fn, fn_by_type_clean, fn_by_type_all)``:
        - only_fp: sentences with at least one FP and no FN.
        - only_fn: sentences with at least one FN and no FP.
        - both_fp_fn: sentences containing both an FP and an FN.
        - fn_by_type_clean: dict error type -> list of distinct sentences
          (as tuples of tuples) from ``only_fn`` that miss an error of that type.
        - fn_by_type_all: the same grouping computed over ``both_fp_fn``.
        An empty error-type field is reported as 'NO_LABEL'. Lists keep
        first-seen order, so results are reproducible across kernel restarts.
    """
    # Error-type column per layout; unknown names fall back to column 2.
    error_type_col = {'merlin': 6, 'fce': 3}.get(dataset, 2)

    def is_false_positive(token):
        return token[-1] == 'i' and token[-2] == 'c'

    def is_false_negative(token):
        return token[-1] == 'c' and token[-2] == 'i'

    def group_by_missed_error_type(sentences):
        # Inner dicts act as insertion-ordered sets of hashable sentences.
        grouped = defaultdict(dict)
        for sentence in sentences:
            frozen_sentence = None
            for token in sentence:
                # Rows too short to carry an error-type column are skipped.
                if not (is_false_negative(token) and len(token) > error_type_col):
                    continue
                if frozen_sentence is None:
                    # Built once per sentence instead of once per missed token.
                    frozen_sentence = tuple(tuple(t) for t in sentence)
                error_type = token[error_type_col] or 'NO_LABEL'
                grouped[error_type][frozen_sentence] = None
        return {error_type: list(unique) for error_type, unique in grouped.items()}

    only_fp = []
    only_fn = []
    both_fp_fn = []

    for sentence in list_sentences:
        has_fp = any(is_false_positive(token) for token in sentence)
        has_fn = any(is_false_negative(token) for token in sentence)

        if has_fp and has_fn:
            both_fp_fn.append(sentence)
        elif has_fp:
            only_fp.append(sentence)
        elif has_fn:
            only_fn.append(sentence)

    false_negatives_by_type_clean = group_by_missed_error_type(only_fn)
    false_negatives_by_type_all = group_by_missed_error_type(both_fp_fn)

    return only_fp, only_fn, both_fp_fn, false_negatives_by_type_clean, false_negatives_by_type_all


# BERT: the printed counts are sentences containing only FPs / only FNs.
(
    bert_realec_only_fp,
    bert_realec_only_fn,
    bert_realec_fp_and_fn,
    bert_realec_only_fn_error_type,
    bert_realec_fn_fp_error_type,
) = find_false_positives_and_false_negatives(bert_realec_sents, 'realec')
print(f"False Positive Sentences (bert-realec): {len(bert_realec_only_fp)}")
print(f"False Negative Sentences (bert-realec): {len(bert_realec_only_fn)}\n")

# XLM-RoBERTa: same split on the second model's predictions.
(
    roberta_realec_only_fp,
    roberta_realec_only_fn,
    roberta_realec_fp_and_fn,
    roberta_realec_only_fn_error_type,
    roberta_realec_fn_fp_error_type,
) = find_false_positives_and_false_negatives(xlm_roberta_realec_sents, 'realec')
print(f"False Positive Sentences (roberta-realec): {len(roberta_realec_only_fp)}")
print(f"False Negative Sentences (roberta-realec): {len(roberta_realec_only_fn)}")


False Positive Sentences (bert-realec): 1003
False Negative Sentences (bert-realec): 886

False Positive Sentences (roberta-realec): 824
False Negative Sentences (roberta-realec): 1090


In [47]:
# BERT: per error type, number of distinct sentences missing an error of that type.
# "FN" counts sentences from the FN-only group; "TOTAL" counts sentences from the
# group that has both FPs and FNs.
# NOTE(review): "TOTAL" is therefore not a sum and can be smaller than "FN" — confirm the label.
for key in sorted(bert_realec_only_fn_error_type.keys() | bert_realec_fn_fp_error_type.keys()):
    fn_count = len(bert_realec_only_fn_error_type.get(key, ()))
    fp_fn_count = len(bert_realec_fn_fp_error_type.get(key, ()))
    print(f"| {key:<23} | FN: {fn_count:6d} | TOTAL: {fp_fn_count:6d} |")

| Absence_comp_sent       | FN:     26 | TOTAL:     60 |
| Absence_explanation     | FN:     37 | TOTAL:     48 |
| Adjectives              | FN:      0 | TOTAL:      3 |
| Agreement_errors        | FN:     17 | TOTAL:     35 |
| Articles                | FN:    169 | TOTAL:    174 |
| Capitalisation          | FN:      7 | TOTAL:     16 |
| Category_confusion      | FN:     14 | TOTAL:     25 |
| Coherence               | FN:      9 | TOTAL:      4 |
| Comparative_constr      | FN:      6 | TOTAL:      5 |
| Comparison_degree       | FN:      1 | TOTAL:      3 |
| Compound_word           | FN:      4 | TOTAL:      4 |
| Confusion_of_structures | FN:      4 | TOTAL:     12 |
| Conjunctions            | FN:      8 | TOTAL:     12 |
| Countable_uncountable   | FN:      1 | TOTAL:      0 |
| Derivation              | FN:      1 | TOTAL:      2 |
| Determiners             | FN:     12 | TOTAL:     12 |
| Discourse               | FN:      2 | TOTAL:      1 |
| Formational_affixes     | FN:

In [49]:
# XLM-RoBERTa: per error type, number of distinct sentences missing an error of that type.
# "FN" counts sentences from the FN-only group; "TOTAL" counts sentences from the
# group that has both FPs and FNs.
# NOTE(review): "TOTAL" is therefore not a sum and can be smaller than "FN" — confirm the label.
for key in sorted(roberta_realec_only_fn_error_type.keys() | roberta_realec_fn_fp_error_type.keys()):
    fn_count = len(roberta_realec_only_fn_error_type.get(key, ()))
    fp_fn_count = len(roberta_realec_fn_fp_error_type.get(key, ()))
    print(f"| {key:<23} | FN: {fn_count:6d} | TOTAL: {fp_fn_count:6d} |")

| Absence_comp_sent       | FN:     47 | TOTAL:     45 |
| Absence_explanation     | FN:     50 | TOTAL:     40 |
| Adjectives              | FN:      1 | TOTAL:      2 |
| Agreement_errors        | FN:     22 | TOTAL:     34 |
| Articles                | FN:    218 | TOTAL:    146 |
| Capitalisation          | FN:     12 | TOTAL:     16 |
| Category_confusion      | FN:     28 | TOTAL:     26 |
| Coherence               | FN:     10 | TOTAL:      4 |
| Comparative_constr      | FN:      7 | TOTAL:      4 |
| Comparison_degree       | FN:      1 | TOTAL:      3 |
| Compound_word           | FN:      7 | TOTAL:      1 |
| Confusion_of_structures | FN:      4 | TOTAL:     12 |
| Conjunctions            | FN:      8 | TOTAL:     13 |
| Countable_uncountable   | FN:      1 | TOTAL:      1 |
| Derivation              | FN:      2 | TOTAL:      2 |
| Determiners             | FN:     13 | TOTAL:     12 |
| Discourse               | FN:      3 | TOTAL:      0 |
| Formational_affixes     | FN: