In [1]:
from utils import get_gold_relations, get_pred_relations, analyze_relations, get_total_num_of_preds, get_total_num_of_relations
import pandas as pd
from collections import Counter

In [16]:
lang = "eu"
true = get_gold_relations(lang)

In [26]:
def qualitative_analysis_of_relation_errors(lang, model, type="missed"):
    pred = get_pred_relations(lang, model)
    missed, spurious, partial = analyze_relations(true,pred)
    missed = [(m[0], m[1]) for m in missed]
    partial = [(m[0], m[1]) for m in partial]
    spurious = [(m[0], m[1]) for m in spurious]

    def analyse(relations):
        # Analyzing most common RML entities
        rml_entities = [relation[0] for relation in relations]
        rml_counter = Counter(rml_entities)
        most_common_rml = rml_counter.most_common()
        print("Most Common RML Entities:", most_common_rml)

        # Analyzing most common TST entities
        tst_entities = [relation[1] for relation in relations]
        tst_counter = Counter(tst_entities)
        most_common_tst = tst_counter.most_common()
        print("Most Common TST Entities:", most_common_tst)

        # Analyzing most common pairs
        pair_counter = Counter(relations)
        most_common_pairs = pair_counter.most_common()
        print("Most Common Relations", most_common_pairs)

    if type == "missed":
        analyse(missed)
    elif type == "partial":
        analyse(partial)
    else:
        analyse(spurious)

    return missed, spurious, partial




In [27]:
missed_relations_combined = {}
spurious_relations_combined = {}
partial_relations_combined = {}

for model in ['mbert', 'xlmroberta', 'biobert', 'mbert_multilingual', 'xlmroberta_multilingual', 'biobert_multilingual']:
    print(model)
    missed, spurious, partial = qualitative_analysis_of_relation_errors(lang, model, type="missed")
    missed_relations_combined[model] = missed
    spurious_relations_combined[model] = spurious
    partial_relations_combined[model] = partial
    print()

mbert
Most Common RML Entities: [('normala', 32), ('normalak', 27), ('positiboa', 7), ('negatiboa', 4), ('emaitza normalak', 4), ('SBA', 3), ('+', 3), ('normala/handitua', 2), ('7 puntu', 2), ('moderatu-larria', 2), ('larria', 2), ('42.9', 2), ('ondo', 2), ('anormala', 2), ('2 mm', 1), ('10 mm', 1), ('% 99', 1), ('98 tau/min', 1), ('144/83 mm Hg', 1), ('39 ºC-ko', 1), ('80 kg', 1), ('140/95 mmHg', 1), ('% 45', 1), ('45 mg/dL', 1), ('140/97 mm-Hg', 1), ('1,65 m', 1), ('235 mg/dL', 1), ('75 orduko', 1), ('1240 UI/mL', 1), ('ezezkoak', 1), ('2048ko titulua', 1), ('74 mg/L', 1), ('15.000 leukozito/mm3', 1), ('%80S', 1), ('zalantzatsua', 1), ('17mm', 1), ('10mm', 1), ('86 l/m', 1), ('%21', 1), ('3 cm- ko diametroko', 1), ('%7ko', 1), ('178,9U/L', 1), ('maila altua', 1), ('14 mm-ko', 1), ('3-10 hematie / kanpo', 1), ('36,6 ̊C', 1), ('19 cm-ko', 1), ('444 ng/ml', 1), ('1,44 mg/dl', 1), ('80 taup', 1), ('10 cm-ko', 1), ('100', 1), ('mingarria', 1), ('osoa', 1), ('90', 1), ('38ºC', 1), ('altu',

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:

df = pd.DataFrame([(model, pair[0], pair[1]) for model, relations in missed_relations_combined.items() for pair in relations], columns=['Model', 'RML', 'TST'])

# Count the frequency of each RML and TST by model
rml_freq = df.groupby(['Model', 'RML']).size().unstack(fill_value=0)
tst_freq = df.groupby(['Model', 'TST']).size().unstack(fill_value=0)


# Analyze common and unique missed pairs
all_pairs = pd.Series([pair for relations in missed_relations_combined.values() for pair in relations])
common_pairs = all_pairs[all_pairs.duplicated(keep=False)].value_counts()
print("Common Missed Pairs Across Models:", common_pairs)

# Find unique missed relations by each model
unique_analysis = {model: set(relations) - set(all_pairs[all_pairs != model]) for model, relations in missed_relations_combined.items()}
print("Unique Missed Relations by Model:", unique_analysis)

Common Missed Pairs Across Models: (18.60, PASI)                12
(normal, exploración)        12
(normales, calcio)           12
(inespecíficas, biopsias)    12
(negativos, Salmonella)       9
                             ..
(70 mmHg, estenosis)          2
(< 70%, saturación)           2
(1,1 cm, Lesión)              2
(+, IgG)                      2
(normales, gasometría)        2
Name: count, Length: 298, dtype: int64
Unique Missed Relations by Model: {'mbert': set(), 'xlmroberta': set(), 'biobert': set(), 'mbert_multilingual': set(), 'xlmroberta_multilingual': set(), 'biobert_multilingual': set()}


In [58]:
def find_exclusively_missed_relations(selected_models, all_missed_relations):
    """
    Identify relations that are exclusively missed by a specified subset of models,
    meaning these relations are missed by all models in the subset and not missed by any models outside this subset.
    
    Parameters:
    - selected_models: List[str] -- a list of model names to analyze.
    - all_missed_relations: Dict[str, List[Tuple]] -- a dictionary where keys are model names and values are lists of missed relations.
    
    Returns:
    - Set[Tuple] -- a set of relations uniquely missed by the specified subset of models.
    """
    # Relations missed by all selected models
    selected_misses = [set(all_missed_relations[model]) for model in selected_models]
    if not selected_misses:
        return set()  # Return an empty set if no models are selected
    exclusively_missed = set.intersection(*selected_misses)
    
    # Remove any relations that are missed by models not in the selected subset
    other_models = set(all_missed_relations.keys()) - set(selected_models)
    for model in other_models:
        exclusively_missed -= set(all_missed_relations[model])
    
    return exclusively_missed



# Example usage
# selected_models = ['mbert', 'mbert_multilingual']
selected_models = ['xlmroberta', 'xlmroberta_multilingual']
# selected_models = ['biobert', 'biobert_multilingual']
exclusively_missed = find_exclusively_missed_relations(selected_models, missed_relations_combined)
print("Relations Exclusively Missed by Selected Models:" )
print(exclusively_missed)

Relations Exclusively Missed by Selected Models:
{('39%', 'neutrofili'), ('negatività', 'urinocoltura'), ('nella norma', 'diametri'), ('154 mMol/L', 'ipernatremia'), ('nella norma', 'sierodiagnosi'), ('500 mmol/mol', 'Creatinina'), ('negativo', 'catalasi'), ('110 mEq/l', 'iposodiemia'), ('negatività', 'sierologie'), ('ai limiti inferiori del range', 'sideremia'), ('negativa', 'obiettività'), ('746 μmolL', 'iperammoniemia'), ('positiva', 'Pseudomonas'), ('11,6 UI/L', 'CKMB'), ('>15%', 'ipereosinofilia'), ('nel range di normalità', 'kaliemia'), ('negativi', 'Ricerca'), ('negative', 'emoglobinopatie'), ('kg 16.7', 'Peso'), ('2/6', 'soffio')}


In [25]:
lang = "eu"

In [26]:
for model in ['mbert', 'xlmroberta', 'biobert', 'mbert_multilingual', 'xlmroberta_multilingual', 'biobert_multilingual']:
    print(model)
    print(f"Total number of preds {get_total_num_of_preds(lang, model)}")

mbert
Total number of preds 231
xlmroberta
Total number of preds 364
biobert
Total number of preds 349
mbert_multilingual
Total number of preds 227
xlmroberta_multilingual
Total number of preds 421
biobert_multilingual
Total number of preds 340


In [8]:
for model in ['mbert', 'xlmroberta', 'biobert', 'mbert_multilingual', 'xlmroberta_multilingual', 'biobert_multilingual']:
    print(model)
    print(f"Total number of preds {get_total_num_of_preds(lang, model)}")

mbert
Total number of preds 915
xlmroberta
Total number of preds 676
biobert
Total number of preds 623
mbert_multilingual
Total number of preds 676
xlmroberta_multilingual
Total number of preds 717
biobert_multilingual
Total number of preds 662


In [4]:
print(get_total_num_of_relations("it"))
print(get_total_num_of_relations("es"))
print(get_total_num_of_relations("eu"))

2128
2464
2166


In [13]:
x = get_gold_relations("it")

In [14]:
x[0]

[('2', 'engagement', ('994', '995'), ('952', '962')),
 ('2', 'PHE-S', ('994', '995'), ('910', '915')),
 ('3', 'PHE-S', ('2658', '2659'), ('2591', '2596')),
 ('3', 'engagement', ('2658', '2659'), ('2639', '2649'))]

In [15]:
len([item for sublist in x for item in sublist])

612

In [7]:
y = get_pred_relations("eu", "xlmroberta")

In [12]:
len([item for sublist in y for item in sublist])

364