In [None]:
import pandas as pd

In [None]:
# Load the sample annotation export. get_enlm_from_df (defined below) reads the
# columns source_id, record_id, label_id, token_index and confidence from it.
df = pd.read_json("sample_data/extraction.json")

In [None]:
import weak_nlp
from weak_nlp import extraction


def get_enlm_from_df(df):
    """Convert a token-level annotation frame into an ``extraction.ENLM``.

    Every missing value in ``df`` is first replaced by the string
    ``"manual"``. The frame is then grouped by ``source_id``; inside each
    source, every ``(record_id, label_id)`` group becomes a single
    ``extraction.ExtractionAssociation`` that runs from the smallest to the
    largest ``token_index`` of the group and carries the ``confidence`` of the
    group's first row. Each source yields one ``weak_nlp.SourceVector`` whose
    second argument is ``True`` only for the source named ``"manual"``
    (presumably the "is reference" flag -- confirm against weak_nlp).

    Args:
        df: DataFrame providing the columns ``source_id``, ``record_id``,
            ``label_id``, ``token_index`` and ``confidence``.

    Returns:
        The ``extraction.ENLM`` built from one source vector per ``source_id``.
    """
    # NOTE(review): fillna touches every column, so a missing confidence or
    # label_id would also turn into "manual" -- confirm that source_id is the
    # only column expected to contain missing values.
    filled = df.fillna("manual")

    source_vectors = []
    for source_id, source_rows in filled.groupby("source_id"):
        # One association per (record, label) pair; the span is simply
        # min..max of the token indices, whether or not they are contiguous.
        source_associations = [
            extraction.ExtractionAssociation(
                record_id,
                label_id,
                min(span_rows.token_index),
                max(span_rows.token_index),
                confidence=span_rows.confidence.iloc[0],
            )
            for (record_id, label_id), span_rows in source_rows.groupby(
                ["record_id", "label_id"]
            )
        ]
        is_manual = source_id == "manual"
        source_vectors.append(
            weak_nlp.SourceVector(source_id, is_manual, source_associations)
        )

    return extraction.ENLM(source_vectors)

In [None]:
# Build the ENLM from the loaded frame (one source vector per source_id).
enlm = get_enlm_from_df(df)

In [None]:
# Display the quality metrics computed by the library for this ENLM.
enlm.quality_metrics()

In [None]:
# Display the quantity metrics computed by the library for this ENLM.
enlm.quantity_metrics()

In [None]:
# Keep a handle on the first noisy source vector for ad-hoc inspection.
vector_ = enlm.vectors_noisy[0]

# `enlm_` is the matrix the following cells operate on; it is the full ENLM
# with all of its noisy vectors. To restrict the analysis to a single source,
# bind it to extraction.ENLM([enlm.vector_reference, vector_]) instead.
enlm_ = enlm

In [None]:
from weak_nlp.extraction import util

In [None]:
# Associations of the reference vector, plus the frame produced by
# util.flatten_range_df for them (presumably one row per covered token --
# confirm against weak_nlp.extraction.util).
df_reference = enlm_.vector_reference.associations
df_reference_flat = util.flatten_range_df(df_reference, include_source=False)
reference_labels = list(df_reference["label"].dropna().unique())

# The cells below compare exactly one noisy source against the reference, so
# pick the first noisy vector explicitly instead of entering a loop and
# breaking out of it on the first iteration.
vector_noisy = enlm_.vectors_noisy[0]
df_noisy = vector_noisy.associations
df_noisy_flat = util.flatten_range_df(df_noisy, include_source=False)

# Needed by the cell that initialises the per-label counters further down;
# it has to be defined here so the notebook survives Restart & Run All.
noisy_labels = list(df_noisy["label"].dropna().unique())

In [None]:
# Outer-join the flattened reference and noisy frames on "record"; columns that
# exist in both frames receive the suffixes "_reference" and "_noisy".
# NOTE(review): the join key is only "record". If the flattened frames hold one
# row per token, this pairs every reference row with every noisy row of the
# same record -- confirm whether the token index column belongs in the key too.
reference_by_record = df_reference_flat.set_index("record")
noisy_by_record = df_noisy_flat.set_index("record")
df_joined_by_token = reference_by_record.join(
    noisy_by_record,
    how="outer",
    lsuffix="_reference",
    rsuffix="_noisy",
)

label_reference = df_joined_by_token["label_reference"]
label_noisy = df_joined_by_token["label_noisy"]

# Rows where both sides carry the same label (a missing label never compares equal).
true_positives = df_joined_by_token.loc[label_reference == label_noisy]
# Rows where the labels differ and no column is missing, i.e. both sides are
# labelled but disagree (despite the variable's name).
both_negatives = df_joined_by_token.loc[label_reference != label_noisy].dropna()
# Rows without a reference label.
false_positives = df_joined_by_token.loc[label_reference.isnull()]
# Rows without a noisy label.
false_negatives = df_joined_by_token.loc[label_noisy.isnull()]

In [None]:
# Start every label seen on either side at zero so that labels without any
# matching rows still appear in the result.
quality = {
    label: {"true_positives": 0, "false_positives": 0, "false_negatives": 0}
    for label in reference_labels + noisy_labels
}

# (frame to count rows in, label column to group by, counter to increase).
# Label disagreements (`both_negatives`) count twice: as a false negative for
# the reference label and as a false positive for the noisy label.
count_rules = [
    (true_positives, "label_reference", "true_positives"),
    (both_negatives, "label_reference", "false_negatives"),
    (false_negatives, "label_reference", "false_negatives"),
    (both_negatives, "label_noisy", "false_positives"),
    (false_positives, "label_noisy", "false_positives"),
]

for frame, label_column, counter in count_rules:
    for label, rows_for_label in frame.groupby(label_column):
        quality[label][counter] += len(rows_for_label)

In [None]:
# Display the per-label true-positive / false-positive / false-negative counts.
quality