In [61]:
import ast
import glob
import os

import argilla as rg
import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import spacy
import math
from sklearn.metrics import precision_score, recall_score, f1_score
from functools import partial

In [62]:
def tokenize(text):
    return [token.text for token in nlp(text)]

def extract_text(row, start_col='start', end_col='end', text_col='text'):
    if pd.isna(row[start_col]) or pd.isna(row[end_col]):
        return np.nan
    try:
        start = int(row[start_col])
        end = int(row[end_col])
        return row[text_col][start:end]
    except ValueError:
        return np.nan

def create_binary_tokens(df, apply_fn, text_col='text', start_col_name='start', end_col_name='end'):
    df['new_text'] = df.apply(apply_fn, axis=1)
    df['text_before'] = df.apply(lambda x: x[text_col][:int(x[start_col_name])] if not math.isnan(x[start_col_name]) else np.nan, axis=1)
    df['text_after'] = df.apply(lambda x: x[text_col][int(x[end_col_name]):] if not math.isnan(x[end_col_name]) else np.nan, axis=1)

    # Creating the new tokens column
    df['new_tokens'] = df['new_text'].apply(lambda x: tokenize(x) if pd.notnull(x) else np.nan)
    df['tokens_before'] = df['text_before'].apply(lambda x: tokenize(x) if pd.notnull(x) else np.nan)
    df['tokens_after'] = df['text_after'].apply(lambda x: tokenize(x) if pd.notnull(x) else np.nan)

    df['in_new_span'] = df.apply(lambda x: [0]*len(x.tokens_before) + [1]*len(x.new_tokens) + [0]*len(x.tokens_after) if np.all(pd.notnull(x.new_tokens)) else np.nan, axis=1)
    df['in_new_span'] = df['in_new_span'].fillna(df.tokens.apply(lambda x: [0]*len(x)))
    return df

def parse_annotation(annotation):
    if isinstance(annotation, str):
        try:
            return ast.literal_eval(annotation)
        except ValueError:  # catches strings that aren't list/dict syntax
            return annotation
    elif isinstance(annotation, float) and np.isnan(annotation):  # handle NaN values
        return {}
    else:
        return annotation

def calculate_metrics(df):
    # Concatenate all lists in the 'relevant_tokens_x' and 'relevant_tokens_y' columns
    predictions = np.concatenate(df['predictions'].values)
    true_values = np.concatenate(df['ground_truth'].values)

    # Calculate precision, recall and F1-score
    precision = precision_score(true_values, predictions)
    recall = recall_score(true_values, predictions)
    f1 = f1_score(true_values, predictions)

    return precision, recall, f1

def process_explorer_concepts_df(explorer_df):
    # filter the explorer_df to only include documents that are in the dataset
    explorer_df = explorer_df[explorer_df.document_id.isin(argilla_df.document_id_new.unique())]

    replace_dict = {
        "financial-flows": "Financial Flows",
        "deforestation": "Deforestation",
        "vulnerable-groups": "Vulnerable Groups",
        "equity-and-just-transition": "Equity And Justice",
        "barriers-and-challenges": "Barriers and Challenges",
        "good-practice-and-opportunities": "Good Practices and Opportunities",
    }
    explorer_df["subdir"] = explorer_df["subdir"].replace(replace_dict)



    explorer_df['tokens'] = explorer_df['sentence'].apply(tokenize)



    partial_fn = partial(extract_text, start_col='start_idx', end_col='end_idx', text_col='sentence')
    explorer_df = create_binary_tokens(explorer_df, partial_fn, text_col='sentence', start_col_name='start_idx', end_col_name='end_idx')
    return explorer_df

def calculate_concept_metrics(explorer_df, argilla_df):
    metrics_list=[]
    for concept in explorer_df['subdir'].unique():
        explorer_concept_df = explorer_df[explorer_df['subdir'] == concept]
        argilla_concept_df = argilla_df[argilla_df['label'] == concept]
        argilla_negatives = argilla_df[argilla_df['label'].isna()]
        argilla_concept_df = pd.concat([argilla_concept_df, argilla_negatives], axis=0)

        df_relevant_argilla = (argilla_concept_df.groupby(['text'])['in_new_span']
                    .apply(lambda x: np.any(x.values, axis=0))
                    .reset_index())
        df_relevant_argilla.columns = ['text', 'ground_truth']

        df_relevant_explorer = (explorer_concept_df.groupby(['sentence'])['in_new_span']
                    .apply(lambda x: np.any(x.values, axis=0))
                    .reset_index())
        df_relevant_explorer.columns = ['text', 'predictions']

        df_relevant_merged = df_relevant_argilla.merge(df_relevant_explorer, on='text', how='left')
        df_relevant_merged['predictions'] = df_relevant_merged['predictions'].fillna(df_relevant_merged['ground_truth'].apply(lambda x: [0]*len(x)))
        df_relevant_merged = df_relevant_merged[df_relevant_merged['ground_truth'].apply(lambda x: len(x)) == df_relevant_merged['predictions'].apply(lambda x: len(x))]

        precision, recall, f1 = calculate_metrics(df_relevant_merged)
        metrics_dict = {"concept": concept, "precision": precision, "recall": recall, "f1": f1}
        metrics_list.append(metrics_dict)
    metrics_df = pd.DataFrame(metrics_list)
    return metrics_df

def process_argilla_df(df):
    # ignore non-annotated data
    dataset_df = df[~df["annotation"].isna()]
    # create id column for cross-referencing with explorer
    dataset_df['document_id'] = "CCLW"+dataset_df.id.str.split("CCLW").str[-1]

    # load mapping
    mapping = pd.read_csv("/home/stefan/unfccc-global-stocktake-documents/notebooks/old-to-new-dataset-mapping.csv")


    argilla_df=dataset_df.copy()

    # assuming argilla_df is your dataframe and 'annotation' is your column
    argilla_df['annotation'] = argilla_df['annotation'].apply(parse_annotation)

    # explode the 'annotation' column
    argilla_df = argilla_df.explode('annotation')

    # create new columns from the dictionary
    argilla_df['start'] = argilla_df['annotation'].apply(lambda d: d.get('start') if isinstance(d, dict) else np.nan)
    argilla_df['label'] = argilla_df['annotation'].apply(lambda d: d.get('label') if isinstance(d, dict) else np.nan)
    argilla_df['end'] = argilla_df['annotation'].apply(lambda d: d.get('end') if isinstance(d, dict) else np.nan)

    # remove the 'annotation' column
    argilla_df = argilla_df.drop(columns='annotation')

    mapping.drop_duplicates(subset='document_id_old', keep='first', inplace=True)
    argilla_df['document_id_new'] = argilla_df.document_id.map(mapping.set_index('document_id_old')['document_id_new'])

    partial_fn = partial(extract_text, start_col='start', end_col='end', text_col='text')
    argilla_df = create_binary_tokens(argilla_df, partial_fn, text_col='text', start_col_name='start', end_col_name='end')
    return argilla_df

def load_explorer_concepts_df(base_dir, concepts):
    # File pattern
    pattern = "spans*.csv"

    # Store all DataFrames in a list
    df_list = []

    # Iterate over the specific subdirectories
    for subdir in concepts:
        dirpath = os.path.join(base_dir, subdir)
        if os.path.exists(dirpath):  # only proceed if the directory exists
            # Use glob to match the pattern 'spans*.csv'
            for filename in glob.glob(os.path.join(dirpath, pattern)):
                # Read csv file into a DataFrame and append to the list
                sub_df = pd.read_csv(filename)
                sub_df["subdir"] = subdir
                df_list.append(sub_df)
        else:
            print(f"Directory {dirpath} does not exist.")

    # Concatenate all dataframes in the list
    explorer_df = pd.concat(df_list, ignore_index=True)
    return explorer_df

In [63]:
load_dotenv(find_dotenv())
# Load spaCy model. You can also use 'en_core_web_sm' for a smaller model.
nlp = spacy.blank("en")
DATASET_NAME = "explorer-quality-testing"
# User management is done at a workspace level
rg.init(workspace="gst", api_key=os.environ["ARGILLA_API_KEY"])
dataset = rg.load(DATASET_NAME).to_datasets()

dataset_df = dataset.to_pandas()


argilla_df = process_argilla_df(dataset_df)

# List of subdirectories to include
concepts = [
    "financial-flows",
    "deforestation",
    "vulnerable-groups",
    "equity-and-just-transition",
    "barriers-and-challenges",
    "good-practice-and-opportunities",
]

# Base directory
base_dir = "/home/stefan/gst3/global-stocktake/concepts"


explorer_df = load_explorer_concepts_df(base_dir, concepts)
explorer_df = process_explorer_concepts_df(explorer_df)


metrics = calculate_concept_metrics(explorer_df, argilla_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['document_id'] = "CCLW"+dataset_df.id.str.split("CCLW").str[-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  explorer_df["subdir"] = explorer_df["subdir"].replace(replace_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  explorer_df['tokens'] = explorer_df['sentence'].apply(tokenize

In [65]:
metrics

Unnamed: 0,concept,precision,recall,f1
0,Financial Flows,0.666667,0.029851,0.057143
1,Deforestation,0.888889,0.017978,0.035242
2,Vulnerable Groups,1.0,0.028571,0.055556
3,Equity And Justice,0.0,0.0,0.0
4,Barriers and Challenges,1.0,0.022305,0.043636
5,Good Practices and Opportunities,0.666667,0.016598,0.032389
