This notebook takes the predictions made by the various models and calculates their performance.


In [None]:
import pandas as pd
from variables import PROJECTNAME_DATA_PATHS, SDG_MAP, ALL_EVAL_RESULTS_PATH
from file_org import iterdatatype_data

In [None]:
def count_sdgs(data):
    """Tally, for every SDG in ``SDG_MAP``, how many rows carry that label.

    Args:
        data: Table-like object exposing ``.shape`` and a ``"labels"``
            column (presumably a pandas DataFrame whose ``labels`` cells are
            iterables of SDG keys -- confirm against the loader).

    Returns:
        A ``(counts, n_rows)`` tuple. ``counts`` maps each key of ``SDG_MAP``
        to the number of rows whose labels contain it; ``n_rows`` is
        ``data.shape[0]``.

    Raises:
        KeyError: If a row contains a label that is not a key of ``SDG_MAP``.
    """
    n_rows = data.shape[0]

    # Start every known SDG at zero so SDGs that never occur are still reported.
    tally = dict.fromkeys(SDG_MAP, 0)

    for row_labels in data["labels"]:
        # De-duplicate so a label repeated within one row is counted once.
        unique_labels = set(row_labels)
        for sdg in unique_labels:
            tally[sdg] += 1

    return tally, n_rows


# Build one record per (datatype, project, SDG) describing how many rows of the
# project's data carry that SDG label and how many do not.
all_results = []
for datatype in PROJECTNAME_DATA_PATHS:
    for project_name, data in iterdatatype_data(datatype):
        sdg_counts, total_rows = count_sdgs(data)
        for sdg, sdg_count in sdg_counts.items():
            all_results.append(
                dict(
                    project_name=project_name,
                    sdg=sdg,
                    datatype=datatype,
                    total_rows=total_rows,
                    positive=sdg_count,
                    negative=total_rows - sdg_count,
                    # A dataset with zero rows has no meaningful share; store
                    # NaN rather than aborting the whole report with a
                    # ZeroDivisionError.
                    pos_perc=(
                        sdg_count / total_rows if total_rows else float("nan")
                    ),
                    neg_perc=(
                        (total_rows - sdg_count) / total_rows
                        if total_rows
                        else float("nan")
                    ),
                )
            )

data_counts_df = pd.DataFrame(all_results)

# The evaluation results are read as JSON Lines (one record per line).
model_performance = pd.read_json(ALL_EVAL_RESULTS_PATH, orient="records", lines=True)

# Left join: every count row is kept; rows without a matching evaluation
# record get missing values in the columns coming from model_performance.
data_counts_df = data_counts_df.merge(
    model_performance, how="left", on=["sdg", "project_name"]
)

data_counts_df.to_excel("../data/exploratory/data_distribution.xlsx", index=False)
# Bare expression so the notebook renders the resulting table.
data_counts_df