In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to project code (e.g. src.utils) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
import os
import pickle
from datetime import datetime

import pandas as pd
from matplotlib import pyplot as plt

from src.utils import (
    get_all_organisations,
    populate_everything,
    populate_lengths,
    remove_empty_documents,
)

In [3]:
# Load the pickled document frame. The file name suggests it already carries
# float32 embeddings and has been pre-filtered -- TODO confirm against the
# notebook/script that produced it.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
DOCUMENTS_PICKLE = "df_with_embeddings_float32_filtered50.pkl"
df: pd.DataFrame = pd.read_pickle(DOCUMENTS_PICKLE)

# Both helpers are called purely for their effect on `df` (their return values
# are discarded); presumably they modify the frame in place -- verify in
# src.utils.
populate_lengths(df)
remove_empty_documents(df)

In [4]:
# Read the pickled questions object; later cells iterate it with `.items()`,
# so it is expected to be a mapping keyed by question id.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("questions_with_embeddings.pkl", "rb") as questions_file:
    questions = pickle.load(questions_file)

In [16]:
# Score every organisation against every question and collect the result in
# `scores_df`: one row per organisation, one column per question id.
orgs = get_all_organisations(df)
org_index = pd.Index(orgs)

# Gather one column per question and build the frame once after the loop,
# instead of inserting columns one at a time into an existing DataFrame.
scores_by_question = {}
for question_id, question in questions.items():
    print(f"{datetime.now()}: Executing code for question {question_id}...")

    print(f"{datetime.now()}: Comparing similarity...")
    # Called for its side effect on `df`; the "all_mean" column read below is
    # presumably (re)written by this call -- confirm in src.utils.
    populate_everything(df, question["reference_embedding"])

    # Average the per-document "all_mean" value within each organisation.
    mean_by_org = df.groupby("name")["all_mean"].mean()

    # Align the means to the organisation index; organisations that do not
    # appear in the grouped result get NaN.
    scores_by_question[question_id] = mean_by_org.reindex(org_index).to_numpy()

scores_df = pd.DataFrame(scores_by_question, index=org_index)

2024-05-08 12:51:07.355529: Executing code for question 5.3...
2024-05-08 12:51:07.355669: Comparing similarity...
2024-05-08 12:51:13.856453: Executing code for question 5.4...
2024-05-08 12:51:13.856590: Comparing similarity...


In [17]:
# Cache the score table on disk so the plotting step can be run without
# recomputing the similarities.
with open("scores_df.pkl", "wb") as scores_file:
    pickle.dump(scores_df, scores_file)

In [4]:
# Restore the cached score table written by the cell above.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("scores_df.pkl", "rb") as scores_file:
    scores_df = pickle.load(scores_file)

In [8]:
# Render one bar chart per organisation (one bar per question column of
# `scores_df`) and save it as <cwd>/data/<organisation>.png.
# Only `scores_df` is needed here, so this cell can run straight after
# loading the cached scores_df.pkl.
cwd = os.getcwd()
data_folder = os.path.join(cwd, "data")
# exist_ok=True creates the folder if needed and is a no-op on re-runs.
os.makedirs(data_folder, exist_ok=True)

for org, row in scores_df.iterrows():
    fig, ax = plt.subplots(figsize=(10, 5))

    # x: the question columns, y: this organisation's scores.
    # Labels are passed as strings so matplotlib draws evenly spaced
    # categorical bars even when the column labels are numeric.
    question_labels = [str(question_id) for question_id in row.index]
    ax.bar(question_labels, row.values)
    ax.set_title(f"Aggregerte scorer for alle dokumentene til {org}")

    # A "/" in the organisation name would be interpreted as a directory
    # separator in the output path; str() guards against non-string labels.
    org_filename = str(org).replace("/", "-")
    fig.savefig(
        os.path.join(data_folder, f"{org_filename}.png"),
        bbox_inches="tight",
    )
    # Close this specific figure so figures do not accumulate in memory
    # over the loop.
    plt.close(fig)