In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to the local `src` package are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import pickle
import shutil
from datetime import datetime

import matplotlib.figure
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

from src.embedding_model import embed
from src.plot_utils import plot_frequency, plot_grouped_results
from src.utils import (
    DEFAULT_K,
    get_all_organisations,
    get_departments,
    get_directorates,
    populate_everything,
    populate_lengths,
    remove_empty_documents,
)

In [3]:
# Load the document frame from a local pickle (judging by the file name it already
# carries the embeddings). NOTE: unpickling can execute arbitrary code, so only
# load pickle files that come from a trusted source.
df: pd.DataFrame = pd.read_pickle("df_with_embeddings_float32_filtered50.pkl")
# Both helpers are called purely for their effect on `df` (their return values are
# discarded). Presumably they add the length information and drop empty documents
# in place — TODO confirm in src.utils.
populate_lengths(df)
remove_empty_documents(df)

In [5]:
# Departments as reported by the src.utils helper; the cells below use this to
# select rows via df["name"].isin(deps).
deps = get_departments(df)

In [11]:
# Number of documents per document type, restricted to rows whose "name" is a department.
df.loc[df["name"].isin(deps), "type"].value_counts()

type
Tildelingsbrev    893
Årsrapport          2
Name: count, dtype: int64

In [12]:
# Rows that belong to a department and are of type "Årsrapport".
df.loc[df["name"].isin(deps) & df["type"].eq("Årsrapport")]

Unnamed: 0,id,uuid,type,title,subtitle,original_title,isbn,isbn_printed,issn,redirect_to_id,...,created_at.2,updated_at.2,id.4,level,parent_id,entity_id.1,created_at.3,updated_at.3,split_paragraphs,lengths
39,1138,5a261bf3-52ce-4584-88ec-9b7619c2153a,Årsrapport,Årsrapport Forsvaret 2019,,,,,,,...,2022-01-04 09:00:17,2022-10-12 21:02:04,1463,0,,972417823,2024-04-02 22:00:04,2024-04-02 22:00:04,"[(1, Innhold\n01 LEDERS BERETNING, [[ 0.06709...",691
108,3696,6dcbe90d-7bfc-47c0-bd22-de7c9de6b893,Årsrapport,Årsrapport Forsvaret 2018,,,,,,,...,2022-01-04 09:00:17,2022-10-12 21:02:04,1463,0,,972417823,2024-04-02 22:00:04,2024-04-02 22:00:04,"[(1, For alt vi har. Og alt vi er.\n1, [[ 0.05...",644


In [None]:
# Load the question definitions. Each value is later read for the keys
# "question_reference", "question_final" and "reference_texts".
# The encoding is given explicitly: JSON is UTF-8, whereas open() would otherwise
# fall back to the platform's locale encoding and could garble non-ASCII text.
with open("questions.json", "r", encoding="utf-8") as f:
    questions = json.load(f)

In [None]:
# Give every question a "reference_embedding": the mean (over axis 0) of the
# embeddings of its reference texts. The entries of `questions` are updated in place.
for x in questions:
    y = questions[x]
    print(f"Executing code for question {x}...")
    question_reference, question_final = y["question_reference"], y["question_final"]
    inn_texts = y["reference_texts"]
    print("Embedding reference texts...")
    # One embed() call per reference text, stacked into a single array.
    inn_texts_embeddings = np.array([embed(inn_text) for inn_text in inn_texts])
    y["reference_embedding"] = inn_texts_embeddings.mean(axis=0)

In [None]:
# Persist the questions, including the "reference_embedding" entries added above,
# so that the next cell can load them back from disk.
with open("questions_with_embeddings.pkl", "wb") as f:
    pickle.dump(questions, f)

In [11]:
# Restore the questions together with their stored reference embeddings.
# NOTE: pickle.load can execute arbitrary code — only load a file that this
# notebook (or another trusted source) produced.
with open("questions_with_embeddings.pkl", "rb") as f:
    questions = pickle.load(f)

In [7]:
def clear_data_folder():
    """Delete everything inside ``<current working directory>/data``.

    The ``data`` folder itself is kept. Nothing happens when it does not exist
    (or is not a directory). Real sub-directories are removed recursively;
    every other entry — regular files and symbolic links, including links that
    point at directories — is unlinked, so link targets are never touched.

    Returns:
        None.
    """
    data_folder = os.path.join(os.getcwd(), "data")
    if not os.path.isdir(data_folder):
        return
    # Snapshot the listing first so entries are not deleted while the directory
    # iterator is still open.
    with os.scandir(data_folder) as scan:
        entries = list(scan)
    for entry in entries:
        # shutil.rmtree refuses to operate on a symbolic link, hence the
        # follow_symlinks=False check: links fall through to os.remove.
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)


# Destructive: removes every file and sub-folder under ./data (relative to the
# current working directory) each time this cell is run.
clear_data_folder()

In [None]:
# For every question: score the documents against the question's reference
# embedding, then write four overview images into data/<question id>/ —
# the grouped department plot plus three frequency plots (all documents,
# "Tildelingsbrev" only, "Årsrapport" only).
cwd = os.getcwd()
data_folder = os.path.join(cwd, "data")
orgs = get_all_organisations(df)
deps = get_departments(df)
dirs = get_directorates(df)
ffed = False  # NOTE(review): never read in this cell — candidate for removal
for x, y in questions.items():
    print(f"{datetime.now()}: Executing code for question {x}...")
    question_folder = os.path.join(data_folder, x)
    os.makedirs(question_folder, exist_ok=True)

    question_reference = y["question_reference"]
    question_final = y["question_final"]
    inn_texts = y["reference_texts"]
    inn_texts_embedding = y["reference_embedding"]

    print(f"{datetime.now()}: Comparing similarity...")
    # Called for its effect on `df`; presumably it (re)computes the similarity
    # columns for this question — TODO confirm in src.utils.
    populate_everything(df, inn_texts_embedding)

    print(f"{datetime.now()}: Plotting...")
    plot_grouped_results(
        df, deps, title=question_final, fig_size=(1100, 600)
    ).write_image(os.path.join(question_folder, "departments_over_time.png"), scale=2)

    # Same frequency plot for three document selections; None means "all rows".
    for doc_type, image_name in (
        (None, "hvor_ofte_alle_virksomheter.png"),
        ("Tildelingsbrev", "hvor_ofte_kun_tildelingsbrev.png"),
        ("Årsrapport", "hvor_ofte_kun_årsrapport.png"),
    ):
        subset = df if doc_type is None else df[df["type"] == doc_type]
        fig = plot_frequency(subset, question=question_final)
        fig.savefig(
            os.path.join(question_folder, image_name),
            dpi=200,
            bbox_inches="tight",
        )
        # Release the figure once it is on disk; figures created through pyplot
        # otherwise stay registered (and in memory) for the whole session.
        plt.close(fig)

    print()

In [None]:
# Need to download the data folder and zip it on Windows.
# clear_data_folder()

In [6]:
# For every question: score the documents against the question's reference
# embedding, then save one frequency plot per organisation as
# data/<question id>/<organisation>.png.
cwd = os.getcwd()
data_folder = os.path.join(cwd, "data")
orgs = get_all_organisations(df)
deps = get_departments(df)
dirs = get_directorates(df)
for x, y in questions.items():
    print(f"{datetime.now()}: Executing code for question {x}...")
    question_folder = os.path.join(data_folder, x)
    os.makedirs(question_folder, exist_ok=True)

    question_reference = y["question_reference"]
    question_final = y["question_final"]
    inn_texts = y["reference_texts"]
    inn_texts_embedding = y["reference_embedding"]

    print(f"{datetime.now()}: Comparing similarity...")
    # Called for its effect on `df`; presumably it (re)computes the similarity
    # columns for this question — TODO confirm in src.utils.
    populate_everything(df, inn_texts_embedding)

    for org in tqdm(
        orgs, desc="Plotting how_often for each organisation", total=len(orgs)
    ):
        fig: matplotlib.figure.Figure = plot_frequency(
            df, question=question_final, org=org
        )
        # "/" would be read as a path separator, so it is replaced in the file name.
        org_filename = org.replace("/", "-")
        fig.savefig(os.path.join(question_folder, f"{org_filename}.png"), dpi=200)
        # One figure per organisation per question is created here; close each
        # one after saving so they do not pile up in memory.
        plt.close(fig)

    print()

2024-05-09 05:51:44.389625: Executing code for question 4.2...
2024-05-09 05:51:44.390120: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 05:54:23.530828: Executing code for question 4.3...
2024-05-09 05:54:23.534278: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 05:56:29.881344: Executing code for question 4.4...
2024-05-09 05:56:29.885385: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 05:58:34.959678: Executing code for question 4.5...
2024-05-09 05:58:34.962000: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 06:02:05.144523: Executing code for question 5.1...
2024-05-09 06:02:05.148228: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 06:03:54.851534: Executing code for question 5.2...
2024-05-09 06:03:54.854847: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 06:05:56.998690: Executing code for question 5.3...
2024-05-09 06:05:57.001123: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]


2024-05-09 06:07:35.652430: Executing code for question 5.4...
2024-05-09 06:07:35.655335: Comparing similarity...


Plotting how_often for each organisation:   0%|          | 0/196 [00:00<?, ?it/s]




In [None]:
# Need to download the data folder and zip it on Windows.
# clear_data_folder()

In [13]:
def split_paragraphs_with_score_to_dict(row):
    """Summarise one document row as a plain dictionary.

    Args:
        row: Mapping-like object (a DataFrame row when used with
            ``df.apply(..., axis=1)``) providing "split_paragraphs",
            "deviation_scaled_with_length", "sims_scaled", "all_mean",
            "pure_mean_sims", "weighted_mean_sims" and "top_k_mean".
            Every entry of "split_paragraphs" is unpacked as a 3-tuple of
            (page, text, <ignored third element>).

    Returns:
        A dict with the four document-level scores followed by
        "splittede_paragrafer": one dict per paragraph, ordered by
        "deviation_scaled_with_length" from highest to lowest (ties keep their
        original order).
    """
    # Pair each paragraph with its two scores, then rank by the first score.
    ranked = sorted(
        zip(
            row["split_paragraphs"],
            row["deviation_scaled_with_length"],
            row["sims_scaled"],
        ),
        key=lambda scored: scored[1],
        reverse=True,
    )

    summary = {
        "dokument_score_alt_gjennomsnitt": row["all_mean"],
        "dokument_score_rent_gjennomsnitt": row["pure_mean_sims"],
        "dokument_score_gjennomsnitt_vektet_med_lengden": row["weighted_mean_sims"],
        f"dokument_score_gjennomsnitt_top_{DEFAULT_K}": row["top_k_mean"],
    }

    paragraphs = []
    for (page, text, _), length_scaled_score, plain_score in ranked:
        paragraphs.append(
            {
                "side": page,
                "tekst": text,
                "score": length_scaled_score,
                "score_uten_å_ta_hensyn_til_lengde": plain_score,
            }
        )
    summary["splittede_paragrafer"] = paragraphs
    return summary

In [14]:
# For a question: score the documents against the question's reference
# embedding, then write one JSON file per document (its scored paragraphs) to
# data/<question id>/<organisation>/<title>.json.
cwd = os.getcwd()
data_folder = os.path.join(cwd, "data")
orgs = get_all_organisations(df)
deps = get_departments(df)
dirs = get_directorates(df)
ffed = False  # NOTE(review): never read in this cell — candidate for removal
for x, y in questions.items():
    print(f"{datetime.now()}: Executing code for question {x}...")
    question_folder = os.path.join(data_folder, x)
    os.makedirs(question_folder, exist_ok=True)

    question_reference = y["question_reference"]
    question_final = y["question_final"]
    inn_texts = y["reference_texts"]
    inn_texts_embedding = y["reference_embedding"]

    print(f"{datetime.now()}: Comparing similarity...")
    # Called for its effect on `df`; presumably it (re)computes the score columns
    # read by split_paragraphs_with_score_to_dict — TODO confirm in src.utils.
    populate_everything(df, inn_texts_embedding)

    print(f"{datetime.now()}: Getting and writing json representation...")
    # NOTE(review): this folder is created but nothing is written into it — the
    # files below go to <question_folder>/<organisation>/. Confirm which layout
    # is intended before changing it.
    texts_folder = os.path.join(
        question_folder, "tekstsnutter_med_score_for_hvert_dokument"
    )
    os.makedirs(texts_folder, exist_ok=True)
    df["json"] = df.apply(split_paragraphs_with_score_to_dict, axis=1)
    json_column = df.columns.get_loc("json")
    # iterrows() yields index *labels*, while .iloc needs *positions*; the two
    # only coincide for a default 0..n-1 index, so the position is tracked
    # explicitly with enumerate().
    rows = tqdm(df.iterrows(), desc="Writing .json files", total=len(df))
    for position, (index_label, row) in enumerate(rows):
        # "/" would be read as a path separator, so it is replaced in names.
        company_folder = os.path.join(question_folder, row["name"].replace("/", "-"))
        os.makedirs(company_folder, exist_ok=True)

        safe_title = row["title"].replace("/", "-")
        filename = os.path.join(company_folder, f"{safe_title}.json")
        # If the name is taken, append -1, -2, ... before ".json" until it is free.
        duplicate_number = 1
        while os.path.exists(filename):
            filename = os.path.join(
                company_folder, f"{safe_title}-{duplicate_number}.json"
            )
            duplicate_number += 1

        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # encoding must be fixed to UTF-8 instead of the platform default.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(row["json"], f, ensure_ascii=False)
        # Drop the reference to the already written dictionary.
        df.iloc[position, json_column] = None

    print()
    # NOTE(review): because of this break only the first question is exported;
    # it looks like a debugging leftover — remove it to export every question.
    break

2024-05-09 08:22:19.922753: Executing code for question 1.1...
2024-05-09 08:22:19.923572: Comparing similarity...
2024-05-09 08:22:34.789659: Getting and writing json representation...


Writing .json files:   0%|          | 0/1775 [00:00<?, ?it/s]




In [None]:
# Need to download the data folder and zip it on Windows.
# clear_data_folder()