# Add manuscript data from NTVMR metadata files
## 1 Installing needed packages

In [None]:
# Install a pip package in the current Jupyter kernel
!pip install --quiet pandas==2.1.4
!pip install --quiet tqdm==4.66.4

import json
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from glob import glob
from converters import docID_to_ga
from tqdm.notebook import tqdm

# Register tqdm's progress-bar integration with pandas
tqdm.pandas()

# Load manuscripts_tei.csv, keeping only the columns needed here and reading
# each of them with pandas' "string" dtype
tei_columns = ["docID", "ga", "label", "source"]
manuscripts_tei_df = pd.read_csv(
    "../data/manuscripts_tei.csv",
    usecols=tei_columns,
    dtype=dict.fromkeys(tei_columns, "string"),
    low_memory=False,
)

## 2 Process JSON files in parallel

Currently, we only extract docID, ga, pagesCount, leavesCount, and the estimated century (taken from originYear). However, the files contain much more data, e.g. for each page/image.

In [None]:
# Function to process each JSON file and extract information
def process_json_file(file_path):
    """Extract the manuscript summary fields from one NTVMR metadata JSON file.

    The values are read from the ``data.manuscript`` object of the file.
    ``docID``, ``pages.count`` and ``leaves.leavesCount`` are required: if one
    of them is missing the lookup raises instead of returning a partial row.
    The century is optional.

    TODO: it might be easier to work with XML files, which can be selected
    while downloading the manuscript metadata.

    :param file_path: path of the JSON file to read
    :return: dict with the keys ``docID``, ``pagesCount``, ``leavesCount``,
        ``ga``, ``century`` (``None`` when ``originYear.content`` is not
        available) and ``source`` (always ``"ntvmr"``)
    :raises KeyError: if a required field is missing from the file
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        manuscript = json.load(file)["data"]["manuscript"]

    # Required fields
    docID = manuscript["docID"]
    pagesCount = manuscript["pages"]["count"]
    leavesCount = manuscript["leaves"]["leavesCount"]

    # ga is derived from docID with our own converter: the file's own
    # leaves.gaNum was found to be identical for the 20000 and 30000 docIDs.
    ga = docID_to_ga(docID)

    # NOTE: not extracted yet (earlier drafts of this function read them):
    #   - originYear.late / originYear.early
    #   - shelfInstances.shelfInstance (a list, a single dict, or absent)

    # Sometimes century is not given: originYear may be absent (KeyError) or
    # not be an object with a "content" entry (TypeError).
    try:
        century = manuscript["originYear"]["content"]
    except (KeyError, TypeError):
        century = None

    return {
        "docID": docID,
        "pagesCount": pagesCount,
        "leavesCount": leavesCount,
        "ga": ga,
        "century": century,
        "source": "ntvmr",
    }


# List of JSON files. Sorted because glob() returns matches in arbitrary,
# filesystem-dependent order and the CSV written below should have a
# reproducible row order.
json_files = sorted(glob("../data/manuscripts/ntvmr/*.json"))

# Parse the files in parallel worker processes. executor.map yields the results
# in input order; wrapping it in tqdm shows progress as they are consumed.
with ProcessPoolExecutor() as executor:
    results = list(
        tqdm(executor.map(process_json_file, json_files), total=len(json_files))
    )


# Create a DataFrame from the results (one row per JSON file) and save it
manuscripts_json_df = pd.DataFrame(results)
manuscripts_json_df.to_csv("../data/manuscripts_json.csv", index=False)

## 3 Merge
Here we concatenate (row-wise) the newly generated data with the data already extracted in teiparse.ipynb.

In [None]:
# Stack the NTVMR JSON rows on top of the TEI rows and renumber the index
frames = [manuscripts_json_df, manuscripts_tei_df]
merged_df = pd.concat(frames, ignore_index=True)

## 4 Writing to file

Before writing to file, pagesCount and leavesCount should be integers. The nullable `Int64` dtype is used for this, so missing values remain missing (`<NA>`) rather than being set to 0.

In [None]:
# Target dtype for every column of the combined table. "Int64" is pandas'
# nullable integer dtype, so rows without a page/leaf count stay <NA> instead
# of forcing the whole column to float.
column_types = {
    "docID": "string",
    "pagesCount": "Int64",
    "leavesCount": "Int64",
    "ga": "string",
    "century": "string",
    "source": "string",
    "label": "string",
}
# astype returns a new DataFrame rather than converting in place, so the
# result must be assigned back; otherwise the CSV below is written with the
# unconverted dtypes.
merged_df = merged_df.astype(column_types)

merged_df.to_csv("../data/manuscripts_json_tei.csv", index=False)