In [None]:
# Install a pip package in the current Jupyter kernel
!pip install --quiet pandas==2.1.4

import pandas as pd
import json
from datetime import date

In [None]:
# copy files to output directory
!cp ../data/manuscripts.csv ../data/out/manuscripts.csv
!cp ../data/verses.csv ../data/out/verses.csv
!cp ../data/words.csv ../data/out/words.csv
!cp ../data/occurrences.csv ../data/out/occurrences.csv

In [None]:
# Load the copied tables from the output directory.
# low_memory=False makes pandas read each file in one pass instead of in
# chunks, so column dtypes are inferred from the complete column.
manuscripts_df = pd.read_csv("../data/out/manuscripts.csv", low_memory=False)
verses_df = pd.read_csv("../data/out/verses.csv", low_memory=False)
words_df = pd.read_csv("../data/out/words.csv", low_memory=False)
occurrences_df = pd.read_csv("../data/out/occurrences.csv", low_memory=False)

In [None]:
# Keep only the rows of the words table that describe names. After the filter
# the "type" column is constant, so it is dropped. Chaining on .loc yields an
# independent frame instead of mutating a slice of words_df in place (which
# raised SettingWithCopyWarning before).
names_words_df = words_df.loc[words_df["type"] == "name"].drop(columns=["type"])
# get the unique wordIDs of all names
names_wordIDs = names_words_df["wordID"].unique()
# keep only the occurrences that refer to one of those wordIDs
names_occurrences_df = occurrences_df.loc[
    occurrences_df["wordID"].isin(names_wordIDs)
]


column_types_names = {
    "label:en": "string",
    "gender": "string",
    "label:el:norm": "string",
    "factgrid": "string",
    "variant": "string",
    "wordID": "Int64",
    "variantID": "Int64",
}
# astype returns a new frame; the result must be assigned for the casts to
# take effect (previously the converted frame was silently discarded).
names_words_df = names_words_df.astype(column_types_names)
# Mark missing text values as "NA". Only the string columns are filled: a
# nullable integer column cannot store the string "NA"; any missing integer is
# rendered as "NA" by na_rep when the file is written below.
names_words_df = names_words_df.fillna(
    {
        column: "NA"
        for column, dtype in column_types_names.items()
        if dtype == "string"
    }
)

column_types_occurrences = {
    "verse_id": "string",
    "variantID": "Int64",
    "wordID": "Int64",
    "occurrence": "boolean",
}
# same as above: keep the converted frame
names_occurrences_df = names_occurrences_df.astype(column_types_occurrences)


# write both dataframes to files
names_words_df.to_csv("../data/out/names.csv", index=False, na_rep="NA")
names_occurrences_df.to_csv("../data/out/name_occurrences.csv", index=False)

# Create a big file for use by theologians

Because theologians should also be able to work with our data, we build one large file that includes everything we know about verses, manuscripts, words and occurrences that they might need.

![Pseudo Database Structure](../data/out/relations.svg)

In [None]:
# Collapse the words table to one row per wordID: every other column becomes
# the set of distinct values recorded for that word.
# NOTE(review): a set has no defined order, so the textual form of these cells
# may differ between runs once the frame is written to CSV.
words_squashed = (
    words_df
    .groupby("wordID")
    .agg(lambda values: set(values))
    .reset_index()
)
words_squashed.head()

In [None]:
# Attach the per-word value sets to every occurrence row. A left join keeps
# all occurrences, including those whose wordID has no match in the words table.
merged1 = occurrences_df.merge(words_squashed, how="left", on="wordID")
merged1.head()

In [None]:
# Add the verse information to every occurrence row (left join on verse_id)
# and give the columns their final names: the "_x" column comes from the
# occurrences side of the first merge, the "_y" column from the squashed words.
merged2 = merged1.merge(verses_df, how="left", on="verse_id").rename(
    columns={
        "variantID_x": "variantID",
        "variant": "variants",
        "variantID_y": "variantIDs",
    }
)
merged2.head()

In [None]:
# Write the combined table; the file name carries today's date (ISO format).
theo_occurrences_path = f"../data/out/theo_occurrences_{date.today()}.csv"
merged2.to_csv(theo_occurrences_path, index=False)

# Create Data Dicts for each file

In [None]:
# Metadata shared by all generated data dictionaries.

# Markers that the variable definitions report as their missing-value code.
NA_VALUE = "NA"
NAN_VALUE = "-1"

TITLE = (
    "A Corpus of Biblical Names in the Greek New Testament to Study "
    "the Additions, Omissions, and Variations across different Manuscripts"
)
FILE_FORMAT = "csv"
CONTENT_URL = "https://github.com/chr-werner/SemDH2024-GreekNewTestamentNames"
# ISO formatted date (YYYY-MM-DD) of the day this cell is executed
DATE_PUBLISHED = date.today().isoformat()
KEYWORDS = [
    "New Testament",
    "Biblical Names",
    "Textual Variation Units",
]
LICENCE = "CC BY 4.0"
FUNDER = (
    "Deutsche Forschungsgemeinschaft "
    "(DFG, German Research Foundation) 513300936"
)


def generate_json(
    name: str,
    file_format: str,
    file_name: str,
    content_url: str,
    date_published: str,
    keywords: list[str],
    license_info: str,
    funder: str,
    authors: list[dict[str, str]],
    distribution_name: str,
    variable_measured: list[dict[str, str]],
    output_file: str,
):
    """Write a schema.org ``Dataset`` description (data dictionary) as JSON.

    :param name: Title of the work
    :param file_format: Format of the described data file
    :param file_name: Name of the described data file
    :param content_url: GitHub content URL
    :param date_published: Publishing date of the data
    :param keywords: List of keywords
    :param license_info: Licence applied to the data
    :param funder: Funder of the project
    :param authors: One dict per author; the keys ``identifier``,
        ``givenName``, ``familyName``, ``email`` and ``affiliation`` are
        required, any other key is ignored
    :param distribution_name: Name of the distribution entry
    :param variable_measured: One dict per variable; the keys ``identifier``,
        ``unitText``, ``missingValuesAllowed`` and ``description`` are
        required, ``missingValuesValues`` is optional and defaults to ``"NA"``
    :param output_file: Path of the JSON file to write
    :return: None; the document is written to ``output_file``
    """
    # One schema.org Person per author, restricted to the documented keys.
    author_entries = []
    for author in authors:
        author_entries.append(
            {
                "@type": "Person",
                "identifier": author["identifier"],
                "givenName": author["givenName"],
                "familyName": author["familyName"],
                "email": author["email"],
                "affiliation": author["affiliation"],
            }
        )

    # One schema.org PropertyValue per described column.
    variable_entries = []
    for variable in variable_measured:
        missing_values = {
            "@type": "Text",
            "missingValuesAllowed": variable["missingValuesAllowed"],
            # variables without an explicit marker are reported with "NA"
            "missingValuesValues": variable.get("missingValuesValues", "NA"),
        }
        variable_entries.append(
            {
                "@type": "PropertyValue",
                "identifier": variable["identifier"],
                "unitText": variable["unitText"],
                "disambiguatingDescription": missing_values,
                "description": variable["description"],
                # a "sameAs" link is not emitted (yet)
            }
        )

    data = {
        "@context": "https://schema.org/",
        "@type": "Dataset",
        "name": name,
        "fileFormat": file_format,
        "fileName": file_name,
        "contentUrl": content_url,
        "datePublished": date_published,
        "keywords": keywords,
        "license": license_info,
        "funder": funder,
        "author": author_entries,
        "distribution": {
            "@type": "DataDownload",
            "name": distribution_name,
            "fileFormat": file_format,
        },
        "variableMeasured": variable_entries,
    }

    # Serialise with a four-space indent straight into the target file.
    with open(output_file, "w") as f:
        json.dump(data, f, indent=4)

In [None]:
# Author records for the data dictionaries, one row per person:
# (ORCID iD, given name, family name, e-mail, affiliation)
authors = [
    dict(
        zip(
            ("identifier", "givenName", "familyName", "email", "affiliation"),
            record,
        )
    )
    for record in (
        (
            "0009-0008-9907-251X",
            "Christoph",
            "Werner",
            "christoph.werner@hs-wismar.de",
            "University of Wismar",
        ),
        (
            "0000-0002-9784-7034",
            "Zacharias",
            "Shoukry",
            "zacharias.shoukry@uni-rostock.de",
            "University of Rostock",
        ),
        (
            "0000-0003-1098-208X",
            "Soham",
            "Al-Suadi",
            "soham.al-suadi@uni-rostock.de",
            "University of Rostock",
        ),
        (
            "0000-0002-7925-3363",
            "Frank",
            "Krüger",
            "frank.krueger@hs-wismar.de",
            "University of Wismar",
        ),
    )
]

In [None]:
# Variable definitions consumed by generate_json (one dict per column).
bkv = {
    "identifier": "bkv",
    "unitText": "character",
    "missingValuesAllowed": True,
    "missingValuesValues": NA_VALUE,
    "description": "Verse Identifier by BKV Scheme",
}
# "century" is listed among the manuscript columns; its description used to be
# a copy of the bkv text.
# NOTE(review): wording inferred from the column name - confirm with the data owners.
century = {
    "identifier": "century",
    "unitText": "character",
    "missingValuesAllowed": True,
    "missingValuesValues": NA_VALUE,
    "description": "Century of manuscript origin",
}
def _measured_variable(identifier, unit_text, description, missing_value=None):
    """Build one variable definition as consumed by generate_json.

    :param identifier: Column name as it appears in the data file
    :param unit_text: Data type label of the column
    :param description: Human readable description of the column
    :param missing_value: Marker used for missing values; ``None`` means that
        missing values are not allowed and no marker is recorded
    :return: Dict with the keys ``identifier``, ``unitText``,
        ``missingValuesAllowed``, ``description`` and - only when missing
        values are allowed - ``missingValuesValues``
    """
    definition = {
        "identifier": identifier,
        "unitText": unit_text,
        "missingValuesAllowed": missing_value is not None,
    }
    if missing_value is not None:
        definition["missingValuesValues"] = missing_value
    definition["description"] = description
    return definition


dbpedia = _measured_variable("dbpedia", "character", "DBpedia item id", NA_VALUE)
docID = _measured_variable(
    "docID", "integer", "Document Identifier by INTF scheme", NA_VALUE
)
edition_date = _measured_variable(
    "edition_date", "character", "Transcription date", NA_VALUE
)
edition_version = _measured_variable(
    "edition_version", "character", "Transcription edition", NA_VALUE
)
encoding_version = _measured_variable(
    "encoding_version", "numeric", "Version of encoding scheme", NA_VALUE
)
factgrid = _measured_variable("factgrid", "character", "FactGrid ItemID", NA_VALUE)
founder = _measured_variable(
    "founder", "character", "Founding institution", NA_VALUE
)
# candidate "sameAs" link (not emitted): https://www.wikidata.org/prop/direct/P1577
ga = _measured_variable(
    "ga", "character", "Document Identifier by Gregory Aland Scheme"
)
gender = _measured_variable("gender", "character", "Genus", NA_VALUE)
label = _measured_variable("label", "character", "Manuscript name", NA_VALUE)
label_el_norm = _measured_variable(
    "label:el:norm", "character", "Normalized greek label"
)
label_en = _measured_variable("label:en", "character", "English label", NA_VALUE)
lection = _measured_variable(
    "lection", "character", "Lection Identifier", NA_VALUE
)
nkv = _measured_variable(
    "nkv", "character", "Verse Identifier by NKV Scheme", NA_VALUE
)

occurrence = _measured_variable(
    "occurrence", "logical", "Indicator of occurrence"
)
pagesCount = _measured_variable(
    "pagesCount", "integer", "Number of pages", NAN_VALUE
)
leavesCount = _measured_variable(
    "leavesCount", "integer", "Number of leaves", NAN_VALUE
)
publisher = _measured_variable(
    "publisher", "character", "Transcription publisher", NA_VALUE
)
publishing_date = _measured_variable(
    "publishing_date", "character", "Transcription publishing date", NA_VALUE
)
source = _measured_variable("source", "character", "Source of data")
sponsor = _measured_variable(
    "sponsor", "character", "Transcription sponsor", NA_VALUE
)
text = _measured_variable(
    "text", "character", "Transcription without gap annotations"
)
transcript = _measured_variable(
    "transcript", "character", "Transcription with gap annotations"
)
# NOTE: this name shadows the built-in type() for the rest of the notebook; it
# is kept because the cell that generates the words data dictionary refers to it.
type = _measured_variable("type", "character", "Word type")
variant = _measured_variable("variant", "character", "Spelling variant")
# The variantID column is described twice: the words table does not allow
# missing values, the occurrences table marks them with NAN_VALUE.
variantID_words = _measured_variable(
    "variantID", "integer", "Unique variant identifier"
)
variantID_occurrences = _measured_variable(
    "variantID", "integer", "Unique variant identifier", NAN_VALUE
)
verse_id = _measured_variable(
    "verse_id", "character", "Unique verse identifier"
)
wordID = _measured_variable("wordID", "integer", "Unique word identifier")

In [None]:
# Data dictionary for the words table
words_variables = [
    label_en,
    label_el_norm,
    gender,
    factgrid,
    variant,
    type,
    wordID,
    variantID_words,
]
generate_json(
    name=TITLE,
    file_format=FILE_FORMAT,
    file_name="words",
    content_url=CONTENT_URL,
    date_published=DATE_PUBLISHED,
    keywords=KEYWORDS,
    license_info=LICENCE,
    funder=FUNDER,
    authors=authors,
    distribution_name=TITLE,
    variable_measured=words_variables,
    # the output file name carries today's date
    output_file=f"../data/out/words_{date.today()}.json",
)

In [None]:
# Data dictionary for the verses table
verses_variables = [
    bkv,
    edition_date,
    edition_version,
    encoding_version,
    founder,
    ga,
    lection,
    nkv,
    publisher,
    publishing_date,
    source,
    sponsor,
    transcript,
    text,
    verse_id,
]
generate_json(
    name=TITLE,
    file_format=FILE_FORMAT,
    file_name="verses",
    content_url=CONTENT_URL,
    date_published=DATE_PUBLISHED,
    keywords=KEYWORDS,
    license_info=LICENCE,
    funder=FUNDER,
    authors=authors,
    distribution_name=TITLE,
    variable_measured=verses_variables,
    # the output file name carries today's date
    output_file=f"../data/out/verses_{date.today()}.json",
)

In [None]:
# Data dictionary for the occurrences table
occurrences_variables = [
    variantID_occurrences,
    occurrence,
    wordID,
    verse_id,
]
generate_json(
    name=TITLE,
    file_format=FILE_FORMAT,
    file_name="occurrences",
    content_url=CONTENT_URL,
    date_published=DATE_PUBLISHED,
    keywords=KEYWORDS,
    license_info=LICENCE,
    funder=FUNDER,
    authors=authors,
    distribution_name=TITLE,
    variable_measured=occurrences_variables,
    # the output file name carries today's date
    output_file=f"../data/out/occurrences_{date.today()}.json",
)

In [None]:
# Data dictionary for the manuscripts table
manuscripts_variables = [
    docID,
    pagesCount,
    leavesCount,
    ga,
    century,
    source,
    label,
    dbpedia,
]
generate_json(
    name=TITLE,
    file_format=FILE_FORMAT,
    file_name="manuscripts",
    content_url=CONTENT_URL,
    date_published=DATE_PUBLISHED,
    keywords=KEYWORDS,
    license_info=LICENCE,
    funder=FUNDER,
    authors=authors,
    distribution_name=TITLE,
    variable_measured=manuscripts_variables,
    # the output file name carries today's date
    output_file=f"../data/out/manuscripts_{date.today()}.json",
)