In [None]:
# Install the pinned dependencies (pandas, requests) into the environment of the current Jupyter kernel
import sys

!{sys.executable} -m pip install pandas==2.1.4
!{sys.executable} -m pip install requests==2.31.0

import requests
import pandas as pd

## Add data from DBpedia

### Query, Endpoint and Parameters

In [None]:
# SPARQL query: every resource filed under one of the four Greek New Testament
# manuscript categories, with its label and the optional infobox properties
# form / number / found.
#
# All prefixes used in the query are declared explicitly. `rdfs:` and `dbp:`
# were previously undeclared, which only works on endpoints that predefine
# them and is rejected by a strict SPARQL parser.
#
# The FILTER references ?entryLabel, so only rows that carry an English label
# are returned, even though the label pattern itself is OPTIONAL.
sparql_query = """
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dbc: <http://dbpedia.org/resource/Category:>
PREFIX dbp: <http://dbpedia.org/property/>

SELECT DISTINCT ?entry ?entryLabel ?form ?number ?found
WHERE {
	VALUES ?concept {
		dbc:Greek_New_Testament_lectionaries
		dbc:Greek_New_Testament_minuscules
		dbc:Greek_New_Testament_uncials
		dbc:New_Testament_papyri
	}
	?entry dcterms:subject ?concept .
	
	OPTIONAL{?entry rdfs:label ?entryLabel}
	OPTIONAL{?entry dbp:form ?form}
	OPTIONAL{?entry dbp:number ?number}
	OPTIONAL{?entry dbp:found ?found}
	
	FILTER (langMatches(lang(?entryLabel), "en"))
}
"""

# DBpedia SPARQL endpoint
sparql_endpoint = "http://dbpedia.org/sparql"


# Request parameters: the query text plus the desired result serialisation
params = {"query": sparql_query, "format": "json"}

### Request data

In [None]:
# Upper bound (seconds) for the HTTP call, so that a stalled endpoint cannot
# hang the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 60

# Send the SPARQL query to DBpedia
response = requests.get(
    sparql_endpoint, params=params, timeout=REQUEST_TIMEOUT_SECONDS
)

# Stop here on failure. Merely printing the error would leave
# `manuscripts_sparql_df` undefined (or stale from an earlier run) and the
# following cells would fail or silently work on old data.
if response.status_code != 200:
    raise RuntimeError(f"Error: {response.status_code} - {response.text}")

# Parse the JSON response
data = response.json()

# Extract the bindings from the response
bindings = data["results"]["bindings"]

# Flatten each binding to {variable name: value}; variables that are unbound
# for a row are simply absent from that row's dictionary.
results_list = [
    {key: binding[key]["value"] for key in binding} for binding in bindings
]

# `head.vars` lists every selected variable. Passing it as the column list
# guarantees a column for each of them even when a variable is unbound in all
# rows (or when there are no rows at all). If the response carries no `head`,
# the columns are inferred from the rows as before.
selected_vars = data.get("head", {}).get("vars")

# Create a pandas DataFrame from the list
manuscripts_sparql_df = pd.DataFrame(results_list, columns=selected_vars)

### Cleanup 'number' column

In [None]:
def has_decimal(string: str) -> bool:
    """Tell whether ``string`` parses to a number that is not a whole number.

    The value is parsed with ``float``; anything ``float`` cannot parse raises
    the corresponding ``ValueError`` / ``TypeError``. NaN and infinities are
    not whole numbers and therefore yield ``True``.
    """
    return not float(string).is_integer()


# Custom function to clean and convert values to integers
def clean_and_convert(string: str) -> int | None:
    try:
        if has_decimal(string):
            return None
        else:
            cleaned_value = "".join(filter(str.isdigit, string))
            return int(cleaned_value) if cleaned_value else None
    except:
        return None


# Work on a copy so the frame built from the SPARQL response stays untouched.
manuscripts_cleanup1_df = manuscripts_sparql_df.copy()

# Normalise every raw 'number' value to an integer (None when unusable).
cleaned_numbers = manuscripts_cleanup1_df["number"].apply(clean_and_convert)
manuscripts_cleanup1_df["number"] = cleaned_numbers

# Numbers above 3000 are considered mistakes and are blanked out.
number_too_large = manuscripts_cleanup1_df["number"] > 3000
manuscripts_cleanup1_df.loc[number_too_large, "number"] = None

### Cleanup 'found' column

In [None]:
manuscripts_cleanup2_df = manuscripts_cleanup1_df.copy()

# Missing 'found' values become empty strings so that every value can be
# joined as text below.
manuscripts_cleanup2_df["found"] = (
    manuscripts_cleanup2_df["found"].fillna("").astype(str)
)

# Collapse rows that differ only in 'found' into a single row whose 'found'
# holds all values, comma-separated.
# dropna=False is essential: by default groupby discards every row that has a
# missing value in one of the key columns, which silently removed all
# manuscripts without a 'form' or a (valid) 'number'.
manuscripts_cleanup2_df = manuscripts_cleanup2_df.groupby(
    ["entry", "entryLabel", "form", "number"], as_index=False, dropna=False
)["found"].agg(",".join)

### (Re)Generate the GA number from manuscript 'number' and 'form'

In [None]:
def generate_ga(row):
    """Build the GA identifier for one row from its 'form' and 'number'.

    The number is rendered as an integer and prefixed according to the form:
    "P" for Papyrus, "0" for Uncial, nothing for Minuscule and "L" for
    Lectionary. Returns ``None`` when 'form' or 'number' is missing or when
    the form is none of those four.
    """
    if pd.isna(row["form"]) or pd.isna(row["number"]):
        return None

    prefix_by_form = (
        ("Papyrus", "P"),
        ("Uncial", "0"),
        ("Minuscule", ""),
        ("Lectionary", "L"),
    )
    for form_name, prefix in prefix_by_form:
        if row["form"] == form_name:
            return prefix + str(int(row["number"]))

    # Unknown form: no identifier can be derived.
    return None


# Derive the GA identifier for every row, then bring the frame into the column
# layout used for the merge: 'entry' -> 'dbpedia', 'entryLabel' -> 'label',
# 'number' removed (it is only needed to build 'ga'), and a constant 'source'
# column marking the origin of these rows.
manuscripts_cleanup3_df = (
    manuscripts_cleanup2_df.copy()
    .assign(ga=lambda frame: frame.apply(generate_ga, axis=1))
    .rename(columns={"entry": "dbpedia", "entryLabel": "label"})
    .drop(columns=["number"])
    .assign(source="dbpedia")
)

manuscripts_cleanup3_df.head(-1)

### Merge with already known data

In [None]:
# Read the already known manuscripts from manuscripts_json_tei.csv
manuscripts_df = pd.read_csv("../data/manuscripts_json_tei.csv")

# Append the DBpedia rows below the known ones with a fresh 0..n-1 index.
merged_df = pd.concat([manuscripts_df, manuscripts_cleanup3_df], ignore_index=True)

# Missing page counts become 0 so the column can be cast to int; the zeros are
# turned back into missing values in the cleanup below.
merged_df["pagesCount"] = merged_df["pagesCount"].fillna(0).astype(int)

# Intended column types.
# NOTE(review): this mapping was previously passed to `merged_df.astype(...)`
# with the result thrown away, i.e. it never had any effect. It is deliberately
# still not applied: casting to string before the cleanup below would turn the
# numeric zeros into "0" text that the cleanup no longer matches, and would
# change what ends up in the CSV. Types are inferred by `convert_dtypes()`
# when the file is written. Confirm whether string columns are really wanted.
column_types = {
    "docID": "string",
    "originYearLate": "string",
    "originYearEarly": "string",
    "pagesCount": "string",
    "leavesCount": "string",
    "century": "string",
    "source": "string",
    "label": "string",
    "dbpedia": "string",
    "found": "string",
}

# cleanup: a value of 0 in these columns stands for "unknown".
# The result is assigned back instead of calling `replace(..., inplace=True)`
# on `merged_df[column]`: that chained in-place form stops updating
# `merged_df` once pandas Copy-on-Write is active. The key 0 also matches 0.0.
for column in ("originYearLate", "originYearEarly", "pagesCount", "leavesCount"):
    merged_df[column] = merged_df[column].replace({0: None})

## Writing to file


In [None]:
# Let pandas pick the best-fitting dtype for every column, then write the
# result without the row index.
manuscripts_out_df = merged_df.convert_dtypes()
manuscripts_out_df.to_csv("../data/manuscripts.csv", index=False)