# Search for occurrences and omissions of names

## 1 Installing and importing needed modules

When running on a remote JupyterLab instance, the required packages have to be installed explicitly:

In [None]:
# Install a pip package in the current Jupyter kernel
import sys

!{sys.executable} -m pip install pandas==2.1.4

from concurrent.futures import ProcessPoolExecutor
import pandas as pd
import time
import glob
import re
import os
from pathlib import Path


from utils import process_bkv

## 2 Read data from files

In [None]:
# Locations of the two input CSV files
verses_data = "../data/verses.csv"
word_data = "../data/names.csv"

# Column types of the verses table: one integer id, all other columns text
verses_dtypes = {"docID": "int32"}
verses_dtypes.update(
    dict.fromkeys(
        [
            "ga",
            "bkv",
            "nkv",
            "text",
            "marks",
            "publisher",
            "source",
            "ntvmrLink",
        ],
        "string",
    )
)

# Column types of the names table: two integer ids, all other columns text
names_dtypes = dict.fromkeys(
    [
        "label:en",
        "label:el",
        "alternatives",
        "gender",
        "variant",
        "variants",
        "type",
        "factgrid",
    ],
    "string",
)
names_dtypes.update({"wordID": "int64", "variantID": "int64"})

# Only these columns of the names table are loaded
names_columns = [
    "label:en",
    "label:el",
    "gender",
    "variant",
    "variants",
    "wordID",
    "variantID",
    "factgrid",
]

# Load both CSV files into DataFrames
verses_df = pd.read_csv(verses_data, low_memory=False, dtype=verses_dtypes)
gendervoc_df = pd.read_csv(
    word_data,
    low_memory=False,
    dtype=names_dtypes,
    usecols=names_columns,
)

## 3 Search for omissions and occurrences by bkv 

In [None]:
# Record the start time so the total runtime can be reported below
start_time = time.time()

# Get unique values from the 'bkv' column
unique_bkvs = verses_df["bkv"].unique()
print(f"number of verse names: {len(unique_bkvs)}")

# Fan the work out over a pool of worker processes, one task per bkv.
# The futures are kept (paired with their bkv) so that an exception raised
# inside a worker is reported instead of being silently discarded.
with ProcessPoolExecutor(max_workers=10) as executor:
    # To skip verses whose result file already exists, filter with:
    # if not Path(f"../data/occurrences/{bkv}.csv").exists()
    submitted = [
        (
            bkv,
            executor.submit(
                process_bkv, bkv, "../data/occurrences", verses_df, gendervoc_df
            ),
        )
        for bkv in unique_bkvs
    ]

# Record the end time; leaving the `with` block waits for all tasks to finish
end_time = time.time()

# Every future is done at this point, so exception() returns immediately.
# Failures are printed rather than raised so one bad verse does not hide the
# results of the others.
failed = [
    (bkv, future.exception())
    for bkv, future in submitted
    if future.exception() is not None
]
for bkv, error in failed:
    print(f"processing of {bkv} failed: {error!r}")
print(f"{len(submitted) - len(failed)} of {len(submitted)} tasks succeeded")

# Calculate the runtime
runtime = end_time - start_time
print(f"The script took {runtime:.4f} seconds to execute.")

# on server: 5128sec
# on laptop: 10043sec

## 4 Merging multiple CSV files into one

In [None]:
# Pattern matching the per-verse result files written by the search step
occurrence_pattern = os.path.join("../data/occurrences", "B*.csv")
# glob returns matches in arbitrary order; sort them so the merged file has
# the same row order (and keeps the same duplicates) on every run
files = sorted(glob.glob(occurrence_pattern))
if not files:
    # pd.concat would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        f"no files matching {occurrence_pattern} - run the search step first"
    )

# Read every file and stack the rows into a single DataFrame
df = pd.concat(map(pd.read_csv, files), ignore_index=True)
# Drop duplicate rows (compared on all columns, before any column is removed)
df = df.drop_duplicates(ignore_index=True)
# Drop columns that are not needed in the merged output.
# NOTE(review): the original comment called docID/bkv/text the 'primary key',
# yet docID is dropped here - confirm which columns identify a row.
df = df.drop(
    columns=["docID", "nkv", "marks", "publisher", "source", "ntvmrLink"],
)

# write to csv
df.to_csv("../data/occurrences.csv", index=False)