# Search for occurrences and omissions of names

## 1 Installing needed packages

When running on a remote JupyterLab instance, the required packages have to be installed explicitly:

In [None]:
# Install a pip package in the current Jupyter kernel
!pip install --quiet pandas==2.1.4
!pip install --quiet flair==0.13.1
!pip install --quiet scipy==1.10.1
!pip install --quiet torch==2.1.2
!pip install --quiet swifter==1.4.0
!pip install --quiet tqdm==4.66.4

import pandas as pd
import concurrent.futures
from utils import process_bkv
from tqdm.notebook import tqdm

## 2 Read data from files

In [None]:
# Locations of the input CSV files
verses_data = "../data/verses.csv"
word_data = "../data/words.csv"

# Column name -> pandas dtype for the verses file. Every column listed here
# is also the set of columns that gets loaded.
verse_dtypes = {
    "ga": "string",
    "bkv": "string",
    "transcript": "string",
    "text": "string",
    "lection": "string",
    "publisher": "string",
    "source": "string",
    "edition_version": "float",
    "verse_id": "Int64",
}

# Column name -> pandas dtype for the words file.
# NOTE(review): "factgrid" has a dtype declared but is not part of the loaded
# columns below -- confirm whether it should be loaded or the entry dropped.
word_dtypes = {
    "label:en": "string",
    "label:el:norm": "string",
    "gender": "string",
    "variant": "string",
    "wordID": "Int64",
    "variantID": "Int64",
    "factgrid": "string",
}

# Read the CSV files into DataFrames, restricted to the columns of interest
verses_df = pd.read_csv(
    verses_data,
    low_memory=False,
    dtype=verse_dtypes,
    usecols=list(verse_dtypes),
)
words_df = pd.read_csv(
    word_data,
    low_memory=False,
    dtype=word_dtypes,
    usecols=[column for column in word_dtypes if column != "factgrid"],
)

## 3 Search for omissions and occurrences by bkv 

In [None]:
# Passed through unchanged to every process_bkv call.
overwrite = True

# Get unique values from the 'bkv' column
unique_bkvs = verses_df["bkv"].unique()
# For a quick trial run, restrict the work to a single verse:
# unique_bkvs = ["B01K12V22"]
total_bkvs = len(unique_bkvs)
print(f"number of verse names: {total_bkvs}")

# bkv -> exception raised while processing it. Without this bookkeeping a
# failing worker would go unnoticed, because an exception raised inside a
# worker process only surfaces when the corresponding future is inspected.
failed_bkvs = {}

# Run one process_bkv task per bkv in a pool of worker processes
with concurrent.futures.ProcessPoolExecutor() as executor:
    # Submit all tasks; remember which bkv each future belongs to
    futures = {
        executor.submit(
            process_bkv, bkv, "../data/occurrences", verses_df, words_df, overwrite
        ): bkv
        for bkv in unique_bkvs
    }

    # The context manager closes the progress bar even if the loop is interrupted
    with tqdm(total=total_bkvs, desc="Processing") as progress_bar:
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                failed_bkvs[futures[future]] = error
            progress_bar.update(1)  # One more task finished (successfully or not)

# Report failures instead of raising, so that one bad verse does not discard
# the results of all the others.
if failed_bkvs:
    print(f"{len(failed_bkvs)} of {total_bkvs} verse names failed:")
    for bkv, error in failed_bkvs.items():
        print(f"  {bkv}: {error!r}")

## 4 Merging multiple csv files to one
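The CSV files in `../data/occurrences` are concatenated with `awk` (which therefore needs to be installed). This works because all files share the same header: the header line is kept from the first file only and skipped in every following file. It is also computationally more efficient than first reading each file into a `pd.DataFrame` and then merging those into one.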

In [None]:
!awk 'FNR==1 && NR!=1 { next; } { print }' ../data/occurrences/B*.csv > ../data/occurrences.csv