# Get/Generate words for search



## 1 Installing and importing needed modules

When running on a remote JupyterLab, packages that are needed have to be explicitly installed:

In [None]:
# Install a pip package in the current Jupyter kernel
import sys

# IPython shell escape: pip is invoked through sys.executable, i.e. the
# interpreter running this kernel, so the package is installed into the
# kernel's own environment. The pandas version is pinned to 2.1.4.
!{sys.executable} -m pip install pandas==2.1.4

# Imported only after the install step above has run.
import pandas as pd
from utils import df_remove_diacritics, string_to_list

## 2 Preprocess data 


In [None]:
# Load the pipe-separated table of names into a DataFrame.
names_table_path = "../tables/names_squashed.csv"
names_df = pd.read_csv(names_table_path, sep="|")

### 2.1 Remove rows with an empty Greek label

In [None]:
# Discard, in place, every row whose "label:el" value is missing and
# renumber the remaining rows from 0 (ignore_index=True).
names_df.dropna(
    subset=["label:el"],
    inplace=True,
    ignore_index=True,
)

### 2.2 Remove diacritics
As the transcribed text contains no accents and is completely lowercase, diacritics have to be removed from the data collected on persons.

In [None]:
# Text columns handed to the diacritics helper: the Greek label, the five
# singular case forms and the alternative spellings.
case_columns = ["Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg"]
columns = ["label:el", *case_columns, "alternatives"]

# The return value is discarded, so the helper presumably modifies names_df
# in place -- TODO confirm against utils.df_remove_diacritics.
df_remove_diacritics(names_df, columns)

### 2.3 Merging column data

All case forms and alternative spellings are merged into a single column named 'variants'; because the forms are collected into a set, identical forms are removed.

In [None]:
# Function to merge columns
def columns_to_set(row) -> set:
    """Collect the distinct comma-separated forms found in one row.

    Args:
        row: Iterable of cell values (e.g. one DataFrame row). Missing
            values (``None``/``NaN``) are skipped; every other cell is
            expected to be a string.

    Returns:
        Set of unique forms. Whitespace around each fragment is stripped and
        empty fragments are dropped, so ``"a, b"`` and ``"a,b"`` yield the
        same forms and no blank variant is produced.
    """
    forms = set()
    for cell in row:
        if pd.isnull(cell):
            continue
        for fragment in cell.split(","):
            # Without stripping, " b" and "b" would count as different forms.
            form = fragment.strip()
            if form:
                forms.add(form)
    return forms


# Columns that are folded into 'variants' and removed afterwards.
merged_columns = [
    "Nom Sg",
    "Gen Sg",
    "Dat Sg",
    "Akk Sg",
    "Voc Sg",
    "alternatives",
]

# The Greek label also contributes to the variants but stays in the frame.
columns = ["label:el", *merged_columns]

# Build one set of forms per row from the selected columns.
names_df["variants"] = names_df[columns].apply(columns_to_set, axis=1)

# Remove the columns that were merged; "label:de", "note" and "source" are
# not dropped here.
names_df.drop(columns=merged_columns, inplace=True)

### 2.4 Add type (future use)

In [None]:
#names_df["type"] = "name"

### 2.5 Remove factgrid link to keep entity numbers only

In [None]:
# Each 'factgrid' cell is treated as a comma-separated list of entity URLs.
# Split it, strip whitespace around each entry (otherwise " https://..."
# would not match the prefix removed below), drop duplicates and sort:
# a plain set of strings iterates in hash order, which changes between
# interpreter runs and would make the exploded row order non-reproducible.
# NOTE(review): astype(str) turns a missing value into the literal string
# "nan", which is then kept as an ID -- confirm whether that is intended.
names_df["factgrid"] = (
    names_df["factgrid"]
    .astype(str)
    .apply(lambda urls: sorted({url.strip() for url in urls.split(",")}))
)
# One row per factgrid entry
names_df = names_df.explode("factgrid")
# Remove the URL prefix so only the entity number remains; the URL contains
# dots, so it is matched literally rather than as a regular expression.
names_df["factgrid"] = names_df["factgrid"].str.replace(
    "https://database.factgrid.de/entity/", "", regex=False
)

## 4 Write to file

In [None]:
# add index for words: one ID per row of the frame as it stands here
# NOTE(review): if the frame was exploded on 'factgrid' before this cell, a
# name with several factgrid entries receives several wordIDs -- confirm.
names_df["wordID"] = range(len(names_df))
names_df = names_df.rename(columns={"variants": "variant"})
# Sets of strings iterate in hash order, which differs between interpreter
# runs; sort each set so that the exploded row order -- and therefore the
# variantID assigned below -- is reproducible.
names_df["variant"] = names_df["variant"].apply(sorted)
# explode names_df by column variant (one row per variant) and renumber the
# rows; the old index is discarded instead of being kept as a column
names_df = names_df.explode("variant").reset_index(drop=True)
# Adding a new column 'variantID' with unique numbers for each variant (row)
names_df["variantID"] = range(len(names_df))
# write to csv file
names_df.to_csv("../data/names.csv", index=False)