# Get/Generate words for search



## 1 Installing needed packages

When running on a remote JupyterLab, packages that are needed have to be explicitly installed:

In [None]:
# Install a pip package in the current Jupyter kernel
import sys

!pip install --quiet pandas==2.1.4

import pandas as pd
import unicodedata

## 2 Read/Preprocess data 


In [None]:
# Columns read from the names table: labels, gender, the singular case forms,
# alternative spellings and the FactGrid reference.
names_usecols = [
    "label:en", "gender", "label:el", "label:el:norm", "alternatives",
    "Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg",
    "factgrid",
]
names_df = pd.read_csv("../data/tables/names.csv", sep=",", usecols=names_usecols)

# Columns read from the nominals table: labels, gender and the singular and
# plural case forms.
nominals_usecols = [
    "label:en", "gender", "label:el",
    "Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg",
    "Nom Pl", "Gen Pl", "Dat Pl", "Akk Pl", "Voc Pl",
]
nominals_df = pd.read_csv(
    "../data/tables/nominals.csv", sep=",", usecols=nominals_usecols
)

### 2.1 Remove rows with an empty Greek label

In [None]:
# Rows without a Greek label cannot contribute a search word: drop them from
# both tables in place and renumber the remaining rows from 0.
for frame in (names_df, nominals_df):
    frame.dropna(subset=["label:el"], how="any", inplace=True, ignore_index=True)

### 2.2 Remove diacritics and merge column data
As the transcribed text contains no accents and is completely lowercase, diacritics have to be removed from the data collected on persons, and the text has to be lowercased. All cases and alternative spellings are then merged into a single column named 'variants', so that identical forms are kept only once.


In [None]:
def str_remove_diacritics(s: str) -> str:
    """Return *s* lowercased and without diacritical marks.

    The string is decomposed with Unicode NFKD normalisation, every
    non-spacing mark (general category ``Mn``) is discarded, and the
    remaining characters are joined and lowercased.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(base_chars).lower()


def df_remove_diacritics(df: pd.DataFrame, columns: list):
    """Strip diacritics from (and lowercase) the given columns of *df* in place.

    Args:
        df: Data frame to modify; the listed columns are overwritten.
        columns: Names of the columns to normalise.

    Missing cells are left as they are; every other cell is passed through
    ``str_remove_diacritics``.
    """

    def _normalise(cell):
        # Only non-null cells are normalised; null cells pass through.
        if pd.notnull(cell):
            return str_remove_diacritics(cell)
        return cell

    for name in columns:
        df[name] = df[name].apply(_normalise)


def columns_to_set(row) -> set:
    """Merge all spellings found in *row* into one set of distinct forms.

    Args:
        row: Iterable of cell values. Each non-null cell is a string that may
            hold several comma-separated spellings.

    Returns:
        The set of distinct spellings. Null cells are skipped, whitespace
        around each spelling is stripped, and empty fragments (e.g. from a
        trailing or doubled comma) are dropped.
    """
    variants = set()
    for cell in row:
        if pd.notnull(cell):
            # Strip so that "a, b" and "a,b" yield the same forms; without it
            # " b" and "b" would be kept as two different variants.
            stripped = (part.strip() for part in cell.split(","))
            variants.update(part for part in stripped if part)
    return variants

In [None]:
# Columns of names_df that hold a spelling of the name: the Greek label, the
# singular case forms and the alternative spellings.
columns = [
    "label:el", "Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg", "alternatives",
]

# Normalise these columns in place (diacritics removed, lowercased) ...
df_remove_diacritics(names_df, columns)
# ... then collapse them row by row into one set of distinct forms.
name_variant_sets = names_df[columns].apply(columns_to_set, axis=1)
names_df["variants"] = name_variant_sets

In [None]:
# Columns of nominals_df that hold a spelling of the nominal: the Greek label
# plus the singular and plural case forms.
columns = [
    "label:el",
    "Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg",
    "Nom Pl", "Gen Pl", "Dat Pl", "Akk Pl", "Voc Pl",
]

# Normalise these columns in place (diacritics removed, lowercased).
df_remove_diacritics(nominals_df, columns)
# The normalised label is copied into "label:el:norm"; this must happen after
# the diacritics have been removed from "label:el".
nominals_df["label:el:norm"] = nominals_df["label:el"]
# Collapse the spelling columns row by row into one set of distinct forms.
nominal_variant_sets = nominals_df[columns].apply(columns_to_set, axis=1)
nominals_df["variants"] = nominal_variant_sets

In [None]:
# The individual spelling columns have been merged into "variants" and are
# removed from both tables in place.
names_df.drop(
    columns=[
        "label:el", "Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg", "alternatives",
    ],
    inplace=True,
)

nominals_df.drop(
    columns=[
        "label:el",
        "Nom Sg", "Gen Sg", "Dat Sg", "Akk Sg", "Voc Sg",
        "Nom Pl", "Gen Pl", "Dat Pl", "Akk Pl", "Voc Pl",
    ],
    inplace=True,
)

### 2.3 Add type

In [None]:
# Tag every row with the table it came from, so the origin stays visible
# once both tables are combined.
for frame, kind in ((names_df, "name"), (nominals_df, "nominal")):
    frame["type"] = kind

### 2.4 Merge dataframes

In [None]:
# Stack the names on top of the nominals. Each table keeps its own row index,
# so index labels repeat in the result; columns present in only one of the
# tables are filled with NaN for the rows of the other.
merged_df = pd.concat(objs=[names_df, nominals_df], axis=0)

### 2.5 Explode by name

In [None]:
# Give every word (one row of the merged table) a running number.
merged_df["wordID"] = range(len(merged_df))
merged_df = merged_df.rename(columns={"variants": "variant"})
# The variants are stored as sets, and explode() emits set elements in
# arbitrary order. Sort each set first so that the row order -- and with it
# the variantID assigned below -- is the same on every run.
merged_df["variant"] = merged_df["variant"].map(
    lambda forms: sorted(forms) if isinstance(forms, (set, frozenset)) else forms
)
# One row per variant. reset_index() moves the old row labels into a column
# named "index".
merged_df = merged_df.explode("variant").reset_index()
# Give every variant (row) a unique running number.
merged_df["variantID"] = range(len(merged_df))

### 2.6 Remove factgrid link to keep entity numbers only

In [None]:
# Split each factgrid cell on "," into its distinct entries. Entries are
# stripped of surrounding whitespace and sorted, because explode() emits set
# elements in arbitrary order and the row order would otherwise differ
# between runs.
# NOTE(review): astype(str) turns a missing factgrid value into the string
# "nan"; this is kept as in the original output -- confirm it is intended.
merged_df["factgrid"] = (
    merged_df["factgrid"]
    .astype(str)
    .apply(lambda cell: sorted({part.strip() for part in cell.split(",")}))
)
# One row per factgrid entry.
merged_df = merged_df.explode("factgrid")
# Remove the entity URL prefix from every entry. regex=False makes the
# literal match explicit, since the prefix contains dots.
merged_df["factgrid"] = merged_df["factgrid"].str.replace(
    "https://database.factgrid.de/entity/", "", regex=False
)

## 3 Write to file

In [None]:
# The "index" column only holds the row labels from before the variants were
# exploded; it is not part of the output.
merged_df.drop(labels=["index"], axis="columns", inplace=True)
# Store the word list as CSV, without the data frame's row index.
merged_df.to_csv(path_or_buf="../data/words.csv", index=False)