In [160]:
import json
import pandas as pd

author_df = pd.read_csv("persons.csv")[["givenname", "familyname", "initials", "email", "prim_affil"]]
author_df["fullname"] = author_df["givenname"] + " " + author_df["familyname"]
author_df["prim_affil"] = author_df["prim_affil"].apply(lambda x: x[1:-1])
author_df.sort_values("familyname", inplace=True)
author_df = author_df.set_index("fullname")

institution_df = pd.read_csv("institutions.csv")[["mnemonic", "city", "country", "name", "representative"]].set_index("mnemonic")

for index, row in author_df.iterrows():
    # Check if the initials are missing and generate them from the given name
    if pd.isnull(row['initials']):
        given_name = row['givenname'].strip().split()
        author_df.loc[index, 'initials'] = "".join([name[0].upper() + "." for name in given_name])
    else:
        author_df.loc[index, 'initials'] = row['initials'].strip()

    # Check if the email is missing and assign it based on the representative's email
    if pd.isnull(row['email']):
        representative = institution_df.loc[row["prim_affil"], "representative"]
        author_df.loc[index, 'email'] = author_df.loc[representative, "email"] if not pd.isnull(representative) else "noemail"
    else:
        author_df.loc[index, 'email'] = row['email'].strip()


In [161]:

"""
    % Example output format for authors and affiliations in LaTeX

    \author[1]{1.~Author}[type=collab]
    \ead{author1@gsi.de}
    \credit{did nothing}
    \affiliation[1]{
        organization={Affiliation 1},
        city={Affiliation 1 City},
        country={Affiliation 1 Country},
    }
"""
registered_affiliations = []
affiliation_tex = ""
auhtor_tex = ""
aff_idx = 0
for index, row in author_df.iterrows():
    author = row['initials'] + ",~" + "~".join(row['familyname'].split())

    #### Exlucde authors by rules ###
    author_aff = institution_df.loc[row["prim_affil"]]
    if author_aff.country.lower() == "russia":
        continue

    #################################
    if row["prim_affil"] not in registered_affiliations:
        registered_affiliations.append(row["prim_affil"] )

    aff_idx = registered_affiliations.index(row["prim_affil"])
    auhtor_tex += "\\author[" + str(aff_idx+1) + "]{" + author + "}[type=collab]\n" + \
        "\\ead{" + row['email'] + "}\n\n"


for aff_idx, aff in enumerate(registered_affiliations):
    row = institution_df.loc[aff]
    affiliation_tex += \
        "\\affiliation[" + str(aff_idx+1) + "]{\n" + \
        "\torganization={" + row['name'].strip() + "},\n" + \
        "\tcity={" + row['city'].strip() + "},\n" + \
        "\tcountry={" + row['country'].strip() + "}\n" + \
        "}\n"

with open("authors_affiliations.tex", "w", encoding="utf-8") as o_file:
    o_file.write(auhtor_tex)
    o_file.write(affiliation_tex)

##### Using the author list from Walter's file

In [162]:
def levenshtein_distance(s1, s2):
    s1 = s1.lower()
    s2 = s2.lower()
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions  = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

def find_best_match_index(query, df, max_distance=2):
    query = query.lower()
    best_index = -1
    best_score = float('inf')

    print("looking a match for:", query)
    for index, row in df.iterrows():
        dist = levenshtein_distance(query, index.lower())
        if dist < best_score and dist <= max_distance:
            best_score = dist
            best_index = index
    return best_index

In [163]:
# Load author list from Walter's file
authors_to_include = []
with open("sts_cbmcdb.tex", "r") as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith("\\author"):
            # Take out comments in the line
            line = line.split("%%")
            name = line[1].split(" (")[0].strip()

            lastname = line[0].strip()
            open_brace_index = lastname.find("{") + 1
            close_brace_index = lastname.rfind("}")
            lastname = lastname[open_brace_index:close_brace_index].strip()
            if '.~' in lastname:
                lastname = lastname.split('.~')[1].strip()

            # Clean up LaTeX characters
            name = name.translate(str.maketrans('', '', '\{\}\\"^\'~'))
            lastname = lastname.translate(str.maketrans('', '', '\{\}\\"^\'~'))

            if len(name):
                authors_to_include.append(name+" "+lastname)
            else:
                authors_to_include.append(lastname)

print(authors_to_include)

['Apar Agarwal', 'Kshitij Agarwal', 'Zubayer Ahammed', 'Nazeer Ahmad', 'Luca Jonas Ahrens', 'Mohammad Al-Turany', 'Noor Alam', 'Julio Andary', 'Anton Andronic', 'Harald Appelshauser', 'Benedict Arnoldi-Meadows', 'Beatriz Artur', 'Mohd. Danish Azmi', 'Marcel Bajdel', 'Matthias Balzer', 'Arup Bandyopadhyay', 'Vlad Andrei Basceanu', 'Jurgen Becker', 'Marten Becker', 'Artemiy Belousov', 'Alexandru Bercuci', 'Roland Berendes', 'Denis Bertini', 'Olga Bertini', 'Martin Beyer', 'Oleg Bezshyyko', 'Partha Pratim Bhaduri', 'Anju Bhasin', 'Shabir Ahmad Bhat', 'Towseef Ahmad Bhat', 'Waseem Ahmad Bhat', 'Buddhadeb Bhattacharjee', 'Abhijit Bhattacharyya', 'Nilay Kumar Bhowmik', 'Saikat Biswas', 'Thomas Blank', 'Nora Bluhme', 'Christoph Blume', 'Gianluca Boccarella', 'Daniel Bonaventura', 'Janusz Brzychczyk', 'Marius Calin', 'Michele Caselle', 'Amlan Chakrabarti', 'Petr Chaloupka', 'Souvik Chattopadhyay', 'Subhasis Chattopadhyay', 'Hamda Cherif', 'Serhii Chernyshenko', 'Eoin Clerkin', 'Lady Maryann Co

In [164]:
"""
    % Example output format for authors and affiliations in LaTeX

    \author[1]{1.~Author}[type=collab]
    \ead{author1@gsi.de}
    \credit{did nothing}
    \affiliation[1]{
        organization={Affiliation 1},
        city={Affiliation 1 City},
        country={Affiliation 1 Country},
    }
"""
author_matches = 0
missing_authors = []
registered_affiliations = []
affiliation_tex = ""
auhtor_tex = ""
aff_idx = 0

for author in authors_to_include:
    match_index = find_best_match_index(author, author_df, 5)
    if match_index == -1:
        missing_authors.append(author)
        continue

    author_matches += 1

    row = author_df.loc[match_index]
    author = row['initials'] + ",~" + "~".join(row['familyname'].split())
    author_aff = institution_df.loc[row["prim_affil"]]

    print(f"{author} matched with {match_index} -> {author}")

    # .tex building rules for authors
    if row["prim_affil"] not in registered_affiliations:
        registered_affiliations.append(row["prim_affil"] )

    aff_idx = registered_affiliations.index(row["prim_affil"])

    auhtor_tex += "\\author[" + str(aff_idx+1) + "]{" + author + "}[type=collab]\n" + \
        "\\ead{" + row['email'] + "}\n\n"


for aff_idx, aff in enumerate(registered_affiliations):
    row = institution_df.loc[aff]
    affiliation_tex += \
        "\\affiliation[" + str(aff_idx+1) + "]{\n" + \
        "\torganization={" + row['name'].strip() + "},\n" + \
        "\tcity={" + row['city'].strip() + "},\n" + \
        "\tcountry={" + row['country'].strip() + "}\n" + \
        "}\n"

with open("authors_affiliations.tex", "w", encoding="utf-8") as o_file:
    o_file.write(auhtor_tex)
    o_file.write(affiliation_tex)

print(author_matches, len(authors_to_include))
print(missing_authors)

looking a match for: apar agarwal
A.,~Agarwal matched with Apar Agarwal -> A.,~Agarwal
looking a match for: kshitij agarwal
K.,~Agarwal matched with Kshitij Agarwal -> K.,~Agarwal
looking a match for: zubayer ahammed
Z.,~Ahammed matched with Zubayer Ahammed -> Z.,~Ahammed
looking a match for: nazeer ahmad
N.,~Ahmad matched with Nazeer Ahmad -> N.,~Ahmad
looking a match for: luca jonas ahrens
L.J.,~Ahrens matched with Luca Jonas Ahrens -> L.J.,~Ahrens
looking a match for: mohammad al-turany
M.,~Al-Turany matched with Mohammad Al-Turany -> M.,~Al-Turany
looking a match for: noor alam
N.,~Alam matched with Noor Alam -> N.,~Alam
looking a match for: julio andary
J.,~Andary matched with Julio Andary -> J.,~Andary
looking a match for: anton andronic
A.,~Andronic matched with Anton Andronic -> A.,~Andronic
looking a match for: harald appelshauser
H.,~Appelshäuser matched with Harald Appelshäuser -> H.,~Appelshäuser
looking a match for: benedict arnoldi-meadows
B.,~Arnoldi-Meadows matched with