In [1]:
from habanero import Crossref
from unidecode import unidecode
from bs4 import BeautifulSoup
from pathlib import Path
import pandas as pd
import os
from collections import OrderedDict, Counter
from pubman_manager import PubManAPI, create_sheet
from dotenv import load_dotenv
from dateutil import parser
import yaml
from fuzzywuzzy import process
import copy
import re

def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text()

def get_metadata(doi):
    cr = Crossref()
    try:
        result = cr.works(ids=doi)
        return result['message']
    except Exception as e:
        print(f"Failed to retrieve data for DOI {doi}: {e}")

def parse_date(date_value):
    print("tt",date_value)
    if isinstance(date_value, str):
        parsed_date = parser.parse(date_value)
        return parsed_date.strftime("%d.%m.%Y")
    elif isinstance(date_value, list) and all(isinstance(i, int) for i in date_value):
        # Assuming the format is [day, month, year]
        if len(date_value) == 3:
            year, month, day = date_value
            parsed_date = parser.parse(f"{day:02d}.{month:02d}.{year}")
            return parsed_date.strftime("%d.%m.%Y")
        elif len(date_value) == 2:
            year, month = date_value
            parsed_date = parser.parse(f"{month:02d}.{year}")
            return parsed_date.strftime("%m.%Y")
        elif len(date_value) == 1:
            return date_value[0]
    raise RuntimeError

def process_name(names_affiliations, name):
    # Check for an exact match first
    if name in names_affiliations:
        return name

    # Split the input name into parts
    abbrev_parts = name.split(' ')
    initials = [part[0] for part in abbrev_parts if len(part) == 1 or '.' in part]
    surname = abbrev_parts[-1]

    # Attempt to match with the most initials and correct surname
    best_match = None
    best_score = -1

    for full_name in names_affiliations:
        full_name_parts = full_name.split(' ')

        # Check if the surname matches
        if full_name_parts[-1] == surname:
            # Calculate the match score based on initials
            score = sum(fn.startswith(init) for fn, init in zip(full_name_parts, initials))

            # Prefer matches with more matching initials
            if score > best_score:
                best_match = full_name
                best_score = score
            elif score == best_score:
                # Tie-breaking: prefer longer full matches over partial matches
                if len(full_name_parts) > len(best_match.split(' ')):
                    best_match = full_name

    # If a best match is found, check for full name replacement
    if best_match and '.' in name:  # Check if the input is an abbreviation
        expanded_name = process_name(names_affiliations, best_match)
        if expanded_name != best_match:  # Check if it results in a full name expansion
            return expanded_name

    return best_match if best_match else name  # Return the best match or the original name if no match is found


def process_author_list(affiliations_by_name_pubman, affiliations_by_name, title):
    non_mpg_affiliations = Counter()
    for author, affiliations in affiliations_by_name.items():
        print("author, affiliations", author, affiliations)
        for i, proposed_affiliation in enumerate(affiliations if affiliations else ['']):
            print("affiliations_by_name_pubman.get(author)",affiliations_by_name_pubman.get(author))
            if affiliations_by_name_pubman.get(author):
                if not proposed_affiliation.strip():
                    affiliation, _score = process.extractOne(title, affiliations_by_name_pubman[author])
                    color = 'orange'
                else:
                    affiliation, score = process.extractOne(proposed_affiliation, affiliations_by_name_pubman[author])
                    print("SS",score, author, proposed_affiliation)
                    if score > 50:
                        color = 'yellow'
                    else:
                        affiliation = proposed_affiliation.replace('  ', ', ').replace(') ', '), ')
                        color = 'gray' if not 'Max-Planck' in affiliation else 'purple'
                # print(_score, proposed_affiliation, match)
                if affiliations_by_name[author]:
                    affiliations_by_name[author][i] = (affiliation, color)
                else:
                    affiliations_by_name[author].append((affiliation, color))
                if not 'Max-Planck' in affiliation:
                    non_mpg_affiliations[affiliation] += 1
            elif proposed_affiliation.strip():
                # print("not in pubman but exists",author, proposed_affiliation)
                affiliation = proposed_affiliation.replace('  ', ', ').replace(') ', '), ')
                color = 'gray' if not 'Max-Planck' in affiliation else 'purple'
                if affiliations_by_name[author]:
                    affiliations_by_name[author][i] = (affiliation, color)
                else:
                    affiliations_by_name[author].append((affiliation, color))
                if not 'Max-Planck' in affiliation:
                    non_mpg_affiliations[affiliation] += 1
            # print("nothing",author, proposed_affiliation)
    if non_mpg_affiliations:
        most_common_affiliation = non_mpg_affiliations.most_common(1)[0][0]
    else:
        most_common_affiliation = ''
    for author, affiliations in affiliations_by_name.items():
        if not affiliations:
            print("overriding affiliation;", author, most_common_affiliation)
            affiliations_by_name[author] = [(most_common_affiliation, 'red')]

def main():
    load_dotenv()
    username = os.getenv("USERNAME")
    password = os.getenv("PASSWORD")
    api = PubManAPI(username, password)

    publications_path = Path('publications')
    with open('authors_info.yaml', 'r', encoding='utf-8') as f:
        authors_info = yaml.safe_load(f)
    affiliations_by_name_pubman = OrderedDict({key: val['affiliations'] for key, val in authors_info.items() if val})

    for publication_sheet in publications_path.iterdir():
        df = pd.read_csv(publication_sheet, encoding='ISO-8859-1')
        prefill_publications = []
        # Check if the DOI column exists
        if 'DOI' not in df.columns:
            raise RuntimeError(f"DOI col not found in the CSV file.")
        dfo = df['DOI'].dropna()
        # for doi in list(dfo):
        # for doi in ['10.1007/s00410-024-02150-z']:
        # for doi in ['10.1007/s10853-024-09418-6']:
        for doi in ['10.1093/micmic/ozad120']:
        # for doi in ['10.1002/adma.202211796', '10.1002/adma.202401735']:
            print("doi", doi)
            pub = api.search_publication_by_criteria({
                "metadata.identifiers.id": doi,
                "metadata.identifiers.type": 'DOI'
            })
            if not pub or True:
                data = get_metadata(doi)
                print("metadata", data)
                title = unidecode(clean_html(data.get('title', [None])[0]))
                prefill_publication = OrderedDict({
                    "Title": [title, 35, ''],
                    # "Type": [data.get('type'), 15, ''],
                    "Journal Title": [unidecode(data.get('container-title', [None])[0]), 25, ''],
                    "Publisher": [unidecode(data.get('publisher', None) or ''), 20, ''],
                    "Issue": [data.get('issue', None), 10, ''],
                    "Volume": [data.get('volume', None), 10, ''],
                    "Page": [data.get('page', None), 10, ''],
                    "ISSN": [unidecode(data.get('ISSN', [None])[0] or ''), 15, ''],
                    "Date created": [parse_date(data.get('created', {}).get('date-time', None)), 20, ''],
                    'Date issued': [parse_date(data.get('issued', {}).get('date-parts', [[None]])[0]), 20, ''],
                    'Date published': [parse_date(data.get('published', {}).get('date-parts', [[None]])[0]), 20, ''],
                    'DOI': [doi, 20, ''],
                    'link': [data.get('resource', {}).get('primary', {}).get('URL', ''), 20, ''],
                })

                sheet_authors = OrderedDict()
                affiliations_by_name = OrderedDict()
                for author in data.get('author', []):
                    print("author",author)
                    author_name = process_name(affiliations_by_name_pubman, unidecode(author.get('given', '')) + ' ' + unidecode(author.get('family', '')))
                    affiliations_by_name[author_name] = []
                    for affiliation in author.get('affiliation', []):
                        affiliations_by_name[author_name].append(unidecode(affiliation.get('name', '')))

                print("affiliations_by_nameee",affiliations_by_name)
                # return
                process_author_list(affiliations_by_name_pubman, affiliations_by_name, title)
                i = 1
                for author, affiliations in affiliations_by_name.items():
                    for affiliation in affiliations:
                        prefill_publication[f"Author {i}"] = [author, None, '']
                        prefill_publication[f"Affiliation {i}"] = [affiliation[0], affiliation[1], '']
                        i = i+1
                prefill_publications.append(prefill_publication)
                # break
        # Convert publications list to a DataFrame
        if prefill_publications:
            # Get the number of authors
            n_authors = max(len(authors) for authors in affiliations_by_name_pubman.values())
            n_entries = len(prefill_publication) + 10
            # print("affiliations_by_name_pubman",affiliations_by_name_pubman)
            # print("publications[0].keys()",publications[0][0].keys())
            # print("n_authors",n_authors)
            # print("n_entries",n_entries)
            column_details = OrderedDict({
                key: [val[1], val[2]]
                for key, val in prefill_publication.items()
                if 'Author ' not in key and 'Affiliation ' not in key
            })
            # print("column_details",column_details)
            # print("prefill_metadata",[publication[0] for publication in publications])
            # print("prefill_authors",[publication[1] for publication in publications])
            # Create the detailed Excel sheet with autocomplete


            # create_sheet(f'./Publication Templates/{Path(publication_sheet.stem)}.xlsx', affiliations_by_name_pubman,
            #             column_details, n_authors,
            #             prefill_publications = prefill_publications)
            # print(f"Saved {f'./Publication Templates/{Path(publication_sheet.stem)}.xlsx'} successfully.")

            create_sheet(f'./Publication Templates/test.xlsx', affiliations_by_name_pubman,
                        column_details, n_authors,
                        prefill_publications = prefill_publications)
            print(f"Saved {f'./Publication Templates/test.xlsx'} successfully.")

        break
main()




doi 10.1093/micmic/ozad120
metadata {'indexed': {'date-parts': [[2024, 7, 19]], 'date-time': '2024-07-19T09:45:10Z', 'timestamp': 1721382310489}, 'reference-count': 62, 'publisher': 'Oxford University Press (OUP)', 'issue': '6', 'license': [{'start': {'date-parts': [[2023, 10, 19]], 'date-time': '2023-10-19T00:00:00Z', 'timestamp': 1697673600000}, 'content-version': 'vor', 'delay-in-days': 0, 'URL': 'https://creativecommons.org/licenses/by/4.0/'}], 'funder': [{'DOI': '10.13039/100010663', 'name': 'ERC', 'doi-asserted-by': 'publisher', 'award': ['#771602']}, {'DOI': '10.13039/501100000266', 'name': 'EPSRC', 'doi-asserted-by': 'publisher', 'award': ['#EP/V007661/1']}, {'DOI': '10.13039/100004807', 'name': 'DFG', 'doi-asserted-by': 'publisher'}], 'content-domain': {'domain': [], 'crossmark-restriction': False}, 'short-container-title': [], 'published-print': {'date-parts': [[2023, 12, 21]]}, 'abstract': '<jats:title>Abstract</jats:title>\n               <jats:p>Repeatable and reliable sit

In [12]:
def process_name(names_affiliations, name):
    # Check for an exact match first
    if name in names_affiliations:
        return name

    # Split the input name into parts
    abbrev_parts = name.split(' ')
    initials = [part[0] for part in abbrev_parts if len(part) == 1 or '.' in part]
    surname = abbrev_parts[-1]

    # Attempt to match with the most initials and correct surname
    best_match = None
    best_score = -1

    for full_name in names_affiliations:
        full_name_parts = full_name.split(' ')

        # Check if the surname matches
        if full_name_parts[-1] == surname:
            # Calculate the match score based on initials
            score = sum(fn.startswith(init) for fn, init in zip(full_name_parts, initials))

            # Prefer matches with more matching initials
            if score > best_score:
                best_match = full_name
                best_score = score
            elif score == best_score:
                # Tie-breaking: prefer longer full matches over partial matches
                if len(full_name_parts) > len(best_match.split(' ')):
                    best_match = full_name

    # If a best match is found, check for full name replacement
    if best_match and '.' in name:  # Check if the input is an abbreviation
        expanded_name = process_name(names_affiliations, best_match)
        if expanded_name != best_match:  # Check if it results in a full name expansion
            return expanded_name

    return best_match if best_match else name  # Return the best match or the original name if no match is found

# Helper function to check initials

# Test the function with different abbreviated names
names_affiliations = [
    'Leonardo Shoji Aota',
    'Taylor Swift',
    'Bob Marley',
    'John Michael Smith',
    'John Smith',
    'Thoudden Sukumar Prithiv',
    'Renelle Dubosq',
    'R. Dubosq',
    'Christian Liebscher',
    'B. Zhou',
    'Xuyang Zhou',
    'T. Schwarz',
    'Tim Schwarz',
    'Jing Yang',
    'Zhigang Yang'
]

test_names = [
    'Leonardo S. Aota',
    'L. S. Aota',
    'L. Aota',
    'J. M. Smith',
    'J. Smith',
    'T. Swift',
    'B. Marley',
    'Bob John Marley',
    'Bob J. Marley',
    'T.S. Prithiv',
    'R. Dubosq',
    'Christian H. Liebscher',
    'Xuyang Zhou',
    'Tim M. Schwarz',
    'Jing Yang'
]

matched_names = {abbreviated_name: process_name(names_affiliations, abbreviated_name) for abbreviated_name in test_names}

matched_names


{'Leonardo S. Aota': 'Leonardo Shoji Aota',
 'L. S. Aota': 'Leonardo Shoji Aota',
 'L. Aota': 'Leonardo Shoji Aota',
 'J. M. Smith': 'John Michael Smith',
 'J. Smith': 'John Michael Smith',
 'T. Swift': 'Taylor Swift',
 'B. Marley': 'Bob Marley',
 'Bob John Marley': 'Bob Marley',
 'Bob J. Marley': 'Bob Marley',
 'T.S. Prithiv': 'Thoudden Sukumar Prithiv',
 'R. Dubosq': 'R. Dubosq',
 'Christian H. Liebscher': 'Christian Liebscher',
 'Xuyang Zhou': 'Xuyang Zhou',
 'Tim M. Schwarz': 'T. Schwarz',
 'Jing Yang': 'Jing Yang'}