In [1]:
import pathlib
import re

import pandas as pd
import requests
from tqdm.notebook import tqdm

In [2]:
# All data paths below are built relative to the parent of the directory
# the notebook is started from.
working_dir = pathlib.Path.cwd()
root_dir = working_dir.parent

In [None]:
# Load the database table; the first CSV column becomes the index.
database_csv = root_dir / "cmrt_database" / "All_column_charity.csv"
df = pd.read_csv(database_csv, index_col=0)
df.head()

In [4]:
def fix_doi(doi):
    """
    Normalises a DOI string.

    A single trailing "." is removed. If the DOI contains more than one "/",
    the first "/" is kept and every later "/" is replaced by "-".

    Parameters:
    - doi (str): The raw DOI string.

    Returns:
    - str: The cleaned DOI.
    """
    if doi.endswith("."):
        doi = doi[:-1]
    if doi.count("/") > 1:
        # Split at the first "/" instead of converting everything to "-" and
        # restoring the first "-": that approach picked the wrong character
        # whenever a hyphen already occurred before the first slash.
        prefix, _, suffix = doi.partition("/")
        doi = prefix + "/" + suffix.replace("/", "-")
    return doi

In [None]:
# Clean every DOI and drop duplicates. The result is sorted because a bare set
# of strings has no stable order between kernel sessions, which would make the
# preview below (and anything written out from this list) differ from run to run.
# Rows with a missing "Literature" value are skipped: fix_doi only handles strings.
cmrt_doi_list = sorted({fix_doi(doi) for doi in df["Literature"].dropna()})
cmrt_doi_list[:5]

In [6]:
def fetch_bibtex(doi, timeout=30):
    """
    Fetches the BibTeX entry for a given DOI.

    The DOI resolver is queried with an ``Accept: application/x-bibtex`` header
    and redirects are followed.

    Parameters:
    - doi (str): The DOI of the paper.
    - timeout (float): Seconds to wait for the server before the request is
      abandoned (reported through the "Request failed" message).

    Returns:
    - str: The BibTeX entry if successful, otherwise an error message.
      Error messages do not start with "@" and end with a newline.
    """
    url = f"https://doi.org/{doi}"
    headers = {
        "Accept": "application/x-bibtex;q=1"
    }

    try:
        # Without a timeout a single stalled connection blocks the whole loop.
        response = requests.get(
            url, headers=headers, allow_redirects=True, timeout=timeout
        )

        if response.status_code == 200:
            # Drop any leading whitespace so the entry starts with "@";
            # slicing off one character would eat the "@" when there is none.
            return response.text.lstrip()
        else:
            # Both parts must be f-strings, otherwise the status code
            # placeholder is emitted literally.
            return (
                f"Error: Unable to fetch BibTeX for DOI {doi} "
                f"(HTTP {response.status_code})\n"
            )

    except requests.exceptions.RequestException as e:
        return f"Request failed for DOI {doi}: {e}\n"

In [None]:
# The fetched BibTeX is cached on disk so the DOIs are only requested when the
# cache file does not exist yet. Entries are separated by one blank line.
bibfile = root_dir / "cmrt_database" / "cmrt_database.bib"
if bibfile.exists():
    # "with" closes the handle; the encoding is fixed because the entries may
    # contain non-ASCII characters and the platform default is not always UTF-8.
    with open(bibfile, "r", encoding="utf-8") as f:
        cached_text = f.read()
    bibtex_entries = [
        entry.strip() for entry in cached_text.split("\n\n") if entry.strip()
    ]
else:
    # Strip each entry so both branches yield the same shape (no surrounding
    # whitespace) and the separator written below does not depend on how each
    # fetched entry happens to end.
    bibtex_entries = [fetch_bibtex(doi).strip() for doi in tqdm(cmrt_doi_list)]
    with open(bibfile, "w", encoding="utf-8") as f:
        f.write("\n\n".join(bibtex_entries) + "\n")

In [None]:
# Show every entry that is not a BibTeX record, i.e. does not begin with "@"
# (for example the error messages returned by fetch_bibtex).
failed_entries = [text for text in bibtex_entries if text[:1] != "@"]
failed_entries

In [None]:
# Count how often each author name occurs across the BibTeX entries.
# The pattern accepts optional spaces around "=" and any letter case, and
# captures the text up to the first closing brace of the author field.
author_field = re.compile(r"\bauthor\s*=\s*\{([^}]*)\}", re.IGNORECASE)

authors = []
for entry in bibtex_entries:
    if not entry.startswith("@"):
        continue  # not a BibTeX record (e.g. an error message)
    field_match = author_field.search(entry)
    if field_match is None:
        # A record without an author field used to raise IndexError here.
        continue
    # Collapse line breaks / repeated spaces so " and " reliably separates names.
    names = " ".join(field_match.group(1).split())
    authors.extend(
        name for name in names.replace(",", "").split(" and ") if name
    )
author_frequency = pd.DataFrame({"authors": authors})["authors"].value_counts()
author_frequency

In [None]:
# Select the authors that occur at least 10 times and combine them into a
# single "AU=(... OR ...)" search string. print() is used on purpose so the
# query is shown without quotes and can be copied as-is.
frequent_authors = author_frequency[author_frequency >= 10].index.tolist()
author_query = f"AU=({' OR '.join(frequent_authors)})"
print(author_query)

The output of the previous cell was used as the query in a
[Web of Science](https://www.webofscience.com/wos/woscc/advanced-search) advanced search, which
returned 76,501 documents. The results were then refined by searching within them for the following terms:

```
chiral* OR enantio* OR "asymmetric cataly*"
```

Found 3,485 documents, which were exported as BibTeX (full record, max. 1000 entries per file).

The DOIs of these documents were extracted using the command below:

```
cat savedrecs*.bib | grep 'DOI =' | sed -e 's/DOI = {//' -e 's/},//' > webofscience_search_result.txt
```

In [None]:
# Read the exported DOIs, one per line. Path.read_text() opens and closes the
# file itself, so no file handle is left open in the kernel.
webofscience_file = root_dir / "data_gathering" / "webofscience_search_result.txt"
webofscience_dois = webofscience_file.read_text().splitlines()
len(webofscience_dois)