In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
from bs4 import BeautifulSoup
from tqdm import tqdm
import imodelsx.llm
import json
import requests
import joblib
import numpy as np
import openai
from clean import try_or_none
# Read the OpenAI key from the local key file. The `with` block closes the
# handle; the previous bare `open(...).read()` left it open for the life of
# the kernel. If the file is absent, fall back to the OPENAI_API_KEY
# environment variable so the notebook can also run on other machines.
OPENAI_KEY_FILE = '/home/chansingh/.OPENAI_KEY'
if os.path.exists(OPENAI_KEY_FILE):
    with open(OPENAI_KEY_FILE) as key_file:
        openai.api_key = key_file.read().strip()
elif os.environ.get('OPENAI_API_KEY'):
    openai.api_key = os.environ['OPENAI_API_KEY'].strip()
else:
    raise FileNotFoundError(
        f"No OpenAI key found: {OPENAI_KEY_FILE} does not exist and OPENAI_API_KEY is not set"
    )

# Plot styling: reset matplotlib to its defaults, then apply the dvu style.
plt.style.use('default')
dvu.set_style()

# Main table; the scraping cell below reads its `ref_href` /
# `ref_href_corrected` columns and adds the `ref_*` result columns.
df = pd.read_csv('../data/main_updated.csv')

### Scraping PubMed articles

- E-utilities: https://dataguide.nlm.nih.gov/eutilities/utilities.html
- Metadata: https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
- Example paper: https://pubmed.ncbi.nlm.nih.gov/16768059/
- Example summary: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=16768059&retmode=json

In [None]:
def get_metadata(paper_id: str):
    """Return the E-utilities `esummary` JSON for a PubMed id, using a disk cache.

    Parameters
    ----------
    paper_id : str
        PubMed id, interpolated verbatim into the request URL and the cache
        file name.

    Returns
    -------
    dict
        The parsed JSON response, read from
        ``../data/metadata/{paper_id}.json`` when that file exists and
        otherwise fetched over HTTP and written there.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (nothing is cached then).
    """
    cache_file = f"../data/metadata/{paper_id}.json"
    if os.path.exists(cache_file):
        # `with` guarantees the handle is closed after parsing.
        with open(cache_file) as f:
            return json.load(f)

    resp = requests.get(
        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={paper_id}&retmode=json",
        timeout=30,  # requests has no default timeout; avoid hanging the scrape
    )
    # Fail before caching so an error response is never stored as metadata.
    resp.raise_for_status()
    metadata = resp.json()

    # The cache directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(metadata, f, indent=2)
    return metadata


def get_authors_with_firstname(paper_link: str, paper_id: str):
    """Return the set of author names scraped from the page at `paper_link`.

    Names are taken from the ``data-ga-label`` attribute of the first ``<a>``
    inside every ``<span class="authors-list-item">`` element.

    Parameters
    ----------
    paper_link : str
        URL of the article page to download.
    paper_id : str
        Id used only to build the cache file name
        ``../data/metadata/{paper_id}_full.joblib``.

    Returns
    -------
    set of str
        Author names found on the page (possibly empty). A successful scrape
        is cached; a failed HTTP response yields an empty set that is NOT
        cached, so a later run can retry.
    """
    cache_file = f"../data/metadata/{paper_id}_full.joblib"
    if os.path.exists(cache_file):
        # NOTE: joblib.load unpickles the file; this is only safe because the
        # cache is written by this very function.
        return joblib.load(cache_file)

    resp = requests.get(paper_link, timeout=30)  # no default timeout in requests
    if not resp.ok:
        # Previously an error page was parsed, produced no authors, and the
        # empty result was cached permanently.
        print(f"HTTP {resp.status_code} for {paper_link}; authors not cached")
        return set()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's guessed-parser warning).
    soup = BeautifulSoup(resp.text, "html.parser")
    author_names = set()
    for item in soup.find_all("span", {"class": "authors-list-item"}):
        anchor = item.a
        if anchor is None:
            # list item without a link: nothing to read
            continue
        label = anchor.get("data-ga-label")
        if isinstance(label, str):
            author_names.add(label)

    # The cache directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    joblib.dump(author_names, cache_file)
    return author_names

@try_or_none
def get_free_text_link(paper_id: str):
    """Return the first provider URL from the E-utilities `elink` (prlinks) response.

    The raw JSON response is cached in
    ``../data/metadata/{paper_id}_free_text_link.json``.

    Parameters
    ----------
    paper_id : str
        PubMed id, interpolated verbatim into the request URL and the cache
        file name.

    Returns
    -------
    str
        ``linksets[0].idurllist[0].objurls[0].url.value`` of the response.
        Any missing key/index or HTTP failure raises inside this function;
        presumably the `try_or_none` decorator (imported from `clean`, not
        visible here) turns that into ``None`` — confirm against its source.
    """
    cache_file = f"../data/metadata/{paper_id}_free_text_link.json"
    if os.path.exists(cache_file):
        # `with` guarantees the handle is closed after parsing.
        with open(cache_file) as f:
            free_text_link = json.load(f)
    else:
        resp = requests.get(
            f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&id={paper_id}&cmd=prlinks&retmode=json",
            timeout=30,  # requests has no default timeout; avoid hanging the scrape
        )
        # Fail before caching so an error response is never stored.
        resp.raise_for_status()
        free_text_link = resp.json()
        # The cache directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        with open(cache_file, "w") as f:
            json.dump(free_text_link, f, indent=2)

    return free_text_link['linksets'][0]['idurllist'][0]['objurls'][0]['url']['value']

In [None]:

# initialize
# Prefer the manually corrected link wherever one was provided. A single .loc
# assignment replaces the chained `df[col][mask] = ...`, which may write to a
# temporary copy and is a no-op under pandas copy-on-write.
idxs_corrected = df['ref_href_corrected'].notna()
df.loc[idxs_corrected, 'ref_href'] = df.loc[idxs_corrected, 'ref_href_corrected']

# The scraped values are dicts, lists, sets and strings, so the result columns
# must be object-typed; NaN marks "not scraped".
for col in [
    "ref_metadata",
    "ref_authors",
    "ref_citations",
    "ref_authors_full",
    "ref_url_free_text",
]:
    df[col] = pd.Series(np.nan, index=df.index, dtype=object)

# run scraping
for i in tqdm(df.index):
    paper_link = df.at[i, "ref_href"]
    # only string links that point at pubmed are scraped
    if not (isinstance(paper_link, str) and "pubmed" in paper_link):
        continue

    # the id is the last path component, e.g. https://pubmed.ncbi.nlm.nih.gov/20738765/
    if paper_link.endswith("/"):
        paper_link = paper_link[:-1]
    paper_id = paper_link.split("/")[-1]
    # skip links whose last component carries query parameters
    if "?" in paper_id or "&" in paper_id:
        continue

    # remove leading zeros so the id matches the key used in the esummary result
    paper_id = paper_id.lstrip("0")

    try:
        metadata = get_metadata(paper_id)
        df.at[i, "ref_metadata"] = metadata
        df.at[i, "ref_authors"] = metadata["result"][paper_id]["authors"]
        df.at[i, "ref_citations"] = metadata["result"][paper_id]["pmcrefcount"]
    except Exception as err:
        # `Exception` (not a bare except) so Ctrl-C can still stop the loop
        print(f"Error for paper {paper_id}", repr(err))

    df.at[i, "ref_authors_full"] = get_authors_with_firstname(paper_link, paper_id)

    df.at[i, "ref_url_free_text"] = get_free_text_link(paper_id)
    print('free_text_link', df.at[i, "ref_url_free_text"])

In [None]:
@try_or_none
def parse_name(name: str):
    """Normalise one author name, or return None when it should be discarded.

    A name is discarded when it is longer than 40 characters, when it contains
    one of the collective-author keywords (case-insensitive), or when its first
    whitespace-separated token is a single character. If the second of three or
    more tokens is a single character it is treated as a middle initial and
    only the first and last tokens are kept.

    Exceptions raised here (e.g. a non-string or empty `name`) are presumably
    converted to None by `try_or_none` — that decorator is imported from
    `clean` and not visible here; confirm against its source.
    """
    tokens = name.split()
    lowered = name.lower()
    collective_keywords = ("investigator", "group", "committee", "network")

    # too long, or a consortium / study-group style entry
    if len(name) > 40 or any(word in lowered for word in collective_keywords):
        return None

    # a one-letter first token is only an initial
    first = tokens[0]
    if len(first) == 1:
        return None

    # collapse "First M Last..." to "First Last"
    has_middle_initial = len(tokens) > 2 and len(tokens[1]) == 1
    if has_middle_initial:
        tokens = [first, tokens[-1]]

    return " ".join(tokens)


# One row per author -> normalised name -> drop missing/rejected entries,
# then order alphabetically.
names = (
    df["ref_authors_full"]
    .explode()
    .apply(parse_name)
    .dropna()
    .tolist()
)
names.sort()

### Gender of authors

In [None]:
# Empty list for gender labels; nothing in this cell fills it.
genders = list()

# LLM callable used for the gender queries below; the keyword arguments are
# passed straight through to imodelsx.llm.get_llm.
llm = imodelsx.llm.get_llm(
    checkpoint='gpt-3.5-turbo',
    CACHE_DIR='/home/chansingh/cache/pubmed_names',
)

In [7]:
def _ask_gender(name):
    """Send the male/female prompt for one name to `llm` and return its raw answer."""
    return llm(f'Return whether the name "{name}" is more common for a male or a female. Answer with one word, "Male" or "Female"', verbose=False)


# One LLM answer per entry of `names`, in the same order; tqdm shows progress.
gender_ans = list(map(_ask_gender, tqdm(names)))

That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID f6ffc67fe72c8a377e798b76f987d39d in your message.)


 95%|█████████▌| 2659/2798 [10:35<02:33,  1.10s/it]

Rate limit reached for default-gpt-3.5-turbo in organization org-rocrupyvzgcl4yf25rqq6d1v on tokens per min. Limit: 90000 / min. Current: 89770 / min. Contact us through our help center at help.openai.com if you continue to have issues.


 96%|█████████▌| 2684/2798 [10:49<00:39,  2.88it/s]

The server is overloaded or not ready yet.


 96%|█████████▌| 2686/2798 [10:55<02:35,  1.39s/it]

The server is overloaded or not ready yet.


 97%|█████████▋| 2712/2798 [11:10<00:56,  1.51it/s]

Rate limit reached for default-gpt-3.5-turbo in organization org-rocrupyvzgcl4yf25rqq6d1v on tokens per min. Limit: 90000 / min. Current: 89752 / min. Contact us through our help center at help.openai.com if you continue to have issues.


 98%|█████████▊| 2745/2798 [11:24<00:11,  4.46it/s]

The server is overloaded or not ready yet.


 99%|█████████▊| 2758/2798 [11:34<00:12,  3.23it/s]

Rate limit reached for default-gpt-3.5-turbo in organization org-rocrupyvzgcl4yf25rqq6d1v on tokens per min. Limit: 90000 / min. Current: 89747 / min. Contact us through our help center at help.openai.com if you continue to have issues.


 99%|█████████▉| 2781/2798 [11:47<00:06,  2.81it/s]

The server is overloaded or not ready yet.


100%|█████████▉| 2792/2798 [11:54<00:02,  2.17it/s]

The server is overloaded or not ready yet.


100%|██████████| 2798/2798 [12:02<00:00,  3.87it/s]


In [9]:
# Tally how often each distinct raw LLM answer occurred (answers are not normalised).
answer_counts = pd.Series(gender_ans).value_counts()
answer_counts

Male.                                                                                                                                                     1956
Female.                                                                                                                                                    840
I'm sorry, but I cannot determine the gender of a name without additional information. The name "Alet Wilga" does not have a clear gender association.       1
The name "Zoe Co" is more common for a female.                                                                                                               1
Name: count, dtype: int64

In [None]:
# citations plot
# `ref_citations` holds the `pmcrefcount` field of each scraped summary; empty
# strings and missing entries are dropped before converting to int.
citation_counts = df['ref_citations'].replace('', np.nan).dropna().astype(int)

# Explicit figure/axes with labels so the plot stands on its own; plt.show()
# also keeps the cell from echoing the tuple returned by hist().
fig, ax = plt.subplots()
ax.hist(citation_counts)
ax.set_xlabel('Citations (pmcrefcount)')
ax.set_ylabel('Count')
ax.set_title('Distribution of ref_citations')
plt.show()