In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
from bs4 import BeautifulSoup
from tqdm import tqdm
tqdm.pandas()
import imodelsx.llm
import json
import requests
import joblib
from pprint import pprint
import os
import numpy as np
import pubmed
import openai
from mdcalc import try_or_none
# Read the OpenAI API key from a local file; the context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()

# Reset matplotlib to its default style, then apply the dvu style on top.
plt.style.use('default')
dvu.set_style()

# Main table for this notebook. Later cells read its `ref_href_corrected` and
# `ref_year` columns, so those are expected to exist in the CSV.
df = pd.read_csv('../data/main.csv')

### Scraping PubMed articles

- E-utilities: https://dataguide.nlm.nih.gov/eutilities/utilities.html
- Metadata: https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
- example paper: https://pubmed.ncbi.nlm.nih.gov/16768059/
- example summary: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=16768059&retmode=json

In [None]:
# initialize
df["ref_href"] = pubmed.get_updated_refs(df)
# Placeholder columns; the scraping cell fills these in per paper.
df["ref_metadata"] = np.nan
df["ref_authors"] = np.nan
df["ref_citations"] = np.nan
df["ref_authors_full"] = np.nan
df["ref_authors_affils"] = np.nan
df["ref_url_free_text"] = np.nan

# only keep pubmed links
# Drop rows whose reference link is missing/empty or was marked 'Unk'.
df_dropnan = df[
    df["ref_href"].notna()
    & (df["ref_href"] != '')
    & (df['ref_href_corrected'] != 'Unk')
]
# Take an explicit copy: adding `paper_id` to a filtered slice is ambiguous
# (view vs. copy) and triggers SettingWithCopyWarning.
df_dropnonpubmed = df_dropnan[df_dropnan["ref_href"].str.contains("pubmed")].copy()
df_dropnonpubmed['paper_id'] = df_dropnonpubmed["ref_href"].apply(pubmed.get_paper_id)
print(
    "all cdis",
    df.shape[0],
    "drop na",
    df_dropnan.shape[0],
    "drop non pubmed",
    df_dropnonpubmed.shape[0],
)
# reset_index() (without drop=True) keeps the pre-filter row labels as a column
# and gives the frame a fresh 0..n-1 index.
# NOTE: this cell rebinds `df`, so re-running it alone operates on the
# already-filtered frame.
df = df_dropnonpubmed.reset_index()

In [None]:
# run scraping (caches so is safe to rerun)
# The result columns are initialised to float NaN but receive dicts/lists, so
# make them object dtype before writing individual cells.
for result_col in ["ref_metadata", "ref_authors", "ref_authors_full", "ref_authors_affils"]:
    df[result_col] = df[result_col].astype(object)

for i in tqdm(range(df.shape[0])):
    row = df.iloc[i]
    row_label = df.index[i]  # label of the row read positionally above
    paper_link = row["ref_href"]
    if not (isinstance(paper_link, str) and "pubmed" in paper_link):
        continue
    paper_id = row['paper_id']

    # this scrapes pubmed api
    # Cells are written with df.at[...] rather than chained indexing
    # (df[col][i] = ...): chained assignment may write to a temporary and is
    # a silent no-op under pandas Copy-on-Write.
    try:
        metadata = pubmed.get_metadata(paper_id)
        df.at[row_label, "ref_metadata"] = metadata
        df.at[row_label, "ref_authors"] = metadata["result"][paper_id]["authors"]
    except Exception as e:
        # Best effort: report the failure and still try the page scrape below.
        print(f"Error for paper {paper_id}", e)

    # this scrapes actual paper page
    authors_list = pubmed.get_authors_with_firstname(paper_link, paper_id)
    df.at[row_label, "ref_authors_full"] = [pubmed.parse_name(name) for name in authors_list]
    df.at[row_label, 'ref_authors_affils'] = pubmed.get_author_affiliations(paper_id)
print('failed to scrape affils for', df['ref_authors_affils'].isna().sum(), 'papers')

### Gender of authors

In [None]:
# Flatten the per-paper author lists into one list of names and sort it;
# entries that are missing (NaN/None) are removed by dropna().
names = df["ref_authors_full"].explode().dropna().tolist()
names.sort()

genders = []
# LLM handle used by get_gender below. CACHE_DIR is passed straight to
# imodelsx (presumably where responses are cached -- TODO confirm).
llm = imodelsx.llm.get_llm(checkpoint="gpt-3.5-turbo", CACHE_DIR="/home/chansingh/cache/pubmed_names")


def get_gender(name: str):
    """Ask the module-level ``llm`` whether ``name`` is more commonly male or female.

    Args:
        name: The author name inserted verbatim into the prompt.

    Returns:
        Whatever ``llm`` returns for the prompt (the prompt requests the single
        word "Male" or "Female", but the answer is not validated here).
    """
    prompt = f'Return whether the name "{name}" is more common for a male or a female. Answer with one word, "Male" or "Female"'
    return llm(prompt, verbose=False)


# Query the LLM once per name (in the sorted order of `names`) and tabulate
# the raw answers.
gender_ans = list(map(get_gender, tqdm(names)))
pd.Series(gender_ans).value_counts()

### Affiliations of authors

In [None]:
import pycountry
import pycountry_convert as pc
def country_to_continent(country_name):
    """Translate a country name into a continent name using pycountry_convert.

    The lookup is a chain of three ``pc`` calls:
    country name -> alpha-2 country code -> continent code -> continent name.
    Any exception raised by those calls propagates to the caller.
    """
    alpha2 = pc.country_name_to_country_alpha2(country_name)
    continent_code = pc.country_alpha2_to_continent_code(alpha2)
    return pc.convert_continent_code_to_continent_name(continent_code)

# LLM handle used by get_country below; same checkpoint and cache directory as
# the handle created in the gender cell.
llm = imodelsx.llm.get_llm(checkpoint="gpt-3.5-turbo", CACHE_DIR="/home/chansingh/cache/pubmed_names")
def get_country(country_name: str):
    """Ask the module-level ``llm`` which country an affiliation string mentions.

    Args:
        country_name: Text inserted into the prompt as "the following
            affiliation". Despite the parameter name, this notebook passes the
            full affiliation string (``aff_orig``).

    Returns:
        Whatever ``llm`` returns for the prompt; the answer is not validated.
    """
    prompt = f"""Return the name of the country present in the following affiliation: {country_name}.
Return only the name of the country."""
    return llm(prompt, verbose=False)

# One row per scraped affiliation string (sorted, missing entries removed).
affiliations = df["ref_authors_affils"].explode().dropna().tolist()
affiliations.sort()
affiliations = np.array(affiliations)
dfa = pd.DataFrame(affiliations, columns=['aff_orig'])


def _letters_of_last_segment(aff):
    """Return only the alphabetic characters of the text after the last comma."""
    return "".join(ch for ch in aff.split(",")[-1] if ch.isalpha())


# automatically parse countries
# Take the first fuzzy pycountry match for the last comma-separated segment.
# Failed lookups are expected to come back as missing values from try_or_none
# (they are selected with isna() below).
dfa['aff_auto'] = dfa['aff_orig'].apply(_letters_of_last_segment)
dfa['country_auto'] = dfa['aff_auto'].progress_apply(try_or_none(lambda x: pycountry.countries.search_fuzzy(x)[0]))
dfa['country'] = dfa['country_auto']

# fill in missing countries with llm
# Only rows where the automatic lookup produced nothing are sent to the LLM.
dfa['aff_llm'] = None
idxs_na = dfa['country_auto'].isna()
dfa.loc[idxs_na, 'aff_llm'] = dfa.loc[idxs_na, 'aff_orig'].progress_apply(get_country)

In [None]:
# Clean the raw LLM answers, run the same fuzzy pycountry lookup on them, and
# use the result only where the automatic lookup found no country.
dfa['aff_llm'] = dfa['aff_llm'].apply(pubmed.clean_llm_country_output)
first_fuzzy_match = try_or_none(lambda x: pycountry.countries.search_fuzzy(x)[0])
dfa['country_llm'] = dfa['aff_llm'].progress_apply(first_fuzzy_match)
dfa['country'] = dfa['country_auto'].fillna(dfa['country_llm'])

In [None]:
# Frequency of each distinct value in the LLM-derived country column
# (value_counts() leaves out missing entries by default).
dfa['aff_llm'].value_counts()

In [None]:
# Keep only affiliations for which a country was resolved.
countries = dfa['country'].dropna()
n = countries.shape[0]
print('dropping', dfa.shape[0] - n, 'affiliations', 'resulting in', n, 'affiliations')

# Map each pycountry record to its continent via its alpha-2 code. Going
# through the country *name* is fragile: pycountry's official names (e.g.
# "Korea, Republic of") are not guaranteed to appear in pycountry_convert's
# name table, whereas the alpha-2 code is already on the record.
continents = countries.apply(
    lambda x: pc.convert_continent_code_to_continent_name(
        pc.country_alpha2_to_continent_code(x.alpha_2)
    )
)

In [None]:
# Percentage of resolved affiliations per continent, shown as a single-row
# table with the index hidden.
(
    (continents.value_counts() / n * 100)
    .to_frame()
    .T
    .style.hide(axis='index')
    .format("{:.1f}%")
)

In [None]:
# Percentage of resolved affiliations for the 15 most frequent countries.
# Countries are mapped to their names *before* counting, so the names end up
# as the index directly. This avoids relying on which column names
# reset_index() produces for a value_counts() result (that differs between
# pandas 1.x and 2.x).
country_names = countries.map(lambda x: x.name)
cv = (country_names.value_counts().head(15) / n * 100).to_frame()
cv.index.name = 'country'
cv.T.style.hide(axis='index').format("{:.1f}%")

### Gender over time

In [None]:
def any_not_none(l):
    """Return True if at least one element of ``l`` is not None, else False.

    An empty iterable yields False.
    """
    return any(item is not None for item in l)
    
def count_male(authors_full):
    """Count the authors in ``authors_full`` that the LLM labels as male.

    Args:
        authors_full: A list of author names (entries may be None). Anything
            that is not a list, or a list containing only None, counts as 0.

    Returns:
        The number of non-None names whose ``get_gender`` answer is "male".
    """
    if not (isinstance(authors_full, list) and any_not_none(authors_full)):
        return 0
    # The prompt asks for the bare word "Male", but the model may add a period,
    # whitespace or different casing; normalise before comparing so that
    # "Male", "Male." and "male" are all counted. None placeholders are
    # skipped rather than being sent to the LLM as the name "None".
    return np.sum(
        [
            str(get_gender(name)).strip().rstrip(".").lower() == "male"
            for name in authors_full
            if name is not None
        ]
    )

def count_female(authors_full):
    """Count the authors in ``authors_full`` that the LLM labels as female.

    Args:
        authors_full: A list of author names (entries may be None). Anything
            that is not a list, or a list containing only None, counts as 0.

    Returns:
        The number of non-None names whose ``get_gender`` answer is "female".
    """
    if not (isinstance(authors_full, list) and any_not_none(authors_full)):
        return 0
    # The prompt asks for the bare word "Female", but the model may add a
    # period, whitespace or different casing; normalise before comparing so
    # that "Female", "Female." and "female" are all counted. None placeholders
    # are skipped rather than being sent to the LLM as the name "None".
    return np.sum(
        [
            str(get_gender(name)).strip().rstrip(".").lower() == "female"
            for name in authors_full
            if name is not None
        ]
    )


# Add one count column per gender label, computed row by row from the parsed
# author list of each reference.
for count_col, count_fn in (("count_male", count_male), ("count_female", count_female)):
    df[count_col] = df.apply(
        lambda row, fn=count_fn: fn(row["ref_authors_full"]),
        axis=1,
    )

In [None]:
# Running male-to-female author ratio, accumulated over references sorted by
# `ref_year`.
dp = df.sort_values(by='ref_year')
cum_male = np.cumsum(dp['count_male'])
cum_female = np.cumsum(dp['count_female'])

fig, ax = plt.subplots()
# While the cumulative female count is still 0 the ratio is inf/NaN; the axis
# limits below restrict the view to 2000-2023 and ratios between 2 and 4.
ax.plot(dp['ref_year'], cum_male / cum_female)
ax.grid()
ax.set_xlim(2000, 2023)
ax.set_ylim(2, 4)
ax.set_xlabel('ref_year')
ax.set_ylabel('cumulative male / female author count')
plt.show()