In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict

import dvu
import matplotlib.pyplot as plt
import pandas as pd
from os.path import join
import os.path
from bs4 import BeautifulSoup
from tqdm import tqdm
import imodelsx.llm
import json
import requests
import joblib
import os
import numpy as np
import pubmed
import openai
from clean import try_or_none
# Read the OpenAI API key from a local file. The context manager closes the
# file handle right away instead of leaving it to the garbage collector.
with open('/home/chansingh/.OPENAI_KEY') as key_file:
    openai.api_key = key_file.read().strip()

# Reset matplotlib to its default style, then apply the dvu style on top.
plt.style.use('default')
dvu.set_style()

# Main table; later cells read its `ref_href` / `ref_href_corrected` columns.
df = pd.read_csv('../data/main.csv')

### Scraping PubMed articles

- E-utilities: https://dataguide.nlm.nih.gov/eutilities/utilities.html
- Metadata: https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
- Example paper: https://pubmed.ncbi.nlm.nih.gov/16768059/
- Example summary: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=16768059&retmode=json

In [None]:
# Initialize: where a manually corrected link exists (and is not the 'Unk'
# placeholder), it replaces the original reference link.
idxs_corrected = df["ref_href_corrected"].notna() & (df["ref_href_corrected"] != "Unk")
# Single .loc write instead of chained `df[col][mask] = ...`, which may write
# to a temporary copy and silently leave `df` unchanged.
df.loc[idxs_corrected, "ref_href"] = df.loc[idxs_corrected, "ref_href_corrected"]

# Empty placeholder columns. They use object dtype (not float NaN) so that
# individual cells can later hold Python objects such as dicts and lists.
for col in (
    "ref_metadata",
    "ref_authors",
    "ref_citations",
    "ref_authors_full",
    "ref_url_free_text",
):
    df[col] = pd.Series(np.nan, index=df.index, dtype=object)

In [None]:
def get_paper_id(paper_link: str):
    """Extract the paper id (last path segment) from a paper URL.

    Surrounding whitespace, any ``?query`` or ``#fragment`` suffix and trailing
    slashes are ignored, and leading zeros are stripped from the id.

    Args:
        paper_link: URL such as ``https://pubmed.ncbi.nlm.nih.gov/16768059/``.

    Returns:
        The id as a string (``"16768059"`` for the example above). The result
        is an empty string if nothing remains after cleaning.
    """
    link = paper_link.strip()
    # Drop query string / fragment so they are not mistaken for the id.
    link = link.split("?", 1)[0].split("#", 1)[0]
    link = link.rstrip("/")
    paper_id = link.rsplit("/", 1)[-1]

    # remove leading zeros
    return paper_id.lstrip("0")

# Keep rows that have a non-empty reference link and whose corrected link is
# not the 'Unk' placeholder.
has_href = df["ref_href"].notna() & (df["ref_href"] != "")
not_unknown = df["ref_href_corrected"] != "Unk"
df_dropnan = df[has_href & not_unknown]

# Keep only links containing "pubmed". The explicit .copy() makes the new
# 'paper_id' column land on an independent frame rather than on a slice of
# `df` (avoids SettingWithCopyWarning / lost writes).
is_pubmed = df_dropnan["ref_href"].str.contains("pubmed", regex=False)
df_dropnonpubmed = df_dropnan[is_pubmed].copy()
df_dropnonpubmed["paper_id"] = df_dropnonpubmed["ref_href"].apply(get_paper_id)
print(
    "all cdis",
    df.shape[0],
    "drop na",
    df_dropnan.shape[0],
    "drop non pubmed",
    df_dropnonpubmed.shape[0],
)
# NOTE: this rebinds `df` to the filtered frame; reset_index() keeps the old
# index as an 'index' column and gives the frame a fresh 0..n-1 index.
df = df_dropnonpubmed.reset_index()

In [8]:
# run scraping
# The result columns must be object dtype so a single cell can hold a dict
# or a list.
for col in ("ref_metadata", "ref_authors", "ref_authors_full"):
    df[col] = df[col].astype(object)

for i in tqdm(df.index):
    paper_link = df.at[i, "ref_href"]
    if not (isinstance(paper_link, str) and "pubmed" in paper_link):
        continue
    paper_id = df.at[i, "paper_id"]

    # Metadata lookup is best-effort: report the failure and carry on.
    try:
        metadata = pubmed.get_metadata(paper_id)
        # df.at writes straight into `df`; chained `df[col][i] = ...` may
        # write to a temporary copy and be lost.
        df.at[i, "ref_metadata"] = metadata
        df.at[i, "ref_authors"] = metadata["result"][paper_id]["authors"]
    except Exception as e:
        print(f"Error for paper {paper_id}", e)

    # Not wrapped in try/except (as in the original): a failure here stops
    # the loop so it is noticed.
    authors_list = pubmed.get_authors_with_firstname(paper_link, paper_id)
    df.at[i, "ref_authors_full"] = [pubmed.parse_name(name) for name in authors_list]

 24%|██▎       | 134/565 [09:19<27:34,  3.84s/it]

### Gender of authors

In [None]:
# Flatten the per-row author lists into one list of names, drop missing
# entries, and put the names in sorted order.
names = df["ref_authors_full"].explode().dropna().tolist()
names.sort()

genders = []
# LLM handle used by get_gender().
llm = imodelsx.llm.get_llm(
    CACHE_DIR="/home/chansingh/cache/pubmed_names",
    checkpoint="gpt-3.5-turbo",
)


def get_gender(name: str):
    """Ask the module-level ``llm`` whether ``name`` is more commonly male or female.

    The prompt requests a one-word answer ("Male" or "Female"). Whatever
    ``llm`` returns is passed back unchanged.
    """
    prompt = f'Return whether the name "{name}" is more common for a male or a female. Answer with one word, "Male" or "Female"'
    return llm(prompt, verbose=False)


# Query the LLM once per name (with a progress bar), then tabulate the raw answers.
gender_ans = list(map(get_gender, tqdm(names)))
pd.Series(gender_ans).value_counts()

### Gender over time

In [None]:
def any_not_none(l):
    """Return True if at least one element of ``l`` is not None, else False."""
    return any(item is not None for item in l)
    
def _count_gender(authors_full, gender: str) -> int:
    """Count the names in ``authors_full`` that ``get_gender`` labels as ``gender``.

    Args:
        authors_full: list of author names (entries may be None); any
            non-list value (e.g. NaN for a missing cell) counts as zero.
        gender: expected label, "Male" or "Female".

    Returns:
        Number of non-None names whose answer matches ``gender``.
    """
    if not isinstance(authors_full, list):
        return 0
    target = gender.lower()
    count = 0
    for name in authors_full:
        # Skip missing names rather than asking about the literal name "None".
        if name is None:
            continue
        answer = get_gender(name)
        # The prompt asks for "Male"/"Female", but answers may come back with a
        # trailing period, extra whitespace or different casing. Normalize
        # before comparing so "Male", "Male." and " male.\n" all match.
        if isinstance(answer, str) and answer.strip().rstrip(".").strip().lower() == target:
            count += 1
    return count


def count_male(authors_full):
    """Return how many authors in ``authors_full`` are labelled male (0 if not a list)."""
    return _count_gender(authors_full, "Male")


def count_female(authors_full):
    """Return how many authors in ``authors_full`` are labelled female (0 if not a list)."""
    return _count_gender(authors_full, "Female")


# Per-paper author counts by predicted gender. Applying directly to the
# column avoids building a full row Series for every record just to read
# one field.
df["count_male"] = df["ref_authors_full"].apply(count_male)
df["count_female"] = df["ref_authors_full"].apply(count_female)

In [None]:
# Ratio of cumulative male to cumulative female author counts, ordered by
# reference year.
dp = df.sort_values(by="ref_year")
cum_male = dp["count_male"].cumsum()
# A zero denominator (no female authors counted yet) would give inf; mask it
# as NaN so those points are simply not drawn.
cum_female = dp["count_female"].cumsum().replace(0, np.nan)

fig, ax = plt.subplots()
ax.plot(dp["ref_year"], cum_male / cum_female)
ax.set_xlabel("Reference year")
ax.set_ylabel("Cumulative male / female author count")
ax.grid()
ax.set_xlim(2000, 2023)
ax.set_ylim(2, 4)
plt.show()