In [6]:
from Bio import Entrez
from Bio import Medline
from tqdm import tqdm
import pandas as pd
import sqlite3
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import xlrd
import plotly.graph_objects as go
%matplotlib inline

In [8]:
def get_count(term):
    """
    Use pubmed api to get a count of number of results for a search term
    """
    
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  rettype="count",
                                  #field="DP",
                                  term=term)
    count_results = Entrez.read(count_handle)
    count = int(count_results["Count"])
    
    return count

def chunked_pmids(term, chunksize=1000):
    """
    Use pubmed api to fetch blocks of pmids for a search term
    """
    
    count = get_count(term)
    
    retmax_requests = list(range(0, count, chunksize))
                
    pmids = []
    
    print("{} blocks of to process".format(len(retmax_requests)))
    
    for i, retmax in enumerate(retmax_requests):
        
        print("Processing block {}".format(i))
        
        pmid_handle = Entrez.esearch(db="pubmed",
                                     sort="relevance",
                                     retmode="xml",
                                     usehistory='y',
                                     retstart=retmax,
                                     retmax=chunksize,
                                     #field="DP",
                                     term=term)
        pmids.append(Entrez.read(pmid_handle)["IdList"])
            
    return pmids

def fetch_medline(pmids):
    """
    Use pubmed api to fetch medline record for pmids
    """
    
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    handle = Entrez.efetch(db='pubmed',
                           id=pmids,
                           rettype='medline',
                           retmode='text')
    records = Medline.parse(handle)
    
    return records

def getpapers(pmid_chunks):
    """
    Fetch_medline(chunk) returns a generator object of medline records. we iterate through it saving the records 
    to a list. We make a dict of the list indexed by the pubmed id. 
    """
    
    papers = []
    notpapers = []
    
    print("fetching medline records:")
    
    for chunk in tqdm(pmid_chunks):
        records = fetch_medline(chunk)
        for record in records:
            try:
                  papers.append((record['PMID'], (', '.join(record['AU'])),(', '.join(record['AD'])),
                                 record['DP'], record['TI'], record['JT'], 
                                 (', '.join(record['PT']))))            
            except: 
                notpapers.append(record)
                continue
            
    return papers, notpapers


def save_papers(papers):
    """
    Save our papers to an sqlite database
    """
    conn = sqlite3.connect('papers.db')
    c = conn.cursor()
    
    # Drop table if already exists
    c.execute("DROP TABLE IF EXISTS papers")
    
    # Create table
    c.execute('''CREATE TABLE papers
             (pmid, author, author_affiliation, date, title, journal, pub_type)''')

    # Insert a rows of data
    c.executemany('INSERT INTO papers VALUES (?,?,?,?,?,?,?)', papers)

    # Save (commit) the changes
    conn.commit()

    # We can also close the connection if we are done with it.
    # Just be sure any changes have been committed or they will be lost.
    conn.close()
    
def df_from_papers_database():
    """
    Load our papers
    """
    conn = sqlite3.connect('papers.db')
    df = pd.read_sql_query('SELECT * FROM papers', conn)
    return df

def fetch_papers(year):
    """
    Fetch our papers
    """
    print("{} records to fetch".format(get_count(year)))
    pmid_chunks = chunked_pmids(year, 500)
    papers, notpapers = getpapers(pmid_chunks)
    
    return papers, notpapers

def results(df):
    """
    Make some results
    """
    print(df['journal'].unique())
    print('\n')
    
    print(df['author_affiliation'].str.contains('|'.join(list_of_lmics)).value_counts())
    print('contains lmic\n')

    print(df['author_affiliation'].str.contains('|'.join(list_of_lmics)).value_counts(normalize=True))
    print('contains lmic\n')

"""
# deprecated geocoding experiments, if did pursue this would start with named entity recognition
# we wanna know which author affiliations are LMIC countries
# initial tack, let's geocode then use country boundaries.... 

# Import the geocoding tool
from geopandas.tools import geocode

# Geocode addresses using Nominatim. Remember to provide a custom "application name" in the user_agent parameter!
geo = geocode(df['author_affiliation'][0][-10:], provider='nominatim', user_agent='drcjar_geotimes', timeout=4)

geocode("Imperial College London", provider='nominatim', user_agent='drcjar_geotimes', timeout=4).values

geocode("From the American College of Occupational and Environmental Medicine, ElkGrove, Illinois.", provider='nominatim', user_agent='drcjar_geotimes', timeout=4)

# our datas author_affiliation field is not readily geocoded
# it gets pretty hacky fast e.g https://pypi.org/project/pubmed-author-affiliation/ chopping strings based on 
# prescence of 'university' or 'institution' to make geocodeable
# more sensible approach is to search for LMIC names in strings
"""

'\n# deprecated geocoding experiments, if did pursue this would start with named entity recognition\n# we wanna know which author affiliations are LMIC countries\n# initial tack, let\'s geocode then use country boundaries.... \n\n# Import the geocoding tool\nfrom geopandas.tools import geocode\n\n# Geocode addresses using Nominatim. Remember to provide a custom "application name" in the user_agent parameter!\ngeo = geocode(df[\'author_affiliation\'][0][-10:], provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4)\n\ngeocode("Imperial College London", provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4).values\n\ngeocode("From the American College of Occupational and Environmental Medicine, ElkGrove, Illinois.", provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4)\n\n# our datas author_affiliation field is not readily geocoded\n# it gets pretty hacky fast e.g https://pypi.org/project/pubmed-author-affiliation/ chopping strings based on \n# prescen

In [10]:
# make a dict of journal papers retrieved using our pubmed paper fetching function and save them
# we currently save them to a sqllite database which is overkill for small number of results; could use 
# .csv instead but might be helpful for scaling or web apps
# uncommment to run; commented out so we don't download afresh each run

# target occ lung dis journals


target_journals = ["journal of occupational and environmental medicine [Journal]", "occupational medicine [Journal]","the american journal of industrial medicine [journal]","occupational environmental medicine [journal]"]
print(target_journals)
print("\n")


papers = {}
notpapers = {}

for journal in target_journals:
    print("fetching {}\n".format(journal))
    papers[journal], notpapers[journal] = fetch_papers(journal)
    
# smash together our dict of lists into a single list
papers = list(set().union(*papers.values()))
# notpapers = list(set().union(*notpapers.values()))

save_papers(papers)


['journal of occupational and environmental medicine [Journal]', 'occupational medicine [Journal]', 'the american journal of industrial medicine [journal]', 'occupational environmental medicine [journal]']


fetching journal of occupational and environmental medicine [Journal]

6107 records to fetch
13 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
Processing block 12
fetching medline records:


100%|███████████████████████████████████████████| 13/13 [00:41<00:00,  3.18s/it]


fetching occupational medicine [Journal]

4859 records to fetch
10 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
fetching medline records:


100%|███████████████████████████████████████████| 10/10 [00:29<00:00,  2.99s/it]


fetching the american journal of industrial medicine [journal]

5869 records to fetch
12 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
fetching medline records:


100%|███████████████████████████████████████████| 12/12 [00:34<00:00,  2.90s/it]


fetching occupational environmental medicine [journal]

4747 records to fetch
10 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
fetching medline records:


100%|███████████████████████████████████████████| 10/10 [00:30<00:00,  3.07s/it]


In [12]:
"""
example of 'not a paper' for our purposes this is a pubmed record that does not have all the fields 
of interest for our analysis. 

this is why N of records in pubmed search is not equal to N of records in our analysis.  

we consider records that have the following fields: AU, AD, DP, TI, JT, and PT. These correspond to 
author, affiliation, date of publication, title, journal title, and publication type.

                                 
see https://www.nlm.nih.gov/bsd/mms/medlineelements.html for more information.
"""

# notpapers['journal of occupational and environmental medicine [Journal]'][9]

"\nexample of 'not a paper' for our purposes this is a pubmed record that does not have all the fields \nof interest for our analysis. \n\nthis is why N of records in pubmed search is not equal to N of records in our analysis.  \n\nwe consider records that have the following fields: AU, AD, DP, TI, JT, and PT. These correspond to \nauthor, affiliation, date of publication, title, journal title, and publication type.\n\n                                 \nsee https://www.nlm.nih.gov/bsd/mms/medlineelements.html for more information.\n"

In [14]:
# load our papers database as a dataframe
df = df_from_papers_database()

In [16]:
len(df)

16389

In [18]:
df[df.pmid == "36379677"].author_affiliation.values

array(['Environment and Lifestyle Epidemiology Branch, International Agency for Research on Cancer (IARC), Lyon, France., Cancer Research Center, Cancer Institute of the Islamic Republic of Iran, Tehran, The Islamic Republic of Iran., Environment and Lifestyle Epidemiology Branch, International Agency for Research on Cancer (IARC), Lyon, France., Environment and Lifestyle Epidemiology Branch, International Agency for Research on Cancer (IARC), Lyon, France., Research Directorate, Veterans Affairs Canada, Charlottetown, Prince Edward Island, Canada., Cancer Research Center, Cancer Institute of the Islamic Republic of Iran, Tehran, The Islamic Republic of Iran., Health Sciences Unit, Faculty of Social Sciences, University of Tampere Faculty of Social Sciences, Tampere, Finland., Cancer Research Center, Cancer Institute of the Islamic Republic of Iran, Tehran, The Islamic Republic of Iran., Department of Epidemiology and Biostatistics, Kerman University of Medical Sciences, Kerman, The Is

In [20]:
df['author_affiliation'].head().values

array(['International Agency for Research on Cancer, Lyon, France. straif@iarc.fr',
       'Institute of Occupational Health, University of Birmingham, Edgbaston, Birmingham, UK. T.M.Sorahan@bham.ac.uk',
       'The HEART (Hypertension and Endothelial Function with Aerobic and Resistance Training) Laboratory, Health & Exercise Physiology Department, Ursinus College, Collegeville (Ms Derella, Ms Aichele, Ms Chavis, Mr Perez, Ms Getty, Ms Wisdo, Dr Feairheller); Temple School of Medicine, Temple University (Ms Oakman); Physical Therapy Division, Duke University, Durham, North Carolina (Ms Cromwell); Physical Therapy Division, Drexel University (Ms Hill), Philadelphia, Pennsylvania.',
       'Department of Orthopaedics, Royal Lancaster Infirmary, UK.',
       'School of Sport, Exercise and Health Sciences, Loughborough University, Loughborough (Dr Varela-Mato, Dr Clemes, Dr King, Dr Munir); National Institute of Health Research Leicester-Loughborough Diet, Lifestyle and Physical Activity 

In [22]:
# seems like a reasonable source of countries
countries = pd.read_excel('https://datacatalogfiles.worldbank.org/ddh-published/0037712/DR0090755/CLASS.xlsx')
countries = countries.head(218) # junk we don't need after this row
countries = countries[['Economy', 'Code', 'Income group']] # lose columns we don't need
countries['Economy'] = countries['Economy'].str.replace("(","") # it's best not to have brackets for later regex 
countries['Economy'] = countries['Economy'].str.replace(")","")

In [24]:
countries['Income group'].unique()

array(['Low income', 'Upper middle income', 'High income',
       'Lower middle income', nan], dtype=object)

In [26]:
# low and middle == 'Low income' or 'Lower middle income'
list_of_lmics = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income"))].dropna().Economy.to_list()
list_of_hics = countries[countries['Income group'] == 'High income'].dropna().Economy.to_list()

In [28]:
list_of_lmics_codes = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income"))].dropna().Code.to_list()

In [30]:
countries[countries['Economy'] =='Iran, Islamic Rep.']

Unnamed: 0,Economy,Code,Income group
91,"Iran, Islamic Rep.",IRN,Upper middle income


In [32]:
list_of_lmics

['Afghanistan',
 'Angola',
 'Bangladesh',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Côte d’Ivoire',
 'Djibouti',
 'Egypt, Arab Rep.',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Gambia, The',
 'Ghana',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Jordan',
 'Kenya',
 'Kiribati',
 "Korea, Dem. People's Rep.",
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Madagascar',
 'Malawi',
 'Mali',
 'Mauritania',
 'Micronesia, Fed. Sts.',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Nepal',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Pakistan',
 'Papua New Guinea',
 'Philippines',
 'Rwanda',
 'Samoa',
 'São Tomé and Príncipe',
 'Senegal',
 'Sierra Leone',
 'Solomon Islands',
 'Somalia',
 'South Sudan',
 'Sri Lanka',
 'Sudan',
 'Syrian Arab Republic',
 'Tajikistan',
 'Tanzania',
 'Timor-Leste',
 'Togo',
 'Tunisia',
 'Uganda'

In [33]:
# v probably don't need to do this / doesn't help
list_of_lmics = [i.split(',')[0] for i in list_of_lmics] # simplify names (i.e throw away sting after ',')

In [34]:
# this is list we're using for matching
# obviously it'll go wrong for korea
list_of_lmics

['Afghanistan',
 'Angola',
 'Bangladesh',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'Comoros',
 'Congo',
 'Congo',
 'Côte d’Ivoire',
 'Djibouti',
 'Egypt',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Gambia',
 'Ghana',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Jordan',
 'Kenya',
 'Kiribati',
 'Korea',
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Madagascar',
 'Malawi',
 'Mali',
 'Mauritania',
 'Micronesia',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Nepal',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Pakistan',
 'Papua New Guinea',
 'Philippines',
 'Rwanda',
 'Samoa',
 'São Tomé and Príncipe',
 'Senegal',
 'Sierra Leone',
 'Solomon Islands',
 'Somalia',
 'South Sudan',
 'Sri Lanka',
 'Sudan',
 'Syrian Arab Republic',
 'Tajikistan',
 'Tanzania',
 'Timor-Leste',
 'Togo',
 'Tunisia',
 'Uganda',
 'Uzbekistan',
 'Vanuatu',
 'Vietnam',
 'West Bank and Gaza',


In [35]:
# none of these relate to north korea so we can remove 'korea' as an lmic
df[df['author_affiliation'].str.contains('Korea')]['author_affiliation'].values

array(['Department of Preventive Medicine, College of Medicine, Seoul National University, 103 Daehangno, Jongno-gu, Seoul 110-799, Republic of Korea.',
       "From the Department of Urology, Sanggye Paik Hospital, Inje University College of Medicine, Seoul, Republic of Korea (J.Y.K.); From the Department of Preventive Medicine, Yonsei University College of Medicine, Seoul, Republic of Korea (J.Y.); Division of Health Administration, College of Software and Digital Healthcare Convergence, Yonsei University, Wonju, Republic of Korea (K.Y.); Department of Research and Analysis, National Health Insurance Service Ilsan Hospital, Goyang, Republic of Korea (W.R.L.); Department of Occupational and Environmental Medicine, Gil Medical Center, Gachon University College of Medicine, Incheon, Republic of Korea (W.L.); and Department of Occupational and Environmental Medicine, Seoul St Mary's Hospital, College of Medicine, The Catholic University of Korea, Seoul, Republic of Korea (M.Y.K.).",
    

In [36]:
list_of_lmics.remove('Korea')

In [37]:
list_of_lmics

['Afghanistan',
 'Angola',
 'Bangladesh',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'Comoros',
 'Congo',
 'Congo',
 'Côte d’Ivoire',
 'Djibouti',
 'Egypt',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Gambia',
 'Ghana',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Jordan',
 'Kenya',
 'Kiribati',
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Madagascar',
 'Malawi',
 'Mali',
 'Mauritania',
 'Micronesia',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Nepal',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Pakistan',
 'Papua New Guinea',
 'Philippines',
 'Rwanda',
 'Samoa',
 'São Tomé and Príncipe',
 'Senegal',
 'Sierra Leone',
 'Solomon Islands',
 'Somalia',
 'South Sudan',
 'Sri Lanka',
 'Sudan',
 'Syrian Arab Republic',
 'Tajikistan',
 'Tanzania',
 'Timor-Leste',
 'Togo',
 'Tunisia',
 'Uganda',
 'Uzbekistan',
 'Vanuatu',
 'Vietnam',
 'West Bank and Gaza',
 'Yemen',


In [38]:
# papers to check that Asaad gave us, now all present, Iran wasn't previously due to issue above
df[df.pmid.isin(["35228261",
"36280382",
"30530485",
"15613613",
"36572527",
"36379677",
"19671533",
"34799440",
"25794507"])]

Unnamed: 0,pmid,author,author_affiliation,date,title,journal,pub_type
1888,19671533,"Fullerton DG, Semple S, Kalambo F, Suseno A, M...",Malawi-Liverpool-Wellcome Clinical Research La...,2009 Nov,Biomass fuel use and indoor air pollution in h...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't"
1958,15613613,"Saha A, Kulkarni PK, Shah A, Patel M, Saiyed HN","Occupational Medicine Division, National Insti...",2005 Jan,Ocular morbidity and fuel use: an experience f...,Occupational and environmental medicine,Journal Article
2956,36280382,"Rabbani G, Nimmi N, Benke GP, Dharmage SC, Bui...","Bangladesh Betar, Dhaka, Bangladesh., Institut...",2023 Jan,Ever and cumulative occupational exposure and ...,Occupational and environmental medicine,"Journal Article, Meta-Analysis, Review, System..."
6734,36572527,"Das D, Dutta HK, Borbora D, Brahma RC, Das JM","Department of Surgery, Assam Medical College a...",2023 Feb,Assessing the relationship between hypospadias...,Occupational and environmental medicine,Journal Article
7895,36379677,"Hosseini B, Olsson A, Bouaoun L, Hall A, Hadji...","Environment and Lifestyle Epidemiology Branch,...",2022 Dec,Lung cancer risk in relation to jobs held in a...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't"
8973,35228261,"Lee MS, Eum KD, Golam M, Quamruzzaman Q, Kile ...","Department of Environmental Health, Harvard T....",2022 May,Household use of crop residues and fuelwood fo...,Occupational and environmental medicine,"Journal Article, Research Support, N.I.H., Ext..."
12671,25794507,"Phung D, Rutherford S, Chu C, Wang X, Nguyen M...",Centre for Environment and Population Health (...,2015 Jul,Temperature as a risk factor for hospitalisati...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't"


In [39]:
len(df)

16389

In [40]:
# pre 2000
results(df[df['date'] < "2000"])

['Occupational medicine (Oxford, England)'
 'American journal of industrial medicine'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine'
 'Occupational medicine (Philadelphia, Pa.)']


author_affiliation
False    3977
True       40
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.990042
True     0.009958
Name: proportion, dtype: float64
contains lmic



In [41]:
# 2000 and later
results(df[df['date'] >= "2000"])

['Occupational and environmental medicine'
 'Journal of occupational and environmental medicine'
 'Occupational medicine (Philadelphia, Pa.)'
 'Occupational medicine (Oxford, England)'
 'American journal of industrial medicine']


author_affiliation
False    12169
True       203
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.983592
True     0.016408
Name: proportion, dtype: float64
contains lmic



In [42]:
df['journal'].unique()

array(['Occupational and environmental medicine',
       'Journal of occupational and environmental medicine',
       'Occupational medicine (Oxford, England)',
       'Occupational medicine (Philadelphia, Pa.)',
       'American journal of industrial medicine'], dtype=object)

In [43]:
# lets call OM Pa OM Ox.... 
df = df.replace('Occupational medicine (Philadelphia, Pa.)', 'Occupational medicine (Oxford, England)' )
journals = df['journal'].unique()
df.to_csv('for_asaad.csv')

In [44]:
# pre 2000
results(df[df['date'] < "2000"])

['Occupational medicine (Oxford, England)'
 'American journal of industrial medicine'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine']


author_affiliation
False    3977
True       40
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.990042
True     0.009958
Name: proportion, dtype: float64
contains lmic



In [45]:
# pre 2000 by journal
[results(df[(df['date'] < "2000") & (df['journal'] == journal)]) for journal in journals]

['Occupational and environmental medicine']


author_affiliation
False    707
True       3
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.995775
True     0.004225
Name: proportion, dtype: float64
contains lmic

['Journal of occupational and environmental medicine']


author_affiliation
False    558
True       5
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.991119
True     0.008881
Name: proportion, dtype: float64
contains lmic

['Occupational medicine (Oxford, England)']


author_affiliation
False    1063
True       15
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.986085
True     0.013915
Name: proportion, dtype: float64
contains lmic

['American journal of industrial medicine']


author_affiliation
False    1649
True       17
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.989796
True     0.010204
Name: proportion, dtype: float64
contains lmic



[None, None, None, None]

In [46]:
# after 2000 by journal
results(df[(df['date'] >= "2000")])

['Occupational and environmental medicine'
 'Journal of occupational and environmental medicine'
 'Occupational medicine (Oxford, England)'
 'American journal of industrial medicine']


author_affiliation
False    12169
True       203
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.983592
True     0.016408
Name: proportion, dtype: float64
contains lmic



In [47]:
# after 2000 by journal
[results(df[(df['date'] >= "2000") & (df['journal'] == journal)]) for journal in journals]

['Occupational and environmental medicine']


author_affiliation
False    3155
True       46
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.985629
True     0.014371
Name: proportion, dtype: float64
contains lmic

['Journal of occupational and environmental medicine']


author_affiliation
False    4163
True       82
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.980683
True     0.019317
Name: proportion, dtype: float64
contains lmic

['Occupational medicine (Oxford, England)']


author_affiliation
False    2123
True       35
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.983781
True     0.016219
Name: proportion, dtype: float64
contains lmic

['American journal of industrial medicine']


author_affiliation
False    2728
True       40
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.985549
True     0.014451
Name: proportion, dtype: float64
contains lmic



[None, None, None, None]

In [48]:
# we use this data to make our Choropleth

lmic_dict_papers = {}
for lmic in list_of_lmics:
    print(lmic,len(df) - df.author_affiliation.str.contains(lmic).value_counts()[0])
    lmic_dict_papers[lmic] = len(df) - df.author_affiliation.str.contains(lmic).value_counts()[0]

Afghanistan 0
Angola 0
Bangladesh 6
Benin 3
Bhutan 2
Bolivia 0
Burkina Faso 0
Burundi 0
Cabo Verde 0
Cambodia 0
Cameroon 0
Central African Republic 0
Chad 0
Comoros 0
Congo 1
Congo 1
Côte d’Ivoire 0
Djibouti 0
Egypt 14
Eritrea 0
Eswatini 0
Ethiopia 8
Gambia 0
Ghana 4
Guinea 0
Guinea-Bissau 0
Haiti 0
Honduras 0
India 101
Jordan 5
Kenya 1
Kiribati 0
Kyrgyz Republic 0
Lao PDR 0
Lebanon 18
Lesotho 0
Liberia 0
Madagascar 0
Malawi 1
Mali 6
Mauritania 0
Micronesia 0
Morocco 1
Mozambique 0
Myanmar 0
Nepal 3
Nicaragua 7
Niger 12
Nigeria 12
Pakistan 15
Papua New Guinea 0
Philippines 4
Rwanda 0
Samoa 0
São Tomé and Príncipe 0
Senegal 1
Sierra Leone 1
Solomon Islands 0
Somalia 0
South Sudan 0
Sri Lanka 15
Sudan 5
Syrian Arab Republic 0
Tajikistan 0
Tanzania 6
Timor-Leste 0
Togo 1
Tunisia 3
Uganda 1
Uzbekistan 0
Vanuatu 0
Vietnam 4
West Bank and Gaza 0
Yemen 0
Zambia 2
Zimbabwe 1


In [49]:
# e.g
df[df.author_affiliation.str.contains('Pakistan')]

Unnamed: 0,pmid,author,author_affiliation,date,title,journal,pub_type
2561,35608827,"Ariza-Montes A, Mahmood F, Han H, Saleem M","Universidad Loyola Andalucia, Spain (Dr Ariza-...",2022 May 1,The Mental Well-Being of Health Care Professio...,Journal of occupational and environmental medi...,Journal Article
3969,32956236,"Sarfraz A, Sarfraz Z, Anwer A, Sarfraz M, Sidd...","Aga Khan University, Karachi, Pakistan (Dr Azz...",2020 Nov,"Availability, Use, and Satisfaction of Persona...",Journal of occupational and environmental medi...,Journal Article
4226,33234873,"Kumar D, Saghir T, Ali G, Yasin U, Furnaz S, K...",National Institute of Cardiovascular Diseases ...,2021 Feb 1,Psychosocial Impact of COVID-19 on Healthcare ...,Journal of occupational and environmental medi...,Journal Article
5616,27032413,"Khattak I, Mushtaq MH, Ahmad MU, Khan MS, Haid...","Department of Epidemiology and Public Health, ...",2016 Jul,Zoonotic tuberculosis in occupationally expose...,"Occupational medicine (Oxford, England)",Journal Article
5677,35672913,"Rabbani U, Razzaq S, Irfan M, Semple S, Nafees AA","From the Family Medicine Academy, Qassim Healt...",2022 Sep 1,Indoor Air Pollution and Respiratory Health in...,Journal of occupational and environmental medi...,"Journal Article, Research Support, Non-U.S. Gov't"
6475,26265670,"Khan AW, Kundi M, Moshammer H","Institute for Environmental Health, Center for...",2015 Oct,Diminished pulmonary function in long-term wor...,Occupational and environmental medicine,"Comparative Study, Journal Article, Research S..."
6861,32890224,"Khisroon M, Khan A, Hassan N, Zaidi F, Farooqi J","Department of Zoology, University of Peshawar,...",2020 Sep,Biomonitoring of DNA Damage in Photocopiers' W...,Journal of occupational and environmental medi...,"Journal Article, Research Support, Non-U.S. Gov't"
7278,14534448,"Meo SA, Azeem MA, Subhan MM","Department of Physiology, Hamdard College of M...",2003 Oct,Lung function in Pakistani welding workers.,Journal of occupational and environmental medi...,"Comparative Study, Journal Article"
7497,33021515,"Adnan S, Hanif M, Khan AH, Latif M, Ullah K, B...","Pakistan Meteorological Department (Dr Adnan, ...",2021 Feb 1,Impact of Heat Index and Ultraviolet Index on ...,Journal of occupational and environmental medi...,"Journal Article, Multicenter Study"
11805,23155188,"Nafees AA, Fatmi Z, Kadir MM, Sathiakumar N","Division of Environmental Health Sciences, Dep...",2013 Feb,Pattern and predictors for respiratory illness...,Occupational and environmental medicine,"Journal Article, Research Support, N.I.H., Ext..."


In [50]:
df_lmic_count = pd.DataFrame([lmic_dict_papers]).melt()
df_lmic_count.columns = ['Economy','N']

# use dataframe ealier 
countriesdf = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income"))]
countriesdf.loc[:,'Economy'] = countriesdf['Economy'].str.split(",").str[0] #simplify names for consistency

gdf = pd.merge(countriesdf.dropna(), df_lmic_count)

In [51]:
# gdf = gdf[gdf['N'] > 0] # lets not include LMICS with zero papers

In [70]:
print('Number of papers where the author affiliation includes one or more instances of the country name. Low and lower middle income countries, as defined by the world bank, shown for four occupational lung disease journals (Journal of occupational and environmental medicine, Occupational medicine, American journal of industrial medicine, Occupational and environmental medicine) accessed via PubMed 1987-2024')

Number of papers where the author affiliation includes one or more instances of the country name. Low and lower middle income countries, as defined by the world bank, shown for four occupational lung disease journals (Journal of occupational and environmental medicine, Occupational medicine, American journal of industrial medicine, Occupational and environmental medicine) accessed via PubMed 1987-2024


In [63]:
# colorcale choice https://plotly.com/python/builtin-colorscales/

fig = go.Figure(data=go.Choropleth(
    locations = gdf['Code'],
    z = gdf['N'],
    text = gdf['Economy'],
    colorscale = 'Viridis',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title = 'N of papers per LMIC'))

fig.show()

In [54]:
df.date.min()

'1987'

In [55]:
df.date.max()[:4]

'2024'

In [73]:
!python --version

Python 3.10.12
