In [1]:
# issue of mis match with asaads online search
# identified that not all entries have fields we desire e.g. abstract
# I had been discarding them
# now save 'notpapers'
# and relax criteria e.g. do we really care if no abstract or key words?? 
# n.b we should prob do something useful with other fields (or not require them)

In [2]:
from Bio import Entrez
from Bio import Medline
from tqdm import tqdm
import pandas as pd
import sqlite3
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import xlrd
import plotly.graph_objects as go
%matplotlib inline

In [3]:
def get_count(term):
    """
    Use pubmed api to get a count of number of results for a search term
    """
    
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  rettype="count",
                                  #field="DP",
                                  term=term)
    count_results = Entrez.read(count_handle)
    count = int(count_results["Count"])
    
    return count

def chunked_pmids(term, chunksize=1000):
    """
    Use pubmed api to fetch blocks of pmids for a search term
    """
    
    count = get_count(term)
    
    retmax_requests = list(range(0, count, chunksize))
                
    pmids = []
    
    print("{} blocks of to process".format(len(retmax_requests)))
    
    for i, retmax in enumerate(retmax_requests):
        
        print("Processing block {}".format(i))
        
        pmid_handle = Entrez.esearch(db="pubmed",
                                     sort="relevance",
                                     retmode="xml",
                                     usehistory='y',
                                     retstart=retmax,
                                     retmax=chunksize,
                                     #field="DP",
                                     term=term)
        pmids.append(Entrez.read(pmid_handle)["IdList"])
            
    return pmids

def fetch_medline(pmids):
    """
    Use pubmed api to fetch medline record for pmids
    """
    
    Entrez.email = "carl.reynolds@imperial.ac.uk"
    handle = Entrez.efetch(db='pubmed',
                           id=pmids,
                           rettype='medline',
                           retmode='text')
    records = Medline.parse(handle)
    
    return records

def getpapers(pmid_chunks):
    """
    Fetch_medline(chunk) returns a generator object of medline records. we iterate through it saving the records 
    to a list. We make a dict of the list indexed by the pubmed id. 
    """
    
    papers = []
    notpapers = []
    
    print("fetching medline records:")
    
    for chunk in tqdm(pmid_chunks):
        records = fetch_medline(chunk)
        for record in records:
            try:
                  papers.append((record['PMID'], (', '.join(record['AU'])),(', '.join(record['AD'])),
                                 record['DP'], record['TI'], record['JT'], 
                                 (', '.join(record['PT']))))            
            except: 
                notpapers.append(record)
                continue
            
    return papers, notpapers


def save_papers(papers):
    """
    Save our papers to an sqlite database
    """
    conn = sqlite3.connect('papers.db')
    c = conn.cursor()
    
    # Drop table if already exists
    c.execute("DROP TABLE IF EXISTS papers")
    
    # Create table
    c.execute('''CREATE TABLE papers
             (pmid, author, author_affiliation, date, title, journal, pub_type)''')

    # Insert a rows of data
    c.executemany('INSERT INTO papers VALUES (?,?,?,?,?,?,?)', papers)

    # Save (commit) the changes
    conn.commit()

    # We can also close the connection if we are done with it.
    # Just be sure any changes have been committed or they will be lost.
    conn.close()
    
def df_from_papers_database():
    """
    Load our papers
    """
    conn = sqlite3.connect('papers.db')
    df = pd.read_sql_query('SELECT * FROM papers', conn)
    return df

def fetch_papers(year):
    """
    Fetch our papers
    """
    print("{} records to fetch".format(get_count(year)))
    pmid_chunks = chunked_pmids(year, 500)
    papers, notpapers = getpapers(pmid_chunks)
    
    return papers, notpapers

def results(df):
    """
    Make some results
    """
    print(df['journal'].unique())
    print('\n')
    
    print(df['author_affiliation'].str.contains('|'.join(list_of_lmics)).value_counts())
    print('contains lmic\n')

    print(df['author_affiliation'].str.contains('|'.join(list_of_lmics)).value_counts(normalize=True))
    print('contains lmic\n')

"""
# deprecated geocoding experiments, if did pursue this would start with named entity recognition
# we wanna know which author affiliations are LMIC countries
# initial tack, let's geocode then use country boundaries.... 

# Import the geocoding tool
from geopandas.tools import geocode

# Geocode addresses using Nominatim. Remember to provide a custom "application name" in the user_agent parameter!
geo = geocode(df['author_affiliation'][0][-10:], provider='nominatim', user_agent='drcjar_geotimes', timeout=4)

geocode("Imperial College London", provider='nominatim', user_agent='drcjar_geotimes', timeout=4).values

geocode("From the American College of Occupational and Environmental Medicine, ElkGrove, Illinois.", provider='nominatim', user_agent='drcjar_geotimes', timeout=4)

# our datas author_affiliation field is not readily geocoded
# it gets pretty hacky fast e.g https://pypi.org/project/pubmed-author-affiliation/ chopping strings based on 
# prescence of 'university' or 'institution' to make geocodeable
# more sensible approach is to search for LMIC names in strings

"""

'\n# deprecated geocoding experiments, if did pursue this would start with named entity recognition\n# we wanna know which author affiliations are LMIC countries\n# initial tack, let\'s geocode then use country boundaries.... \n\n# Import the geocoding tool\nfrom geopandas.tools import geocode\n\n# Geocode addresses using Nominatim. Remember to provide a custom "application name" in the user_agent parameter!\ngeo = geocode(df[\'author_affiliation\'][0][-10:], provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4)\n\ngeocode("Imperial College London", provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4).values\n\ngeocode("From the American College of Occupational and Environmental Medicine, ElkGrove, Illinois.", provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4)\n\n# our datas author_affiliation field is not readily geocoded\n# it gets pretty hacky fast e.g https://pypi.org/project/pubmed-author-affiliation/ chopping strings based on \n# prescen

In [4]:
# make a dict of journal papers retrieved using our pubmed paper fetching function and save them
# we currently save them to a sqllite database which is overkill for small number of results; could use 
# .csv instead but might be helpful for scaling or web apps
# uncommment to run; commented out so we don't download afresh each run

# target occ lung dis journals
target_journals = ["journal of occupational and environmental medicine [Journal]", "occupational medicine [Journal]","the american journal of industrial medicine [journal]","occupational environmental medicine [journal]"]
print(target_journals)
print("\n")


papers = {}
notpapers = {}

for journal in target_journals:
    print("fetching {}\n".format(journal))
    papers[journal], notpapers[journal] = fetch_papers(journal)

['journal of occupational and environmental medicine [Journal]', 'occupational medicine [Journal]', 'the american journal of industrial medicine [journal]', 'occupational environmental medicine [journal]']


fetching journal of occupational and environmental medicine [Journal]

6078 records to fetch
13 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
Processing block 12
fetching medline records:


100%|███████████████████████████████████████████| 13/13 [00:39<00:00,  3.02s/it]


fetching occupational medicine [Journal]

4840 records to fetch
10 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
fetching medline records:


100%|███████████████████████████████████████████| 10/10 [00:27<00:00,  2.73s/it]


fetching the american journal of industrial medicine [journal]

5854 records to fetch
12 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
Processing block 10
Processing block 11
fetching medline records:


100%|███████████████████████████████████████████| 12/12 [00:34<00:00,  2.92s/it]


fetching occupational environmental medicine [journal]

4733 records to fetch
10 blocks of to process
Processing block 0
Processing block 1
Processing block 2
Processing block 3
Processing block 4
Processing block 5
Processing block 6
Processing block 7
Processing block 8
Processing block 9
fetching medline records:


100%|███████████████████████████████████████████| 10/10 [00:35<00:00,  3.59s/it]


In [5]:
"""
example of 'not a paper' for our purposes this is a pubmed record that does not have all the fields 
of interest for our analysis. 

this is why N of records in pubmed search is not equal to N of records in our analysis.  

we consider records that have the following fields: AU, AD, DP, TI, JT, and PT. These correspond to 
author, affiliation, date of publication, title, journal title, and publication type.

                                 
see https://www.nlm.nih.gov/bsd/mms/medlineelements.html for more information.
"""

notpapers['journal of occupational and environmental medicine [Journal]'][9]

{'PMID': '38748237',
 'OWN': 'NLM',
 'STAT': 'Publisher',
 'LR': '20240515',
 'IS': '1536-5948 (Electronic) 1076-2752 (Linking)',
 'DP': '2024 May 15',
 'TI': 'Support needs for return to work among self-employed workers: A focus group study.',
 'LID': '10.1097/JOM.0000000000003148 [doi]',
 'AB': 'OBJECTIVE: The aim of this study is to gain insight into the facilitators, barriers, and support needs of Dutch self-employed workers when returning to work (RTW) after sick leave. METHODS: Three focus groups were conducted, involving 15 Dutch self-employed workers who were on sick leave due to health problems. The transcripts were analysed through thematic content analysis. RESULTS: Five main themes regarding barriers, facilitators and needs of self-employed workers to RTW were identified: autonomy, social support, client management, financial security and information on sick leave. Having autonomy was considered a facilitator for RTW. However, the participants expressed a need for more fina

In [6]:
len(papers['journal of occupational and environmental medicine [Journal]'])

5504

In [7]:
len(notpapers['journal of occupational and environmental medicine [Journal]'])

574

In [8]:
len(papers['journal of occupational and environmental medicine [Journal]'])+len(notpapers['journal of occupational and environmental medicine [Journal]'])

6078

In [9]:
len(papers["occupational medicine [Journal]"])

3806

In [10]:
len(papers["the american journal of industrial medicine [journal]"])

4955

In [11]:
len(papers["occupational environmental medicine [journal]"])

4187

In [None]:
# smash together our dict of lists into a single list
papers = list(set().union(*papers.values()))
# notpapers = list(set().union(*notpapers.values()))

save_papers(papers)

In [15]:
# load our papers database as a dataframe
df = df_from_papers_database()

In [16]:
len(df)

18452

In [17]:
df[df.pmid == "36379677"].author_affiliation.values

array(['Environment and Lifestyle Epidemiology Branch, International Agency for Research on Cancer (IARC), Lyon, France., Cancer Research Center, Cancer Institute of the Islamic Republic of Iran, Tehran, The Islamic Republic of Iran., Environment and Lifestyle Epidemiology Branch, International Agency for Research on Cancer (IARC), Lyon, France., Environment and Lifestyle Epidemiology Branch, International Agency for Research on Cancer (IARC), Lyon, France., Research Directorate, Veterans Affairs Canada, Charlottetown, Prince Edward Island, Canada., Cancer Research Center, Cancer Institute of the Islamic Republic of Iran, Tehran, The Islamic Republic of Iran., Health Sciences Unit, Faculty of Social Sciences, University of Tampere Faculty of Social Sciences, Tampere, Finland., Cancer Research Center, Cancer Institute of the Islamic Republic of Iran, Tehran, The Islamic Republic of Iran., Department of Epidemiology and Biostatistics, Kerman University of Medical Sciences, Kerman, The Is

In [18]:
df['author_affiliation'].head().values

array(['Unit of Epidemiology, Regional Health Service ASL TO3, Grugliasco (Turin), Italy., Unit of Epidemiology, Regional Health Service ASL TO3, Grugliasco (Turin), Italy., Department of Clinical and Biological Sciences, University of Turin, Orbassano (Turin), Italy., Department of Epidemiology and Environmental Health, Regional Environmental Protection Agency (ARPA Piemonte), Turin, Italy., Department of Public Health, Prevention and Security Area Work Environments, Local Health Authority, Bologna, Italy., Department of Public Health, Prevention and Security Area Work Environments, Local Health Authority, Bologna, Italy., Department of Public Health, Prevention and Security Area Work Environments, Local Health Authority, Bologna, Italy., Unit of Epidemiology, Regional Health Service ASL TO3, Grugliasco (Turin), Italy.',
       'Department of Primary and Interdisciplinary Care, University of Antwerp, Antwerp, Belgium. kathleen.vanroyen@ua.ac.be',
       'From the Research Group on Psy

In [19]:
# seems like a reasonable source of countries
countries = pd.read_excel('https://datacatalogfiles.worldbank.org/ddh-published/0037712/DR0090755/CLASS.xlsx')
countries = countries.head(218) # junk we don't need after this row
countries = countries[['Economy', 'Code', 'Income group']] # lose columns we don't need
countries['Economy'] = countries['Economy'].str.replace("(","") # it's best not to have brackets for later regex 
countries['Economy'] = countries['Economy'].str.replace(")","")

In [20]:
countries['Income group'].unique()

array(['Low income', 'Upper middle income', 'High income',
       'Lower middle income', nan], dtype=object)

In [21]:
# low and middle == 'Low income' or 'Lower middle income'
list_of_lmics = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income"))].dropna().Economy.to_list()
list_of_hics = countries[countries['Income group'] == 'High income'].dropna().Economy.to_list()

In [22]:
list_of_lmics_codes = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income"))].dropna().Code.to_list()

In [23]:
list_of_lmics

['Afghanistan',
 'Angola',
 'Bangladesh',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Côte d’Ivoire',
 'Djibouti',
 'Egypt, Arab Rep.',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Gambia, The',
 'Ghana',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Jordan',
 'Kenya',
 'Kiribati',
 "Korea, Dem. People's Rep.",
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Madagascar',
 'Malawi',
 'Mali',
 'Mauritania',
 'Micronesia, Fed. Sts.',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Nepal',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Pakistan',
 'Papua New Guinea',
 'Philippines',
 'Rwanda',
 'Samoa',
 'São Tomé and Príncipe',
 'Senegal',
 'Sierra Leone',
 'Solomon Islands',
 'Somalia',
 'South Sudan',
 'Sri Lanka',
 'Sudan',
 'Syrian Arab Republic',
 'Tajikistan',
 'Tanzania',
 'Timor-Leste',
 'Togo',
 'Tunisia',
 'Uganda'

In [24]:
list_of_lmics = [i.split(',')[0] for i in list_of_lmics] # simplify names (i.e throw away sting after ',')

In [25]:
# papers to check that Asaad gave us, now all present, Iran wasn't previously due to issue above
df[df.pmid.isin(["35228261",
"36280382",
"30530485",
"15613613",
"36572527",
"36379677",
"19671533",
"34799440",
"25794507"])]

Unnamed: 0,pmid,author,author_affiliation,date,title,journal,pub_type
2277,36572527,"Das D, Dutta HK, Borbora D, Brahma RC, Das JM","Department of Surgery, Assam Medical College a...",2023 Feb,Assessing the relationship between hypospadias...,Occupational and environmental medicine,Journal Article
2664,15613613,"Saha A, Kulkarni PK, Shah A, Patel M, Saiyed HN","Occupational Medicine Division, National Insti...",2005 Jan,Ocular morbidity and fuel use: an experience f...,Occupational and environmental medicine,Journal Article
5806,34799440,"Nafees AA, Muneer MZ, De Matteis S, Amaral A, ...","Department of Community Health Sciences, Aga K...",2022 Apr,Impact of using different predictive equations...,Occupational and environmental medicine,"Journal Article, Randomized Controlled Trial, ..."
8964,19671533,"Fullerton DG, Semple S, Kalambo F, Suseno A, M...",Malawi-Liverpool-Wellcome Clinical Research La...,2009 Nov,Biomass fuel use and indoor air pollution in h...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't"
10710,36280382,"Rabbani G, Nimmi N, Benke GP, Dharmage SC, Bui...","Bangladesh Betar, Dhaka, Bangladesh., Institut...",2023 Jan,Ever and cumulative occupational exposure and ...,Occupational and environmental medicine,"Journal Article, Meta-Analysis, Review, System..."
10900,25794507,"Phung D, Rutherford S, Chu C, Wang X, Nguyen M...",Centre for Environment and Population Health (...,2015 Jul,Temperature as a risk factor for hospitalisati...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't"
11732,35228261,"Lee MS, Eum KD, Golam M, Quamruzzaman Q, Kile ...","Department of Environmental Health, Harvard T....",2022 May,Household use of crop residues and fuelwood fo...,Occupational and environmental medicine,"Journal Article, Research Support, N.I.H., Ext..."
12788,36379677,"Hosseini B, Olsson A, Bouaoun L, Hall A, Hadji...","Environment and Lifestyle Epidemiology Branch,...",2022 Dec,Lung cancer risk in relation to jobs held in a...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't"
16394,30530485,"Adler C, Friesen MC, Yeboah ED, Tettey Y, Biri...","Rollins School of Public Health, Emory Univers...",2019 Feb,Usual adult occupation and risk of prostate ca...,Occupational and environmental medicine,"Journal Article, Research Support, N.I.H., Int..."


In [26]:
len(df)

18452

In [27]:
# pre 2000
results(df[df['date'] < "2000"])

['American journal of industrial medicine'
 'Occupational medicine (Philadelphia, Pa.)'
 'Occupational medicine (Oxford, England)'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine']


author_affiliation
False    4406
True       45
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.98989
True     0.01011
Name: proportion, dtype: float64
contains lmic



In [28]:
# 2000 and later
results(df[df['date'] >= "2000"])

['American journal of industrial medicine'
 'Occupational medicine (Oxford, England)'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine'
 'Occupational medicine (Philadelphia, Pa.)']


author_affiliation
False    13563
True       438
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.968717
True     0.031283
Name: proportion, dtype: float64
contains lmic



In [29]:
df['journal'].unique()

array(['American journal of industrial medicine',
       'Occupational medicine (Oxford, England)',
       'Journal of occupational and environmental medicine',
       'Occupational medicine (Philadelphia, Pa.)',
       'Occupational and environmental medicine'], dtype=object)

In [30]:
# lets call OM Pa OM Ox.... 
df = df.replace('Occupational medicine (Philadelphia, Pa.)', 'Occupational medicine (Oxford, England)' )
journals = df['journal'].unique()
df.to_csv('for_asaad.csv')

In [31]:
# pre 2000
results(df[df['date'] < "2000"])

['American journal of industrial medicine'
 'Occupational medicine (Oxford, England)'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine']


author_affiliation
False    4406
True       45
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.98989
True     0.01011
Name: proportion, dtype: float64
contains lmic



In [32]:
# pre 2000 by journal
[results(df[(df['date'] < "2000") & (df['journal'] == journal)]) for journal in journals]

['American journal of industrial medicine']


author_affiliation
False    1925
True       19
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.990226
True     0.009774
Name: proportion, dtype: float64
contains lmic

['Occupational medicine (Oxford, England)']


author_affiliation
False    1116
True       15
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.986737
True     0.013263
Name: proportion, dtype: float64
contains lmic

['Journal of occupational and environmental medicine']


author_affiliation
False    559
True       6
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.989381
True     0.010619
Name: proportion, dtype: float64
contains lmic

['Occupational and environmental medicine']


author_affiliation
False    806
True       5
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.993835
True     0.006165
Name: proportion, dtype: float64
contains lmic



[None, None, None, None]

In [45]:
# after 2000 by journal
results(df[(df['date'] >= "2000")])

['American journal of industrial medicine'
 'Occupational medicine (Oxford, England)'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine']


author_affiliation
False    13563
True       438
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.968717
True     0.031283
Name: proportion, dtype: float64
contains lmic



In [34]:
# after 2000 by journal
[results(df[(df['date'] >= "2000") & (df['journal'] == journal)]) for journal in journals]

['American journal of industrial medicine']


author_affiliation
False    2906
True      105
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.965128
True     0.034872
Name: proportion, dtype: float64
contains lmic

['Occupational medicine (Oxford, England)']


author_affiliation
False    2617
True       58
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.978318
True     0.021682
Name: proportion, dtype: float64
contains lmic

['Journal of occupational and environmental medicine']


author_affiliation
False    4762
True      177
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.964163
True     0.035837
Name: proportion, dtype: float64
contains lmic

['Occupational and environmental medicine']


author_affiliation
False    3278
True       98
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.970972
True     0.029028
Name: proportion, dtype: float64
contains lmic



[None, None, None, None]

In [35]:
lmic_dict_papers = {}
for lmic in list_of_lmics:
    print(lmic,len(df) - df.author_affiliation.str.contains(lmic).value_counts()[0])
    lmic_dict_papers[lmic] = len(df) - df.author_affiliation.str.contains(lmic).value_counts()[0]

Afghanistan 0
Angola 0
Bangladesh 7
Benin 4
Bhutan 2
Bolivia 0
Burkina Faso 0
Burundi 0
Cabo Verde 0
Cambodia 0
Cameroon 0
Central African Republic 0
Chad 0
Comoros 0
Congo 2
Congo 2
Côte d’Ivoire 0
Djibouti 0
Egypt 16
Eritrea 0
Eswatini 0
Ethiopia 9
Gambia 0
Ghana 6
Guinea 0
Guinea-Bissau 0
Haiti 0
Honduras 1
India 108
Jordan 7
Kenya 2
Kiribati 0
Korea 211
Kyrgyz Republic 0
Lao PDR 0
Lebanon 20
Lesotho 1
Liberia 0
Madagascar 0
Malawi 1
Mali 8
Mauritania 0
Micronesia 0
Morocco 1
Mozambique 0
Myanmar 0
Nepal 3
Nicaragua 8
Niger 13
Nigeria 13
Pakistan 19
Papua New Guinea 0
Philippines 4
Rwanda 0
Samoa 0
São Tomé and Príncipe 0
Senegal 1
Sierra Leone 1
Solomon Islands 0
Somalia 0
South Sudan 0
Sri Lanka 18
Sudan 5
Syrian Arab Republic 0
Tajikistan 0
Tanzania 8
Timor-Leste 0
Togo 1
Tunisia 3
Uganda 1
Uzbekistan 0
Vanuatu 0
Vietnam 4
West Bank and Gaza 0
Yemen 0
Zambia 3
Zimbabwe 3


In [36]:
df_lmic_count = pd.DataFrame([lmic_dict_papers]).melt()
df_lmic_count.columns = ['Economy','N']

# use dataframe ealier 
countriesdf = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income"))]
countriesdf.loc[:,'Economy'] = countriesdf['Economy'].str.split(",").str[0] #simplify names for consistency

gdf = pd.merge(countriesdf.dropna(), df_lmic_count)

In [37]:
# gdf = gdf[gdf['N'] > 0] # lets not include LMICS with zero papers

In [44]:
# colorcale choice https://plotly.com/python/builtin-colorscales/

fig = go.Figure(data=go.Choropleth(
    locations = gdf['Code'],
    z = gdf['N'],
    text = gdf['Economy'],
    colorscale = 'Viridis',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title = 'Number of times country appears in a paper',
))

fig.show()