In [1]:
import Bio
from Bio import Entrez
Entrez.email = "carl.reynolds@imperial.ac.uk"
from Bio import Medline
from tqdm import tqdm
import pandas as pd
import sqlite3
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import xlrd
import plotly.graph_objects as go
import time
from retrying import retry
%matplotlib inline

In [2]:
def get_count(term):
    """
    Use pubmed API to get a count of number of results for a search term
    """
    count_handle = Entrez.esearch(db="pubmed",
                                  sort="relevance",
                                  retmode="xml",
                                  retmax=0,  # Request only the count
                                  term=term)
    count_results = Entrez.read(count_handle)
    count = int(count_results["Count"])
    
    return count


def chunked_pmids(term, chunksize):
    """
    Use pubmed API to fetch blocks of PMIDs for a search term
    """
    
    count = get_count(term)
    
    retmax_requests = list(range(0, count, chunksize))
                
    pmids = []
    
    print(f"{len(retmax_requests)} blocks to process, total papers count: {count}")
    
    for i, retmax in enumerate(retmax_requests):
        
        print(f"Processing block {i + 1}/{len(retmax_requests)}")
        
        try: 
            pmid_handle = Entrez.esearch(db="pubmed",
                                         sort="relevance",
                                         retmode="xml",
                                         usehistory='y',
                                         retstart=retmax,
                                         retmax=chunksize,
                                         term=term)
            result = Entrez.read(pmid_handle)
            id_list = result["IdList"]
            
            # Check if the IdList is empty
            if not id_list:
                print(f"No PMIDs found for block {i + 1}")
                continue
            
            pmids.append(id_list)
        
        except Exception as e:
            print(f"Error fetching block {i}: {e}")
            continue
            
        # Add a delay to respect API rate limits
        time.sleep(5)  # Adjust the delay as needed based on API limits
            
    return pmids

def fetch_medline(pmids):
    """
    Use pubmed api to fetch medline record for pmids
    """
    handle = Entrez.efetch(db='pubmed',
                           id=pmids,
                           rettype='medline',
                           retmode='text')
    records = Medline.parse(handle)
    
    return records
    

def save_new_papers(papers):
    """
    Save only new papers (that are not already in the database) to the SQLite database.
    """
    conn = sqlite3.connect('papers.db')
    c = conn.cursor()

    # Create table if it doesn't already exist
    c.execute('''CREATE TABLE IF NOT EXISTS papers
                 (pmid TEXT PRIMARY KEY, author TEXT, author_affiliation TEXT, date TEXT, 
                 title TEXT, journal TEXT, pub_type TEXT)''')

    # Fetch existing PMIDs from the database
    existing_pmids = set(row[0] for row in c.execute('SELECT pmid FROM papers').fetchall())

    new_papers = []
    for paper in papers:
        pmid = paper[0]
        if pmid not in existing_pmids:
            new_papers.append(paper)

    # Insert new papers into the database
    if new_papers:
        c.executemany('INSERT INTO papers VALUES (?,?,?,?,?,?,?)', new_papers)
        conn.commit()
        print(f"{len(new_papers)} new papers added to the database.")
    else:
        print("No new papers to add.")

    conn.close()

def getpapers(pmid_chunks, max_retries=10):
    """
    Fetch medline records for PMIDs with a retry mechanism if an error occurs.
    Append newly found complete papers to the database.
    """
    papers = []
    notpapers = []

    print("Fetching MEDLINE records:")

    total_fetched = 0  # To track total number of papers processed

    for i, chunk in enumerate(tqdm(pmid_chunks)):
        retries = 0
        success = False

        while retries < max_retries and not success:
            try:
                print(f"Fetching chunk {i + 1}/{len(pmid_chunks)} (attempt {retries + 1})")
                records = fetch_medline(chunk)
                records_list = list(records)  # Convert generator to list to count easily
                chunk_size = len(records_list)
                print(f"Fetched {chunk_size} records for chunk {i + 1}/{len(pmid_chunks)}")

                total_fetched += chunk_size

                for record in records_list:
                    try:
                        paper = (
                            record['PMID'], 
                            ', '.join(record.get('AU', ['Unknown'])), 
                            ', '.join(record.get('AD', ['Unknown'])),
                            record.get('DP', 'Unknown'), 
                            record.get('TI', 'Unknown'), 
                            record.get('JT', 'Unknown'), 
                            ', '.join(record.get('PT', ['Unknown']))
                        )
                        # Check if paper has missing fields
                        if 'Unknown' not in paper:
                            papers.append(paper)  # Add to the list of complete papers
                        else:
                            notpapers.append(record)  # Incomplete record
                    except KeyError as e:
                        print(f"Missing field {e} in record with PMID {record.get('PMID', 'N/A')}")
                        notpapers.append(record)
                        continue

                success = True

            except Exception as e:
                retries += 1
                print(f"Error fetching chunk {i + 1}: {e}. Retrying {retries}/{max_retries}...")
                time.sleep(2)  # Wait for 2 seconds before retrying

        if not success:
            print(f"Failed to fetch chunk {i + 1} after {max_retries} attempts. Skipping...")

    print(f"Total fetched records: {total_fetched}")
    print(f"Total complete papers: {len(papers)}")
    print(f"Total papers with missing fields: {len(notpapers)}")

    # Save only new, complete papers to the database
    save_new_papers(papers)

    return papers, notpapers

    

def save_papers(papers):
    """
    Save our papers to an sqlite database
    """
    conn = sqlite3.connect('papers.db')
    c = conn.cursor()
    
    # Drop table if already exists
    c.execute("DROP TABLE IF EXISTS papers")
    
    # Create table
    c.execute('''CREATE TABLE papers
             (pmid, author, author_affiliation, date, title, journal, pub_type)''')

    # Insert a rows of data
    c.executemany('INSERT INTO papers VALUES (?,?,?,?,?,?,?)', papers)

    # Save (commit) the changes
    conn.commit()

    # We can also close the connection if we are done with it.
    # Just be sure any changes have been committed or they will be lost.
    conn.close()
    
def df_from_papers_database():
    """
    Load our papers
    """
    conn = sqlite3.connect('papers.db')
    df = pd.read_sql_query('SELECT * FROM papers', conn)
    conn.close()  # Close the connection
    return df

def fetch_papers(term):
    """
    Fetch our papers
    """
    print("{} records to fetch".format(get_count(term)))
    pmid_chunks = chunked_pmids(term, 500)
    papers, notpapers = getpapers(pmid_chunks)
    
    return papers, notpapers

def results(df):
    """
    Make some results
    """
    print(df['journal'].unique())
    print('\n')
    
    print(df['author_affiliation'].str.contains('|'.join(list_of_lmics)).value_counts())
    print('contains lmic\n')

    print(df['author_affiliation'].str.contains('|'.join(list_of_lmics)).value_counts(normalize=True))
    print('contains lmic\n')

"""
# deprecated geocoding experiments, if did pursue this would start with named entity recognition
# we wanna know which author affiliations are LMIC countries
# initial tack, let's geocode then use country boundaries.... 

# Import the geocoding tool
from geopandas.tools import geocode

# Geocode addresses using Nominatim. Remember to provide a custom "application name" in the user_agent parameter!
geo = geocode(df['author_affiliation'][0][-10:], provider='nominatim', user_agent='drcjar_geotimes', timeout=4)

geocode("Imperial College London", provider='nominatim', user_agent='drcjar_geotimes', timeout=4).values

geocode("From the American College of Occupational and Environmental Medicine, ElkGrove, Illinois.", provider='nominatim', user_agent='drcjar_geotimes', timeout=4)

# our datas author_affiliation field is not readily geocoded
# it gets pretty hacky fast e.g https://pypi.org/project/pubmed-author-affiliation/ chopping strings based on 
# prescence of 'university' or 'institution' to make geocodeable
# more sensible approach is to search for LMIC names in strings
"""

'\n# deprecated geocoding experiments, if did pursue this would start with named entity recognition\n# we wanna know which author affiliations are LMIC countries\n# initial tack, let\'s geocode then use country boundaries.... \n\n# Import the geocoding tool\nfrom geopandas.tools import geocode\n\n# Geocode addresses using Nominatim. Remember to provide a custom "application name" in the user_agent parameter!\ngeo = geocode(df[\'author_affiliation\'][0][-10:], provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4)\n\ngeocode("Imperial College London", provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4).values\n\ngeocode("From the American College of Occupational and Environmental Medicine, ElkGrove, Illinois.", provider=\'nominatim\', user_agent=\'drcjar_geotimes\', timeout=4)\n\n# our datas author_affiliation field is not readily geocoded\n# it gets pretty hacky fast e.g https://pypi.org/project/pubmed-author-affiliation/ chopping strings based on \n# prescen

In [3]:
# make a dict of journal papers retrieved using our pubmed paper fetching function and save them
# we currently save them to a sqllite database which is overkill for small number of results; could use 
# .csv instead but might be helpful for scaling or web apps
# uncommment to run; commented out so we don't download afresh each run

# target occ lung dis journals


target_journals = ["journal of occupational and environmental medicine [Journal]", 
                   "occupational medicine [Journal]",
                   "the american journal of industrial medicine [journal]",
                   "occupational environmental medicine [journal]"]
print(target_journals)
print("\n")


papers = {}
notpapers = {}

"""
for journal in target_journals:
    print("fetching {}\n".format(journal))
    papers[journal], notpapers[journal] = fetch_papers(journal)
"""

['journal of occupational and environmental medicine [Journal]', 'occupational medicine [Journal]', 'the american journal of industrial medicine [journal]', 'occupational environmental medicine [journal]']




'\nfor journal in target_journals:\n    print("fetching {}\n".format(journal))\n    papers[journal], notpapers[journal] = fetch_papers(journal)\n'

In [4]:
journal = "journal of occupational and environmental medicine [Journal]"
papers[journal], notpapers[journal] = fetch_papers(journal)

6352 records to fetch
13 blocks to process, total papers count: 6352
Processing block 1/13
Processing block 2/13
Processing block 3/13
Processing block 4/13
Processing block 5/13
Processing block 6/13
Processing block 7/13
Processing block 8/13
Processing block 9/13
Processing block 10/13
Processing block 11/13
Processing block 12/13
Processing block 13/13
Fetching MEDLINE records:


  0%|                                                                                                                                                                               | 0/13 [00:00<?, ?it/s]

Fetching chunk 1/13 (attempt 1)


  8%|████████████▊                                                                                                                                                          | 1/13 [00:02<00:33,  2.78s/it]

Fetched 500 records for chunk 1/13
Fetching chunk 2/13 (attempt 1)


 15%|█████████████████████████▋                                                                                                                                             | 2/13 [00:05<00:29,  2.66s/it]

Fetched 500 records for chunk 2/13
Fetching chunk 3/13 (attempt 1)


 23%|██████████████████████████████████████▌                                                                                                                                | 3/13 [00:08<00:26,  2.67s/it]

Fetched 500 records for chunk 3/13
Fetching chunk 4/13 (attempt 1)


 31%|███████████████████████████████████████████████████▍                                                                                                                   | 4/13 [00:10<00:23,  2.59s/it]

Fetched 500 records for chunk 4/13
Fetching chunk 5/13 (attempt 1)


 38%|████████████████████████████████████████████████████████████████▏                                                                                                      | 5/13 [00:13<00:21,  2.72s/it]

Fetched 500 records for chunk 5/13
Fetching chunk 6/13 (attempt 1)


 46%|█████████████████████████████████████████████████████████████████████████████                                                                                          | 6/13 [00:15<00:18,  2.61s/it]

Fetched 500 records for chunk 6/13
Fetching chunk 7/13 (attempt 1)


 54%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                             | 7/13 [00:18<00:14,  2.48s/it]

Fetched 500 records for chunk 7/13
Fetching chunk 8/13 (attempt 1)


 62%|██████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                | 8/13 [00:20<00:11,  2.38s/it]

Fetched 500 records for chunk 8/13
Fetching chunk 9/13 (attempt 1)


 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                   | 9/13 [00:22<00:09,  2.47s/it]

Fetched 500 records for chunk 9/13
Fetching chunk 10/13 (attempt 1)


 77%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 10/13 [00:25<00:07,  2.52s/it]

Fetched 500 records for chunk 10/13
Fetching chunk 11/13 (attempt 1)


 85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                         | 11/13 [00:27<00:04,  2.48s/it]

Fetched 500 records for chunk 11/13
Fetching chunk 12/13 (attempt 1)


 92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 12/13 [00:30<00:02,  2.43s/it]

Fetched 500 records for chunk 12/13
Fetching chunk 13/13 (attempt 1)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:32<00:00,  2.47s/it]

Fetched 352 records for chunk 13/13
Total fetched records: 6352
Total complete papers: 5792
Total papers with missing fields: 560
297 new papers added to the database.





In [5]:
journal =  "occupational medicine [Journal]"
papers[journal], notpapers[journal] = fetch_papers(journal)

4922 records to fetch
10 blocks to process, total papers count: 4922
Processing block 1/10
Processing block 2/10
Processing block 3/10
Processing block 4/10
Processing block 5/10
Processing block 6/10
Processing block 7/10
Processing block 8/10
Processing block 9/10
Processing block 10/10
Fetching MEDLINE records:


  0%|                                                                                                                                                                               | 0/10 [00:00<?, ?it/s]

Fetching chunk 1/10 (attempt 1)


 10%|████████████████▋                                                                                                                                                      | 1/10 [00:03<00:31,  3.47s/it]

Fetched 500 records for chunk 1/10
Fetching chunk 2/10 (attempt 1)


 20%|█████████████████████████████████▍                                                                                                                                     | 2/10 [00:06<00:25,  3.18s/it]

Fetched 500 records for chunk 2/10
Fetching chunk 3/10 (attempt 1)


 30%|██████████████████████████████████████████████████                                                                                                                     | 3/10 [00:08<00:18,  2.69s/it]

Fetched 500 records for chunk 3/10
Fetching chunk 4/10 (attempt 1)


 40%|██████████████████████████████████████████████████████████████████▊                                                                                                    | 4/10 [00:10<00:14,  2.50s/it]

Fetched 500 records for chunk 4/10
Fetching chunk 5/10 (attempt 1)


 50%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 5/10 [00:13<00:12,  2.52s/it]

Fetched 500 records for chunk 5/10
Fetching chunk 6/10 (attempt 1)


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 6/10 [00:16<00:11,  2.78s/it]

Fetched 500 records for chunk 6/10
Fetching chunk 7/10 (attempt 1)


 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 7/10 [00:19<00:07,  2.67s/it]

Fetched 500 records for chunk 7/10
Fetching chunk 8/10 (attempt 1)


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 8/10 [00:21<00:05,  2.73s/it]

Fetched 500 records for chunk 8/10
Fetching chunk 9/10 (attempt 1)


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 9/10 [00:24<00:02,  2.63s/it]

Fetched 500 records for chunk 9/10
Fetching chunk 10/10 (attempt 1)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.61s/it]

Fetched 422 records for chunk 10/10
Total fetched records: 4922
Total complete papers: 3881
Total papers with missing fields: 1041
65 new papers added to the database.





In [6]:
journal = "the american journal of industrial medicine [journal]"
papers[journal], notpapers[journal] = fetch_papers(journal)

5949 records to fetch
12 blocks to process, total papers count: 5949
Processing block 1/12
Processing block 2/12
Processing block 3/12
Processing block 4/12
Processing block 5/12
Processing block 6/12
Processing block 7/12
Processing block 8/12
Processing block 9/12
Processing block 10/12
Processing block 11/12
Processing block 12/12
Fetching MEDLINE records:


  0%|                                                                                                                                                                               | 0/12 [00:00<?, ?it/s]

Fetching chunk 1/12 (attempt 1)


  8%|█████████████▉                                                                                                                                                         | 1/12 [00:02<00:27,  2.51s/it]

Fetched 500 records for chunk 1/12
Fetching chunk 2/12 (attempt 1)


 17%|███████████████████████████▊                                                                                                                                           | 2/12 [00:05<00:25,  2.51s/it]

Fetched 500 records for chunk 2/12
Fetching chunk 3/12 (attempt 1)


 25%|█████████████████████████████████████████▊                                                                                                                             | 3/12 [00:07<00:22,  2.49s/it]

Fetched 500 records for chunk 3/12
Fetching chunk 4/12 (attempt 1)


 33%|███████████████████████████████████████████████████████▋                                                                                                               | 4/12 [00:09<00:19,  2.40s/it]

Fetched 500 records for chunk 4/12
Fetching chunk 5/12 (attempt 1)


 42%|█████████████████████████████████████████████████████████████████████▌                                                                                                 | 5/12 [00:12<00:17,  2.45s/it]

Fetched 500 records for chunk 5/12
Fetching chunk 6/12 (attempt 1)


 50%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 6/12 [00:14<00:14,  2.39s/it]

Fetched 500 records for chunk 6/12
Fetching chunk 7/12 (attempt 1)


 58%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                     | 7/12 [00:16<00:11,  2.31s/it]

Fetched 500 records for chunk 7/12
Fetching chunk 8/12 (attempt 1)


 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                       | 8/12 [00:18<00:08,  2.22s/it]

Fetched 500 records for chunk 8/12
Fetching chunk 9/12 (attempt 1)


 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 9/12 [00:21<00:06,  2.28s/it]

Fetched 500 records for chunk 9/12
Fetching chunk 10/12 (attempt 1)


 83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                           | 10/12 [00:23<00:04,  2.19s/it]

Fetched 500 records for chunk 10/12
Fetching chunk 11/12 (attempt 1)


 92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏             | 11/12 [00:25<00:02,  2.28s/it]

Fetched 500 records for chunk 11/12
Fetching chunk 12/12 (attempt 1)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:27<00:00,  2.29s/it]

Fetched 449 records for chunk 12/12
Total fetched records: 5949
Total complete papers: 5130
Total papers with missing fields: 819
68 new papers added to the database.





In [7]:
journal =  "occupational environmental medicine [journal]"
papers[journal], notpapers[journal] = fetch_papers(journal)

4807 records to fetch
10 blocks to process, total papers count: 4807
Processing block 1/10
Processing block 2/10
Processing block 3/10
Processing block 4/10
Processing block 5/10
Processing block 6/10
Processing block 7/10
Processing block 8/10
Processing block 9/10
Processing block 10/10
Fetching MEDLINE records:


  0%|                                                                                                                                                                               | 0/10 [00:00<?, ?it/s]

Fetching chunk 1/10 (attempt 1)


 10%|████████████████▋                                                                                                                                                      | 1/10 [00:03<00:31,  3.50s/it]

Fetched 500 records for chunk 1/10
Fetching chunk 2/10 (attempt 1)


 20%|█████████████████████████████████▍                                                                                                                                     | 2/10 [00:06<00:25,  3.22s/it]

Fetched 500 records for chunk 2/10
Fetching chunk 3/10 (attempt 1)


 30%|██████████████████████████████████████████████████                                                                                                                     | 3/10 [00:09<00:21,  3.08s/it]

Fetched 500 records for chunk 3/10
Fetching chunk 4/10 (attempt 1)


 40%|██████████████████████████████████████████████████████████████████▊                                                                                                    | 4/10 [00:11<00:17,  2.86s/it]

Fetched 500 records for chunk 4/10
Fetching chunk 5/10 (attempt 1)


 50%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 5/10 [00:15<00:14,  2.96s/it]

Fetched 500 records for chunk 5/10
Fetching chunk 6/10 (attempt 1)


 60%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 6/10 [00:18<00:11,  2.99s/it]

Fetched 500 records for chunk 6/10
Fetching chunk 7/10 (attempt 1)


 70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 7/10 [00:20<00:08,  2.92s/it]

Fetched 500 records for chunk 7/10
Fetching chunk 8/10 (attempt 1)


 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 8/10 [00:23<00:05,  2.84s/it]

Fetched 500 records for chunk 8/10
Fetching chunk 9/10 (attempt 1)


 90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                | 9/10 [00:26<00:02,  2.91s/it]

Fetched 500 records for chunk 9/10
Fetching chunk 10/10 (attempt 1)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.86s/it]

Fetched 307 records for chunk 10/10
Total fetched records: 4807
Total complete papers: 4245
Total papers with missing fields: 562
61 new papers added to the database.





In [8]:
"""
example of 'not a paper' for our purposes this is a pubmed record that does not have all the fields 
of interest for our analysis. 

this is why N of records in pubmed search is not equal to N of records in our analysis.  

we consider records that have the following fields: AU, AD, DP, TI, JT, and PT. These correspond to 
author, affiliation, date of publication, title, journal title, and publication type.

                                 
see https://www.nlm.nih.gov/bsd/mms/medlineelements.html for more information.
"""

# notpapers['journal of occupational and environmental medicine [Journal]'][9]

"\nexample of 'not a paper' for our purposes this is a pubmed record that does not have all the fields \nof interest for our analysis. \n\nthis is why N of records in pubmed search is not equal to N of records in our analysis.  \n\nwe consider records that have the following fields: AU, AD, DP, TI, JT, and PT. These correspond to \nauthor, affiliation, date of publication, title, journal title, and publication type.\n\n                                 \nsee https://www.nlm.nih.gov/bsd/mms/medlineelements.html for more information.\n"

In [9]:
# load our papers database as a dataframe
df = df_from_papers_database()

In [10]:
len(df) #16577 #16266 #16092 #16189 16734 #15370 #16260 #15796 #15921 #16346 #18702(total 21578 records)

19938

In [11]:
df.pmid.nunique() #18522

18895

In [12]:
df = df.drop_duplicates() # we've duplicates in database at present

In [13]:
len(df[df.journal ==  'American journal of industrial medicine'])

5024

In [14]:
len(notpapers['the american journal of industrial medicine [journal]']) 

819

In [15]:
len(df[df.journal ==  'American journal of industrial medicine']) + len(notpapers['the american journal of industrial medicine [journal]']) 

5843

In [16]:
# seems like a reasonable source of countries
countries = pd.read_excel('https://datacatalogfiles.worldbank.org/ddh-published/0037712/DR0090755/CLASS.xlsx')
countries = countries.head(218) # junk we don't need after this row
countries = countries[['Economy', 'Code', 'Income group']] # lose columns we don't need
countries['Economy'] = countries['Economy'].str.replace("(","") # it's best not to have brackets for later regex 
countries['Economy'] = countries['Economy'].str.replace(")","")

In [17]:
countries['Income group'].unique()

array(['Low income', 'Upper middle income', 'High income',
       'Lower middle income', nan], dtype=object)

In [20]:
list_of_hics = countries[countries['Income group'] == 'High income'].dropna().Economy.to_list()

In [23]:
# low and middle == 'Low income' or 'Lower middle income'
list_of_lmics = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income") | (countries['Income group'] == "Upper middle income"))].dropna().Economy.to_list()

In [24]:
list_of_lmics_codes = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income") | (countries['Income group'] == "Upper middle income") )].dropna().Code.to_list()

In [25]:
list_of_lmics

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Azerbaijan',
 'Bangladesh',
 'Belarus',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Dem. Rep.',
 'Congo, Rep.',
 'Costa Rica',
 'Côte d’Ivoire',
 'Cuba',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt, Arab Rep.',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Gabon',
 'Gambia, The',
 'Georgia',
 'Ghana',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Indonesia',
 'Iran, Islamic Rep.',
 'Iraq',
 'Jamaica',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 "Korea, Dem. People's Rep.",
 'Kosovo',
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Madagascar',
 'Malawi',
 'Malaysi

In [26]:
# v probably don't need to do this / doesn't help
list_of_lmics = [i.split(',')[0] for i in list_of_lmics] # simplify names (i.e throw away sting after ',')

In [27]:
# this is list we're using for matching
# obviously it'll go wrong for korea
list_of_lmics

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Azerbaijan',
 'Bangladesh',
 'Belarus',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo',
 'Costa Rica',
 'Côte d’Ivoire',
 'Cuba',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Gabon',
 'Gambia',
 'Georgia',
 'Ghana',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Jamaica',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Korea',
 'Kosovo',
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Marshall Islands',
 'Mauritania',
 'Mau

In [28]:
# none of these relate to north korea so we can remove 'korea' as an lmic
df[df['author_affiliation'].str.contains('Korea')]['author_affiliation'].values

array(['Department of Preventive Medicine, Dongguk University College of Medicine, Gyeongju-si, South Korea., Department of Preventive Medicine, Dongguk University College of Medicine, Gyeongju-si, South Korea., Department of Preventive Medicine, College of Medicine and Medical Research Institute, Chungbuk National University, Cheongju-si, South Korea.',
       'School of Public Health, Seoul National University, Seoul, Korea.',
       'Department of Preventive Medicine, Korea University College of Medicine, Seoul, South Korea., Department of Epidemiology and Health Informatics, Graduate School of Public Health, Korea University, Seoul, South Korea., Department of Preventive Medicine, Korea University College of Medicine, Seoul, South Korea., Department of Epidemiology and Health Informatics, Graduate School of Public Health, Korea University, Seoul, South Korea., Department of Preventive Medicine, Korea University College of Medicine, Seoul, South Korea., Department of Epidemiology an

In [29]:
list_of_lmics.remove('Korea')

In [30]:
list_of_lmics

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Azerbaijan',
 'Bangladesh',
 'Belarus',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Burkina Faso',
 'Burundi',
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo',
 'Costa Rica',
 'Côte d’Ivoire',
 'Cuba',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Gabon',
 'Gambia',
 'Georgia',
 'Ghana',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Jamaica',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kosovo',
 'Kyrgyz Republic',
 'Lao PDR',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Marshall Islands',
 'Mauritania',
 'Mauritius',
 

In [31]:
df['journal'].unique()

array(['Journal of occupational and environmental medicine',
       'American journal of industrial medicine',
       'Occupational and environmental medicine',
       'Occupational medicine (Oxford, England)',
       'Occupational medicine (Philadelphia, Pa.)'], dtype=object)

In [32]:
# lets call OM Pa OM Ox.... 
df = df.replace('Occupational medicine (Philadelphia, Pa.)', 'Occupational medicine (Oxford, England)' )
journals = df['journal'].unique()

In [33]:
# all the data
results(df)

['Journal of occupational and environmental medicine'
 'American journal of industrial medicine'
 'Occupational and environmental medicine'
 'Occupational medicine (Oxford, England)']


author_affiliation
False    17425
True      1470
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.922202
True     0.077798
Name: proportion, dtype: float64
contains lmic



In [34]:
# pre 2000
results(df[df['date'] < "2000"])

['American journal of industrial medicine'
 'Journal of occupational and environmental medicine'
 'Occupational and environmental medicine'
 'Occupational medicine (Oxford, England)']


author_affiliation
False    4271
True      180
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.95956
True     0.04044
Name: proportion, dtype: float64
contains lmic



In [35]:
# pre 2000 by journal
[results(df[(df['date'] < "2000") & (df['journal'] == journal)]) for journal in journals]

['Journal of occupational and environmental medicine']


author_affiliation
False    553
True      12
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.978761
True     0.021239
Name: proportion, dtype: float64
contains lmic

['American journal of industrial medicine']


author_affiliation
False    1849
True       95
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.951132
True     0.048868
Name: proportion, dtype: float64
contains lmic

['Occupational and environmental medicine']


author_affiliation
False    776
True      35
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.956843
True     0.043157
Name: proportion, dtype: float64
contains lmic

['Occupational medicine (Oxford, England)']


author_affiliation
False    1093
True       38
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.966401
True     0.033599
Name: proportion, dtype: float64
contains lmic



[None, None, None, None]

In [36]:
# after 2000 
results(df[(df['date'] >= "2000")])

['Journal of occupational and environmental medicine'
 'Occupational and environmental medicine'
 'American journal of industrial medicine'
 'Occupational medicine (Oxford, England)']


author_affiliation
False    13154
True      1290
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.91069
True     0.08931
Name: proportion, dtype: float64
contains lmic



In [37]:
# after 2000 by journal
[results(df[(df['date'] >= "2000") & (df['journal'] == journal)]) for journal in journals]

['Journal of occupational and environmental medicine']


author_affiliation
False    4635
True      544
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.89496
True     0.10504
Name: proportion, dtype: float64
contains lmic

['American journal of industrial medicine']


author_affiliation
False    2787
True      293
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.90487
True     0.09513
Name: proportion, dtype: float64
contains lmic

['Occupational and environmental medicine']


author_affiliation
False    3149
True      289
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.915939
True     0.084061
Name: proportion, dtype: float64
contains lmic

['Occupational medicine (Oxford, England)']


author_affiliation
False    2583
True      164
Name: count, dtype: int64
contains lmic

author_affiliation
False    0.940299
True     0.059701
Name: proportion, dtype: float64
contains lmic



[None, None, None, None]

In [38]:
# we use this data to make our Choropleth
lmic_dict_papers = {}
for lmic in list_of_lmics:
    print(lmic,len(df) - df.author_affiliation.str.contains(lmic).value_counts()[0])
    lmic_dict_papers[lmic] = len(df) - df.author_affiliation.str.contains(lmic).value_counts()[0]

Afghanistan 0
Albania 0
Algeria 2
Angola 0
Argentina 12
Armenia 1
Azerbaijan 0
Bangladesh 7
Belarus 4
Belize 0
Benin 4
Bhutan 2
Bolivia 0
Bosnia and Herzegovina 2
Botswana 4
Brazil 161
Burkina Faso 0
Burundi 0
Cabo Verde 0
Cambodia 0
Cameroon 0
Central African Republic 0
Chad 0
China 398
Colombia 23
Comoros 0
Congo 2
Congo 2
Costa Rica 18
Côte d’Ivoire 0
Cuba 1
Djibouti 0
Dominica 2
Dominican Republic 2
Ecuador 5
Egypt 17
El Salvador 5
Equatorial Guinea 0
Eritrea 0
Eswatini 0
Ethiopia 9
Fiji 0
Gabon 0
Gambia 0
Georgia 232
Ghana 6
Grenada 1
Guatemala 5
Guinea 0
Guinea-Bissau 0
Haiti 0
Honduras 1
India 117
Indonesia 13
Iran 36
Iraq 3
Jamaica 4
Jordan 7
Kazakhstan 7
Kenya 2
Kiribati 0
Kosovo 0
Kyrgyz Republic 0
Lao PDR 0
Lebanon 21
Lesotho 1
Liberia 0
Libya 0
Madagascar 0
Malawi 1
Malaysia 20
Maldives 0
Mali 8
Marshall Islands 0
Mauritania 0
Mauritius 0
Mexico 74
Micronesia 0
Moldova 1
Mongolia 2
Montenegro 3
Morocco 1
Mozambique 0
Myanmar 0
Namibia 0
Nepal 3
Nicaragua 8
Niger 14
Nigeria 

In [39]:
lmic_dict_papers

{'Afghanistan': 0,
 'Albania': 0,
 'Algeria': 2,
 'Angola': 0,
 'Argentina': 12,
 'Armenia': 1,
 'Azerbaijan': 0,
 'Bangladesh': 7,
 'Belarus': 4,
 'Belize': 0,
 'Benin': 4,
 'Bhutan': 2,
 'Bolivia': 0,
 'Bosnia and Herzegovina': 2,
 'Botswana': 4,
 'Brazil': 161,
 'Burkina Faso': 0,
 'Burundi': 0,
 'Cabo Verde': 0,
 'Cambodia': 0,
 'Cameroon': 0,
 'Central African Republic': 0,
 'Chad': 0,
 'China': 398,
 'Colombia': 23,
 'Comoros': 0,
 'Congo': 2,
 'Costa Rica': 18,
 'Côte d’Ivoire': 0,
 'Cuba': 1,
 'Djibouti': 0,
 'Dominica': 2,
 'Dominican Republic': 2,
 'Ecuador': 5,
 'Egypt': 17,
 'El Salvador': 5,
 'Equatorial Guinea': 0,
 'Eritrea': 0,
 'Eswatini': 0,
 'Ethiopia': 9,
 'Fiji': 0,
 'Gabon': 0,
 'Gambia': 0,
 'Georgia': 232,
 'Ghana': 6,
 'Grenada': 1,
 'Guatemala': 5,
 'Guinea': 0,
 'Guinea-Bissau': 0,
 'Haiti': 0,
 'Honduras': 1,
 'India': 117,
 'Indonesia': 13,
 'Iran': 36,
 'Iraq': 3,
 'Jamaica': 4,
 'Jordan': 7,
 'Kazakhstan': 7,
 'Kenya': 2,
 'Kiribati': 0,
 'Kosovo': 0,
 'K

In [40]:
# Countries, counts, and the pmids for fun
# Loop through the data and update the dictionary based on LMIC countries found
lmic_stats = {}

for idx, row in df.iterrows():
    pmid = row['pmid']
    affiliation = str(row['author_affiliation'])
    for country in list_of_lmics:
        if country in affiliation:
            if country not in lmic_stats:
                # Initialize the country entry if it doesn't exist
                lmic_stats[country] = {'Count': 0, 'PMIDs': []}
            # Update the count and pmids    
            lmic_stats[country]['Count'] += 1
            lmic_stats[country]['PMIDs'].append(pmid)

# Convert the dictionary to a DataFrame for easier display
lmic_df = pd.DataFrame([(country, info['Count'], ", ".join(map(str, info['PMIDs']))) for country, info in lmic_stats.items()],
                       columns=['Country', 'Count', 'PMIDs'])

In [41]:
lmic_df

Unnamed: 0,Country,Count,PMIDs
0,Brazil,161,"19233830, 11307685, 31090672, 34157150, 316251..."
1,South Africa,159,"29574825, 33298754, 19943319, 29210092, 131114..."
2,Ethiopia,9,"29419423, 16140842, 37875370, 21919031, 975094..."
3,Serbia,9,"31587045, 16421395, 36977359, 16627545, 362552..."
4,Georgia,232,"33484179, 27501104, 38272665, 35244089, 289902..."
...,...,...,...
64,Sierra Leone,1,36257667
65,Bosnia and Herzegovina,2,"30657988, 37984917"
66,Lesotho,1,29211908
67,Moldova,1,37984917


In [42]:
lmic_df.to_csv('lmic_df.csv', index=False)

In [43]:
# Papers, lmic counts, and lmics for fun
# Function to count the LMIC countries in a given affiliation string
def count_lmic_countries(affiliation, list_of_lmics):
    count = 0
    for country in list_of_lmics:
        if country in affiliation:
            count += 1
    return count

# Apply the function to the affiliation field
df['LMIC_Count'] = df['author_affiliation'].apply(lambda x: count_lmic_countries(str(x), list_of_lmics))

def list_lmic_countries(affiliation, list_of_lmics):
    countries_found = [country for country in list_of_lmics if country in affiliation]
    return ", ".join(countries_found)

# Apply the function to create a new column with the LMIC country names
df['LMIC_Countries'] = df['author_affiliation'].apply(lambda x: list_lmic_countries(str(x), list_of_lmics))


In [44]:
df[df['LMIC_Count']>0]['LMIC_Count'].value_counts() # as in 225 papers have only a single LMIC, 1 has 4 etc

LMIC_Count
1     1394
2       58
3        9
4        6
5        2
10       1
Name: count, dtype: int64

In [45]:
df[df['LMIC_Count']>3]

Unnamed: 0,pmid,author,author_affiliation,date,title,journal,pub_type,LMIC_Count,LMIC_Countries
2735,32079717,"Coggon D, Ntani G, Walker-Bone K, Felli VE, Ha...",Medical Research Council Lifecourse Epidemiolo...,2020 May,Associations of sickness absence for pain in t...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't",10,"Brazil, Colombia, Costa Rica, Ecuador, Georgia..."
3432,27566784,"Garcia-Trabanino R, Jakobsson K, Guzman Quilo ...","Centro de Hemodialisis, San Salvador, El Salva...",2016 Nov,"In reply to: ""Should we consider renaming 'Mes...",Occupational and environmental medicine,"Comment, Letter",5,"Costa Rica, El Salvador, Guatemala, Mexico, Ni..."
11958,33769403,"Kalia N, Moraga JA, Manzanares M, Friede V, Ku...","Department of Medicine, Johns Hopkins School o...",2021 Apr 1,Use of Vinegar and Water to Identify COVID-19 ...,Journal of occupational and environmental medi...,Journal Article,4,"Brazil, Costa Rica, Mexico, Nicaragua"
13193,25919593,"Juarez-Garcia A, Vera-Calzaretta A, Blanco-Gom...","Universidad Autonoma del Estado de Morelos, Mo...",2015 Jun,Validity of the effort/reward imbalance questi...,American journal of industrial medicine,"Journal Article, Research Support, Non-U.S. Go...",4,"Argentina, Colombia, Mexico, Peru"
16083,28598943,"Joob B, Wiwanitkit V","Sanitation Medical Academic Center, Bangkok, T...",2017 Jun,Mosquito Control Practices and Zika Knowledge ...,Journal of occupational and environmental medi...,"Comment, Journal Article",4,"India, Niger, Nigeria, Thailand"
16389,36257667,"Atwoli L, Erhabor GE, Gbakima AA, Haileamlak A...","Editor-in-Chief, East African Medical Journal....",2023 Feb 14,COP27 Climate Change Conference: urgent action...,"Occupational medicine (Oxford, England)",Journal Article,4,"Ethiopia, Ghana, Mali, Sierra Leone"
16575,29211908,"Moyo D, Zungu M, Erick P, Tumoyagae T, Mwansa ...",Baines Occupational and Travel Medicine Centre...,2017 Dec 2,Occupational health and safety in the Southern...,"Occupational medicine (Oxford, England)",Journal Article,5,"Botswana, Lesotho, South Africa, Zambia, Zimbabwe"
19050,37984917,"Nys E, Pauwels S, Adam B, Amaro J, Athanasiou ...",External Service for Prevention and Protection...,2023 Nov 23,Recognition of COVID-19 with occupational orig...,Occupational and environmental medicine,Journal Article,4,"Bosnia and Herzegovina, Moldova, North Macedon..."
19534,40063851,"Stufano A, Omokhodion FO, Moyo D, de Maria L, ...","From the Section of Occupational Medicine, Int...",2025 Jun 1,Occupational Risk Management in the Context of...,Journal of occupational and environmental medi...,Journal Article,4,"Niger, Nigeria, South Africa, Zimbabwe"


In [46]:
# e.g
df[df.author_affiliation.str.contains('Pakistan')]

Unnamed: 0,pmid,author,author_affiliation,date,title,journal,pub_type,LMIC_Count,LMIC_Countries
1416,32719018,"Khisroon M, Humayun M, Khan A, Farooqi J, Huma...","Department of Zoology, University of Peshawar,...",2020 Nov,Polymorphism in GSTM1 and GSTT1 genes influenc...,Occupational and environmental medicine,Journal Article,1,Pakistan
1557,32890224,"Khisroon M, Khan A, Hassan N, Zaidi F, Farooqi J","Department of Zoology, University of Peshawar,...",2020 Sep,Biomonitoring of DNA Damage in Photocopiers' W...,Journal of occupational and environmental medi...,"Journal Article, Research Support, Non-U.S. Gov't",1,Pakistan
2082,26185127,"Awais M, Hafeez S, Rehman A, Baloch NU","Department of Radiology, Aga Khan University H...",2015 Sep,Vibration-induced multifocal carpal osteonecro...,Occupational and environmental medicine,"Case Reports, Letter",1,Pakistan
2234,33021515,"Adnan S, Hanif M, Khan AH, Latif M, Ullah K, B...","Pakistan Meteorological Department (Dr Adnan, ...",2021 Feb 1,Impact of Heat Index and Ultraviolet Index on ...,Journal of occupational and environmental medi...,"Journal Article, Multicenter Study",1,Pakistan
2735,32079717,"Coggon D, Ntani G, Walker-Bone K, Felli VE, Ha...",Medical Research Council Lifecourse Epidemiolo...,2020 May,Associations of sickness absence for pain in t...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't",10,"Brazil, Colombia, Costa Rica, Ecuador, Georgia..."
3385,33234873,"Kumar D, Saghir T, Ali G, Yasin U, Furnaz S, K...",National Institute of Cardiovascular Diseases ...,2021 Feb 1,Psychosocial Impact of COVID-19 on Healthcare ...,Journal of occupational and environmental medi...,Journal Article,1,Pakistan
3695,36717255,"Nafees AA, Muneer MZ, Irfan M, Kadir MM, Sempl...","Department of Community Health Sciences, Aga K...",2023 Mar,Byssinosis and lung health among cotton textil...,Occupational and environmental medicine,"Journal Article, Research Support, Non-U.S. Gov't",1,Pakistan
4917,26265670,"Khan AW, Kundi M, Moshammer H","Institute for Environmental Health, Center for...",2015 Oct,Diminished pulmonary function in long-term wor...,Occupational and environmental medicine,"Comparative Study, Journal Article, Research S...",1,Pakistan
6063,14534448,"Meo SA, Azeem MA, Subhan MM","Department of Physiology, Hamdard College of M...",2003 Oct,Lung function in Pakistani welding workers.,Journal of occupational and environmental medi...,"Comparative Study, Journal Article",1,Pakistan
7699,35672913,"Rabbani U, Razzaq S, Irfan M, Semple S, Nafees AA","From the Family Medicine Academy, Qassim Healt...",2022 Sep 1,Indoor Air Pollution and Respiratory Health in...,Journal of occupational and environmental medi...,"Journal Article, Research Support, Non-U.S. Gov't",1,Pakistan


In [53]:
df_lmic_count = pd.DataFrame([lmic_dict_papers]).melt()
df_lmic_count.columns = ['Economy','N']

# use dataframe ealier 
countriesdf = countries[((countries['Income group'] == "Low income") | (countries['Income group'] == "Lower middle income") | (countries['Income group'] == "Upper middle income"))]
countriesdf.loc[:,'Economy'] = countriesdf['Economy'].str.split(",").str[0] #simplify names for consistency

gdf = pd.merge(countriesdf.dropna(), df_lmic_count)

In [54]:
# replace North Korea for the map
korea = countriesdf[countriesdf.Economy == 'Korea']
korea = korea.assign(N=[0])
gdf = pd.concat([gdf, korea])

In [55]:
# gdf = gdf[gdf['N'] > 0] # lets not include LMICS with zero papers

In [56]:
print('Number of papers where the author affiliation includes one or more instances of the country name. Low and lower middle income countries, as defined by the world bank, shown for four occupational lung disease journals (Journal of occupational and environmental medicine, Occupational medicine, American journal of industrial medicine, Occupational and environmental medicine) accessed via PubMed 1987-2024')

Number of papers where the author affiliation includes one or more instances of the country name. Low and lower middle income countries, as defined by the world bank, shown for four occupational lung disease journals (Journal of occupational and environmental medicine, Occupational medicine, American journal of industrial medicine, Occupational and environmental medicine) accessed via PubMed 1987-2024


In [57]:
# colorcale choice https://plotly.com/python/builtin-colorscales/

fig = go.Figure(data=go.Choropleth(
    locations = gdf['Code'],
    z = gdf['N'],
    text = gdf['Economy'],
    colorscale = 'Viridis',
    autocolorscale=False,
    reversescale=True,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title = 'N of papers per LMIC'))

fig.show()

In [58]:
df.to_csv('main_df.csv', index=False)