In [60]:
# !pip install -U kaleido
# !pip install --upgrade plotly

### Import Libraries

In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
from geopy.geocoders import Nominatim
from time import sleep

### Import data

In [2]:
# Location of the CSV that is loaded into `df`
PAPERS_CSV = 'D:/Daniels/exercises/ubuntu/phd_work/data/pprl_papers_data_extraction.csv'

df = pd.read_csv(PAPERS_CSV)
df.head()

Unnamed: 0,search_no,study_title,link,geographic_region,state_country,city,evaluation,date_done,year,topic,...,More explanation on AIMS,Sections,Authors,Skimming_through_scanning,fulltext,mendeley_zotero,citations,Authors_x,Databases,author_affiliation
0,828144128180,,https://doi.org/,,,,,,,,...,,,,,,,,,,
1,828144128646,Protecting the Privacy of Cancer Patients Usin...,https://doi.org/10.31557/APJCP.2019.20.5.1437,None;,,,,8/24/2025,2019.0,blocking based techniques; sensitive rules;,...,,,Krishnamoorthy S,,,,Asian Pac J Cancer Prev. 2019 May 25;20(5):143...,Krishnamoorthy S,Asian Pac J Cancer Prev,
2,828144128141,Identifying cerebral palsy from routinely-coll...,https://doi.org/10.2147/CLEP.S200748,Europe;,England; Wales;,England; Wales;,general-info,8/24/2025,2019.0,prevalence; Prevalence estimates; cerebral pal...,...,,,,,,,,,,
3,828144128853,Using national laboratory data to assess cumul...,https://doi.org/10.1002/jia2.25326,Africa;,South Africa,"Durban, South Africa;",general-info,8/24/2025,2019.0,PEPFAR-funded HIV clinic; government-funded cl...,...,,,,,,,,,,
4,828144128282,Retrospective Analysis of Cancer Care Performa...,https://doi.org/10.31557/APJCP.2019.20.6.1701,Asia;,Malaysia;,,general-info,8/24/2025,2019.0,nasopharyngeal carcinoma (NPC); Subang Jaya Me...,...,,,,,,,,,,


In [3]:
# Dimensions of the loaded table as (rows, columns)
df.shape

(541, 88)

### Split locations

In [64]:
# Drop the trailing ';' and split each 'state_country' cell into a list of places
df['countries_list'] = df['state_country'].str.rstrip(';').str.split(';')

# One row per (paper, place) pair
df_exploded = df.explode('countries_list')

# Normalise each place name: underscores become spaces, surrounding blanks removed.
# regex=False makes it explicit that '_' is a literal character, not a pattern.
df_exploded['countries_clean'] = (
    df_exploded['countries_list']
    .str.replace('_', ' ', regex=False)
    .str.strip()
)

# Keep only rows that actually name a place. The explicit .copy() makes the result
# an independent frame, so adding columns to it later (lat/lon, size) does not
# raise SettingWithCopyWarning.
has_place = df_exploded['countries_clean'].notna() & df_exploded['countries_clean'].ne('')
df_exploded = df_exploded.loc[has_place].copy()


In [65]:
# Number of distinct cleaned place names
df_exploded['countries_clean'].nunique()

72

### Geocode locations (slow: rate-limited network requests)

In [None]:
# Nominatim client that geocode_place sends its lookups through
geolocator = Nominatim(
    user_agent="geoapi",
    timeout=10,
)

def geocode_place(place, geocoder=None, delay=1):
    """Look up the latitude and longitude of a place name.

    Parameters
    ----------
    place : str
        Free-text location to geocode (for example a country name).
    geocoder : object, optional
        Any object exposing ``geocode(query)`` that returns either ``None`` or
        an object with ``latitude`` and ``longitude`` attributes. Defaults to
        the module-level ``geolocator``.
    delay : float, optional
        Seconds to pause after each lookup so that consecutive calls do not
        exceed the service's rate limit. Defaults to 1.

    Returns
    -------
    pandas.Series
        ``[latitude, longitude]`` on success, or ``[None, None]`` when the
        place is blank, nothing was found, or the lookup raised an error
        (the error is printed, not re-raised).
    """
    # Nothing to look up: skip the network round trip entirely.
    if place is None or (isinstance(place, str) and not place.strip()):
        return pd.Series([None, None])

    if geocoder is None:
        geocoder = geolocator

    try:
        loc = geocoder.geocode(place)
    except Exception as e:
        print(f"Error geocoding {place}: {e}")
        return pd.Series([None, None])
    finally:
        # `sleep` is what the notebook imports (`from time import sleep`); the
        # previous `time.sleep(1)` raised NameError, which the except clause
        # swallowed, so every lookup came back empty. Pausing here also spaces
        # out requests after a failed lookup.
        sleep(delay)

    if loc:
        return pd.Series([loc.latitude, loc.longitude])
    return pd.Series([None, None])


# Geocode each distinct place once instead of once per row: every lookup is a
# rate-limited network request, and the same place name appears on many rows.
unique_places = df_exploded['countries_clean'].dropna().unique()
coords = pd.DataFrame(
    [geocode_place(place).tolist() for place in unique_places],
    index=unique_places,
    columns=['lat', 'lon'],
)

# Broadcast the per-place coordinates back onto every row
df_exploded['lat'] = df_exploded['countries_clean'].map(coords['lat'])
df_exploded['lon'] = df_exploded['countries_clean'].map(coords['lon'])


In [67]:
import os

import plotly.express as px

# One marker per row of df_exploded, each labelled with its place name
fig = px.scatter_geo(
    df_exploded,
    lat='lat',
    lon='lon',
    text='countries_clean',
    projection="natural earth",
    title="World Map with Mixed and Multiple Locations",
)

# write_html does not create missing folders, so make sure the target exists
os.makedirs("maps", exist_ok=True)
fig.write_html("maps/map_output.html")

fig.show()


KeyboardInterrupt: 

### Map 2 - place names on hover only (size not factored)

In [None]:
import os

import plotly.express as px

# Same markers as the first map, but the place name is shown on hover only
# instead of being drawn as a text label next to each marker.
fig = px.scatter_geo(
    df_exploded,
    lat='lat',
    lon='lon',
    hover_name='countries_clean',
    projection="natural earth",
    title="World Map showing PPRL Locations",
)

# Base-map styling: no country borders, coastlines drawn, land filled in grey
fig.update_geos(
    showcountries=False,    # do not draw country boundary lines
    showcoastlines=True,
    showland=True,
    landcolor="lightgray",
)

# Trim the margins, leaving room at the top for the title
fig.update_layout(
    margin={"r": 0, "t": 30, "l": 0, "b": 0}
)

# Save map (write_html does not create missing folders)
os.makedirs("maps", exist_ok=True)
fig.write_html("maps/map_output_ii.html")

# Show map
fig.show()


### Map 3 - with size as a factor

In [None]:
# Number of rows per location
size_df = (
    df_exploded['countries_clean']
    .value_counts()
    .rename_axis('countries_clean')
    .reset_index(name='size')
)

# Attach the count to every row. Any 'size' column left over from a previous
# run is dropped first; otherwise re-running this cell would produce
# 'size_x'/'size_y' and no plain 'size' column.
df_exploded = (
    df_exploded
    .drop(columns='size', errors='ignore')
    .merge(size_df, on='countries_clean')
)
df_exploded.head()

In [None]:
import os

import plotly.express as px

fig = px.scatter_geo(
    df_exploded,
    lat='lat',
    lon='lon',
    hover_name='countries_clean',   # shown on hover only
    color='countries_clean',        # one colour per distinct place name
    size='size',                    # bubble area scales with the 'size' column
    projection="natural earth",
    title="World Map with Colored and Scaled Bubbles by Region"
)

# Base-map styling: no country borders, grey land, grey coastlines
fig.update_geos(
    showcountries=False,    # do not draw country boundary lines
    showland=True,
    landcolor="lightgray",
    showcoastlines=True,
    coastlinecolor="gray"
)

# Legend title and tight margins (room left at the top for the title)
fig.update_layout(
    legend_title_text='Region',
    margin={"r": 0, "t": 30, "l": 0, "b": 0}
)

# Save and show (write_html does not create missing folders)
os.makedirs("maps", exist_ok=True)
# fig.write_image("maps/pprl_map_colored_iii.png", scale=2)
fig.write_html("maps/pprl_map_colored_iii.html")
fig.show()

In [None]:
# import kaleido
# print("Kaleido is working!")

### Top 12 countries

In [None]:
# Twelve locations with the highest counts, largest first
top_12_countries = size_df.sort_values(by='size', ascending=False).iloc[:12]

# CSS applied to the rendered table, one entry per selector
caption_css = {
    'selector': 'caption',
    'props': [
        ('color', '#2a3f5f'),
        ('font-size', '18px'),
        ('font-weight', 'bold'),
        ('text-align', 'center'),
    ],
}
header_css = {
    'selector': 'th',
    'props': [
        ('background-color', '#f2f2f2'),
        ('font-size', '14px'),
        ('text-align', 'center'),
    ],
}
first_cell_css = {
    'selector': 'td:nth-child(1)',
    'props': [('text-align', 'left'), ('min-width', '200px')],
}
second_cell_css = {
    'selector': 'td:nth-child(2)',
    'props': [('text-align', 'center'), ('min-width', '100px')],
}

# Captioned table without the index column; counts use thousands separators
styled_table = (
    top_12_countries.style
    .set_caption("Top 12 Countries with Most Record Linkage Work")
    .hide(axis="index")
    .set_table_styles([caption_css, header_css, first_cell_css, second_cell_css])
    .format({'size': '{:,}'})
)

styled_table

### Export styled table

In [None]:
import os

# open() does not create missing folders, so make sure the target exists
os.makedirs("maps", exist_ok=True)

# Write as UTF-8 explicitly: the platform default encoding (for example cp1252
# on Windows) cannot represent every character a place name may contain.
with open("maps/top_12_countries.html", "w", encoding="utf-8") as f:
    f.write(styled_table.to_html())

### End of the maps