# Data Analytics Project to Understand the Research Landscape of Semaglutide, a Weight-Loss Drug, Using Publicly Available Data

This project uses data from pubmed.gov and clinicaltrials.gov to study the research landscape of semaglutide, a diabetes medication and a popular weight-loss drug. The data covers 2023 and earlier. Some of the key questions this analytics project seeks to answer: Who are the key research institutions working on semaglutide around the world? Which centers are driving both clinical trials and peer-reviewed research on this medication? Which countries have the most research and clinical trials in this space?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import plotly.express as px
import folium
import fuzzy_pandas as fpd




ModuleNotFoundError: No module named 'fuzzy_pandas'

## Load and preprocess data: drop duplicates and strip leading/trailing spaces

In [3]:

# Load the country lookup table (sheet "codes") and the publication records
# (sheet "World Pubs") from the Excel workbooks under data/.
countries = pd.read_excel("data/Semaglutide_pubmed_country.xlsx", sheet_name="codes", engine="openpyxl")
df = pd.read_excel("data/World_semaglutide_Brooke_18May2023_v1.xlsx", sheet_name="World Pubs", engine="openpyxl")

# Keep only the columns used by the rest of the analysis.
df = df[["PMID", "FullName", "Centers", "FinalCountry", "COIS", "COIS Novo", "JT", "AT", "PT", "PubDate"]]

# Strip leading/trailing spaces from every string cell BEFORE de-duplicating,
# so that "Center X " and "Center X" are recognised as the same center.
df = df.replace(r"^ +| +$", r"", regex=True)

# One row per (publication, center) pair.
df = df.drop_duplicates(subset=["PMID", "Centers"])

# Drop rows whose center is the literal placeholder "none". The explicit
# .copy() makes df an independent frame, so the column assignments below and
# in later cells do not raise SettingWithCopyWarning.
df = df[df["Centers"] != "none"].copy()

# Countries are handled as strings downstream (note: missing values become
# the string "nan").
df["FinalCountry"] = df["FinalCountry"].astype(str)



## Classify Publication types

In [None]:

# Keywords that identify each publication category. They are matched as
# case-insensitive substrings of the "PT" column.
guidelines = ['Guideline', 'Congress', "Consensus"]
trials = ["Trial", "Research", "Study"]

# Boolean masks, one per category; rows with a missing PT value match neither.
is_guideline = df['PT'].str.contains('|'.join(guidelines), case=False, na=False)
is_trial = df['PT'].str.contains('|'.join(trials), case=False, na=False)

# Guideline keywords take precedence over trial keywords; anything that
# matches neither list is labelled a plain journal article.
trial_or_article = np.where(is_trial, 'Trial', 'Journal Article')
df["PubType"] = np.where(is_guideline, 'Guideline', trial_or_article)



## Fuzzy matching to match countries with publications

In [None]:

# Number of rows in df per country, as a two-column frame
# ["FinalCountry", "count"].
# rename_axis(...).reset_index(name=...) yields these column names on both
# pandas 1.x and 2.x. The previous rename({"index": ..., "FinalCountry": ...})
# only worked on pandas < 2.0: on 2.x reset_index() already produces
# "FinalCountry"/"count", so the rename turned "FinalCountry" into a second
# "count" column and the merge key disappeared.
df_country_counts = (
    df["FinalCountry"]
    .value_counts()
    .rename_axis("FinalCountry")
    .reset_index(name="count")
)

# Fuzzy-match the country names against the lookup table's "name" column.
# df2: per-country counts joined to the lookup table (looser 0.8 threshold).
# df3: every publication row joined to the lookup table (stricter 0.9 threshold).
df2 = fpd.fuzzy_merge(df_country_counts, countries, left_on=["FinalCountry"], right_on=["name"], threshold=0.8, method="levenshtein")
df3 = fpd.fuzzy_merge(df, countries, left_on=["FinalCountry"], right_on=["name"], threshold=0.9, method="levenshtein")

# Remove duplicate (author, center, publication) rows from the merged frame.
df3 = df3.drop_duplicates(subset=["FullName", "Centers", "PMID"])

# One row per (country, center) with one count column per publication type.
df4 = df3.groupby(["FinalCountry", "Centers", "PubType"]).size().unstack('PubType', fill_value=0).reset_index()



### The United States, United Kingdom and China are the major countries involved in Semaglutide Research

In [None]:
# Top countries based on publications
pub_type_cols = ["Journal Article", "Trial", "Guideline"]

# df4 has one row per (country, center), so first add up the publication-type
# counts per country. Selecting only the count columns keeps the string
# column "Centers" out of the sum; reindex guarantees all three columns exist
# (filled with 0) even if a publication type is absent from the data.
present_cols = [col for col in pub_type_cols if col in df4.columns]
country_totals = (
    df4.groupby("FinalCountry")[present_cols]
    .sum()
    .reindex(columns=pub_type_cols, fill_value=0)
)

# Rank countries by their total number of publications (previously the
# ranking used the number of df4 rows, i.e. the number of centers).
top_countries = country_totals.sum(axis=1).nlargest(10).index
df_bar = country_totals.loc[top_countries].reset_index()

fig = px.bar(df_bar, x="FinalCountry", y=pub_type_cols,
             title="Top Countries by Publication Type", barmode="group")
fig.show()

### Distribution of publication types: journal articles and trials make up most of the publications

In [None]:
# Distribution of publication type: number of rows in df per PubType value,
# as a two-column frame ["Publication Type", "Count"].
pub_counts = (
    df["PubType"]
    .value_counts()
    .rename_axis("Publication Type")
    .reset_index(name="Count")
)

# hole=0.4 turns the pie into a donut chart.
fig = px.pie(
    pub_counts,
    names="Publication Type",
    values="Count",
    hole=0.4,
    title="Overall Distribution of Publication Types",
)
fig.show()


In [None]:
# The 15 most frequent FinalCountry values in df, with their row counts, as a
# two-column frame ["Country", "Publications"].
country_frequencies = df["FinalCountry"].value_counts()
top_countries = (
    country_frequencies.iloc[:15]
    .rename_axis("Country")
    .reset_index(name="Publications")
)

fig = px.bar(
    top_countries,
    x="Country",
    y="Publications",
    title="Top 15 Countries by Semaglutide Publications",
)
fig.show()

### Sharp increase in semaglutide publications from 2016 to 2022. The data ends midway through 2023, so the apparent drop in 2023 reflects a partial year rather than a real decrease in publications.

In [None]:
# Publication year; dates that cannot be parsed become NaT and therefore get
# a missing year.
df["PubYear"] = pd.to_datetime(df["PubDate"], errors='coerce').dt.year

# df holds one row per (publication, center), so count DISTINCT PMIDs per
# year; a plain row count would count multi-center papers several times.
# groupby drops rows whose year is missing.
yearly_trend = (
    df.groupby("PubYear")["PMID"]
    .nunique()
    .rename_axis("Year")
    .reset_index(name="Publications")
)
# The year is float whenever any date failed to parse; after grouping no
# missing years remain, so it can be shown as a whole number on the axis.
yearly_trend["Year"] = yearly_trend["Year"].astype(int)

fig = px.line(yearly_trend, x="Year", y="Publications",
              markers=True, title="Publications Over Time")
fig.show()


### Interactive Map that shows countries and publications. The bubble size is indicative of number of publications. Click on the bubbles to see the centers in each country as well as counts of publications. 

In [None]:
from html import escape

# World map with one circle per country in df2. Circle radius scales with the
# country's "count"; clicking a circle opens a popup listing up to five
# centers and the number of rows per publication type for that country.
n = folium.Map(location=[20, 0], tiles="cartodb positron", zoom_start=2)

for _, country_row in df2.iterrows():
    country = country_row['name']
    lat = country_row['latitude']
    lon = country_row['longitude']
    color = country_row['bins']
    count = country_row['count']

    # Filter df3 once per country and derive both popup sections from it.
    country_rows = df3[df3["name"] == country]
    center_names = country_rows["Centers"].unique()[:5]
    # Named differently from the `pub_counts` DataFrame built for the donut
    # chart so that running this cell does not overwrite it.
    country_pub_counts = country_rows["PubType"].value_counts().to_dict()

    # Escape the data-derived text so characters such as "&" or "<" in a
    # center or country name cannot break the popup markup.
    center_items = ''.join(f"<li>{escape(str(center))}</li>" for center in center_names)

    popup_html = f"""
    <div style='font-family: Arial; font-size: 12px'>
        <h4 style='color:#466964'>{escape(str(country))}</h4>
        <p><b>Centers:</b></p>
        <ul>
            {center_items}
        </ul>
        <p><b>Publications:</b></p>
        <ul>
            <li>Journal Articles: {country_pub_counts.get('Journal Article', 0)}</li>
            <li>Trials: {country_pub_counts.get('Trial', 0)}</li>
            <li>Guidelines: {country_pub_counts.get('Guideline', 0)}</li>
        </ul>
    </div>
    """

    iframe = folium.IFrame(html=popup_html, width=300, height=250)
    popup = folium.Popup(iframe, max_width=300)

    folium.CircleMarker(
        location=[lat, lon],
        popup=popup,
        radius=max(3, count * 0.035),  # never smaller than 3 so low-count countries stay visible
        weight=0,  # no border
        fill=True,
        fill_opacity=0.7,
        fill_color=color
    ).add_to(n)

# Last expression of the cell: renders the map in the notebook.
n


### Conclusions: 
1. There has been a sharp increase in publications and trials on semaglutide since 2016. This is due to its use as a diabetes drug and, more recently, as a weight-loss drug.
2. The countries most involved in semaglutide research are the USA, UK, China and Denmark. This is based on publication and clinical trial numbers.
3. The published work comprises journal articles (54%), clinical trial publications (45%) and guidelines (0.3%).