This dataset comes from Kaggle ("A Quick Overview of Clinical Trials", file `AERO-BirdsEye-Data.csv`): https://www.kaggle.com/datasets/thedevastator/a-quick-overview-of-clinical-trials?select=AERO-BirdsEye-Data.csv

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
import plotly.io as pio
# Make "notebook_connected" the default renderer for every Plotly figure shown below.
# NOTE(review): this renderer is understood to load plotly.js from a CDN, so
# figures may not render without network access — confirm for offline use.
pio.renderers.default = "notebook_connected"

# Preliminary Data Processing

In [3]:
# Load the clinical-trial records. The path is relative, so the CSV has to be
# in the notebook's working directory.
trials = pd.read_csv("AERO-BirdsEye-Data.csv")

In [4]:
trials.head()


Unnamed: 0,index,NCT,Sponsor,Title,Summary,Start_Year,Start_Month,Phase,Enrollment,Status,Condition
0,0,NCT00003305,Sanofi,A Phase II Trial of Aminopterin in Adults and ...,RATIONALE: Drugs used in chemotherapy use diff...,1997,7,Phase 2,75,Completed,Leukemia
1,1,NCT00003821,Sanofi,Phase II Trial of Aminopterin in Patients With...,RATIONALE: Drugs used in chemotherapy use diff...,1998,1,Phase 2,0,Withdrawn,Endometrial Neoplasms
2,2,NCT00004025,Sanofi,"Phase I/II Trial of the Safety, Immunogenicity...",RATIONALE: Vaccines made from a person's white...,1999,3,Phase 1/Phase 2,36,Unknown status,Melanoma
3,3,NCT00005645,Sanofi,Phase II Trial of ILX295501 Administered Orall...,RATIONALE: Drugs used in chemotherapy use diff...,1999,5,Phase 2,0,Withdrawn,Ovarian Neoplasms
4,4,NCT00008281,Sanofi,"A Multicenter, Open-Label, Randomized, Three-A...",RATIONALE: Drugs used in chemotherapy use diff...,2000,10,Phase 3,0,Unknown status,Colorectal Neoplasms


In [5]:
# (rows, columns) of the loaded frame
trials.shape

(13748, 11)

In [6]:
trials.describe(include=[np.number])

Unnamed: 0,index,Start_Year,Start_Month,Enrollment
count,13748.0,13748.0,13748.0,13748.0
mean,6873.5,2009.155586,6.691155,440.783678
std,3968.850085,4.797615,3.486359,1944.530768
min,0.0,1984.0,1.0,0.0
25%,3436.75,2006.0,4.0,40.0
50%,6873.5,2009.0,7.0,124.0
75%,10310.25,2013.0,10.0,365.0
max,13747.0,2020.0,12.0,84496.0


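Calling `describe()` on the numeric columns returns some values that are not meaningful (for example, summary statistics for the `index` column, which is just a row identifier). So in the next cell I used more specific parameters to summarize the text and categorical columns instead.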

In [7]:
trials.describe(include=['object', 'category'])

Unnamed: 0,NCT,Sponsor,Title,Summary,Phase,Status,Condition
count,13748,13748,13604,13748,13485,13748,13748
unique,13748,10,13434,13565,7,9,867
top,NCT00135564,GSK,Human Photoallergy Test,#NAME?,Phase 3,Completed,"Diabetes Mellitus, Type 2"
freq,1,2473,7,11,4887,10568,536


In [8]:
trials.dtypes

index           int64
NCT            object
Sponsor        object
Title          object
Summary        object
Start_Year      int64
Start_Month     int64
Phase          object
Enrollment      int64
Status         object
Condition      object
dtype: object

In [9]:
trials.isnull().sum()

index            0
NCT              0
Sponsor          0
Title          144
Summary          0
Start_Year       0
Start_Month      0
Phase          263
Enrollment       0
Status           0
Condition        0
dtype: int64

In [10]:
trials.isnull().mean() * 100

index          0.000000
NCT            0.000000
Sponsor        0.000000
Title          1.047425
Summary        0.000000
Start_Year     0.000000
Start_Month    0.000000
Phase          1.913006
Enrollment     0.000000
Status         0.000000
Condition      0.000000
dtype: float64

In [16]:
# Collect the trials that have no title so they can be inspected before the
# missing values are filled in.
null_titles = trials.loc[trials["Title"].isna()]
print(null_titles)

       index          NCT Sponsor Title  \
43        43  NCT00067093  Sanofi   NaN   
96        96  NCT00131300  Sanofi   NaN   
97        97  NCT00131326  Sanofi   NaN   
99        99  NCT00131768  Sanofi   NaN   
109      109  NCT00140608  Sanofi   NaN   
...      ...          ...     ...   ...   
13454  13454  NCT02168530   Roche   NaN   
13483  13483  NCT02402712   Roche   NaN   
13611  13611  NCT02824055   Roche   NaN   
13632  13632  NCT02952911   Roche   NaN   
13633  13633  NCT02952924   Roche   NaN   

                                                 Summary  Start_Year  \
43     Patients who have deep vein thrombosis (blood ...        2003   
96     This clinical study is to evaluate the safety ...        2004   
97     This is a clinical study to investigate the sa...        2003   
99     This clinical study is to evaluate the safety ...        2003   
109    The purpose of this study is to investigate if...        2003   
...                                                

In [12]:
# Fill only the text columns that actually have missing values (Title and
# Phase, per the null counts above). A blanket fillna("Unknown") would also
# write the string into any numeric column that ever contained a NaN and
# silently turn it into an object column.
trials = trials.fillna({"Title": "Unknown", "Phase": "Unknown"})

Missing values are replaced with "Unknown" so that trials with a missing title or phase stay in the analysis as an explicit category, rather than misrepresenting the data.

In [13]:
trials.isnull().sum()

index          0
NCT            0
Sponsor        0
Title          0
Summary        0
Start_Year     0
Start_Month    0
Phase          0
Enrollment     0
Status         0
Condition      0
dtype: int64

# Data Analysis

In [9]:
# Number of trials per sponsor. groupby orders the sponsors alphabetically, and
# reset_index(name=...) produces the columns "Sponsor" and "values" directly,
# so neither an empty placeholder frame nor a follow-up rename is needed.
trials_summary = trials.groupby("Sponsor").size().reset_index(name="values")

In [12]:
# Assign each sponsor a fixed Prism color (sponsors taken in alphabetical order)
# so the same mapping can be reused to keep sponsor colors consistent.
color_key = px.colors.qualitative.Prism[0:10]
color_mapping = dict(zip(sorted(trials_summary["Sponsor"].unique()), color_key))

fig1 = px.pie(
    data_frame=trials_summary,
    values="values",
    names="Sponsor",
    # Color by sponsor through the explicit mapping; it was previously built
    # but never passed to the figure.
    color="Sponsor",
    color_discrete_map=color_mapping,
    # The bold tag is now closed with </b>; it used to end with a second <b>.
    title="<b>Share of Sponsors for All Trials</b>",
    width=600,
    height=600,
)

# Each slice is labelled with sponsor name and percentage, so the legend is
# hidden because it would only repeat those labels.
fig1.update_traces(textposition="inside", textinfo="label+percent")
fig1.update_layout(
    showlegend=False,
    title_x=0.5,
    title_y=0.9,
    font=dict(size=14),
)

fig1.show()

### Chi Squared Test

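We will use the chi-squared test of independence to see whether there is a relationship between two categorical variables. First, we will look at the relationship between Sponsor and Phase, to see whether the mix of trial phases differs between sponsors (for example, whether certain sponsors are more likely to have trials in a higher phase). A second question to explore is whether Condition is related to the Status of the trial.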

In [14]:
import pandas as pd
from scipy.stats import chi2_contingency

In [15]:
# Observed trial counts: one row per sponsor, one column per phase.
contingency = pd.crosstab(index=trials["Sponsor"], columns=trials["Phase"])
print(contingency)

Phase     Early Phase 1  Phase 1  Phase 1/Phase 2  Phase 2  Phase 2/Phase 3  \
Sponsor                                                                       
AbbVie                0       80               13      130                4   
Bayer                 0      104               17      168                6   
GSK                   0      723               12      684                6   
Gilead                0       71               18      152                6   
JNJ                   9      177               18      242                9   
Merck                 0      364               28      386               15   
Novartis              0      264               96      658               43   
Pfizer                1      398               37      486               24   
Roche                 0      162               29      302                9   
Sanofi                0      173               54      388               17   

Phase     Phase 3  Phase 4  Unknown  
Sponsor      

In [29]:
# Run the chi-squared test on the Sponsor x Phase table; besides the statistic,
# p-value and degrees of freedom it returns the expected counts as a bare array.
chi2, p, dof, expected = chi2_contingency(contingency)

# Put the sponsor and phase labels back on the expected counts so they can be
# read side by side with the observed table.
expected_df = pd.DataFrame(
    data=expected,
    index=contingency.index,
    columns=contingency.columns,
)

print("Expected counts:", expected_df, sep="\n")

Expected counts:
Phase     Early Phase 1     Phase 1  Phase 1/Phase 2     Phase 2  \
Sponsor                                                            
AbbVie         0.303317   76.314518         9.766802  109.072738   
Bayer          0.450247  113.282223        14.497963  161.908932   
GSK            1.798807  452.579866        57.921589  646.851033   
Gilead         0.303317   76.314518         9.766802  109.072738   
JNJ            0.831394  209.178644        26.770876  298.969159   
Merck          1.287460  323.924935        41.456212  462.970614   
Novartis       1.687518  424.579575        54.338086  606.831539   
Pfizer         1.432936  360.526622        46.140530  515.283678   
Roche          0.796479  200.394239        25.646640  286.414024   
Sanofi         1.108525  278.904859        35.694501  398.625546   

Phase     Phase 2/Phase 3     Phase 3     Phase 4    Unknown  
Sponsor                                                       
AbbVie           4.216104  148.230943   

In [18]:
# Report the test statistic, its degrees of freedom and the p-value.
for label, value in (
    ("Chi2 Statistic:", chi2),
    ("Degrees of Freedom:", dof),
    ("p-value:", p),
):
    print(label, value)

Chi2 Statistic: 982.1362690716578
Degrees of Freedom: 63
p-value: 4.697192094436259e-165


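The p-value of about $4.7 \times 10^{-165}$ is far below any conventional significance level (e.g. 0.05), so we reject the null hypothesis that sponsor and phase are independent: there is a statistically significant association between sponsor and phase. We should also check for sponsors that have expected counts < 1 (or < 5), because the chi-squared test is less reliable when expected counts are that small.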

In [28]:

# For every sponsor (row), the percentage of its expected counts that fall
# below 1 and below 5.
n_phases = expected_df.shape[1]
percent_below_1 = expected_df.lt(1).sum(axis=1) / n_phases * 100
percent_below_5 = expected_df.lt(5).sum(axis=1) / n_phases * 100

# Keep only the sponsors where more than 20% of the expected counts are that small.
flag_below_1 = percent_below_1.loc[percent_below_1.gt(20)]
flag_below_5 = percent_below_5.loc[percent_below_5.gt(20)]

print("⚠️ Sponsors with >20% of expected counts < 1:")
print(flag_below_1)

print("\n⚠️ Sponsors with >20% of expected counts < 5:")
print(flag_below_5)


⚠️ Sponsors with >20% of expected counts < 1:
Series([], dtype: float64)

⚠️ Sponsors with >20% of expected counts < 5:
Sponsor
AbbVie    25.0
Gilead    25.0
dtype: float64


For AbbVie and Gilead, 25% of the expected counts (2 of the 8 phase categories) are below 5, which is above the 20% threshold used here, so the chi-squared test might be less reliable for these sponsors.

### Sponsors by phase

In [69]:
import plotly.express as px
import pandas as pd  # Ensure pandas is imported for groupby


# One fixed color per phase label, so a phase keeps its color in the chart.
phase_colors = {
    "Early Phase 1": "#005AB5",
    "Phase 1": "#2CA02C",
    "Phase 1/Phase 2": "#1AC6FF",
    "Phase 2": "#FF8000",
    "Phase 2/Phase 3": "#8A2BE2",
    "Phase 3": "#E31A1C",
    "Phase 4": "#A65628",
    "Unknown": "#666666",
}

# Number of trials for every (sponsor, phase) combination present in the data.
sponsor_phase_sizes = trials.groupby(["Sponsor", "Phase"]).size()
trials_agg = sponsor_phase_sizes.reset_index(name="Count")

# Grouped bars: one cluster per sponsor, one bar per phase.
fig = px.bar(
    data_frame=trials_agg,
    x="Sponsor",
    y="Count",
    color="Phase",
    barmode="group",
    color_discrete_map=phase_colors,
    title="Number of Trials per Sponsor by Phase",
)

# White background with light horizontal grid lines only.
fig.update_layout(
    plot_bgcolor="white",
    paper_bgcolor="white",
    xaxis_showgrid=False,
    yaxis_showgrid=True,
    yaxis_gridcolor="lightgray",
    font_size=14,
    font_color="black",
    legend_title="Phase",
)

fig.show()

### Conditions Studied

In [56]:
import pandas as pd
import plotly.express as px

# Count the number of trials per condition, most frequent first. Naming the
# index and the count column explicitly yields the columns "Condition" and
# "Count" without relying on the positional layout of reset_index().
cond_summary = (
    trials["Condition"]
    .value_counts()
    .rename_axis("Condition")
    .reset_index(name="Count")
)

# Keep only the 20 most frequent conditions (matches the chart title below).
cond_top20 = cond_summary.head(20)

# Bar chart: one bar per condition, each in its own color.
fig = px.bar(
    cond_top20,
    x="Condition",
    y="Count",
    title="Top 20 Conditions by Number of Trials",
    color="Condition",
    color_discrete_sequence=px.colors.qualitative.Set2
)

# The x-axis already names every condition, so the legend is hidden.
fig.update_layout(
    xaxis_title="Condition",
    yaxis_title="Number of Trials",
    showlegend=False
)

fig.show()
