In [None]:
# analysis.ipynb

# Immuno-oncology clinical trials analysis, combining data from
# https://clinicaltrials.gov/ and https://eudract.ema.europa.eu/

import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from rapidfuzz import process

In [None]:
# ClinicalTrials export dated 05/07/2025 (dd/mm/yyyy).
# Read it into df1 and print the first rows as a quick sanity check.
df1 = pd.read_csv(filepath_or_buffer="ctg-studies.csv")
print(df1.head(n=5))

In [None]:
# EudraCT export dated 05/07/2025 (dd/mm/yyyy).
# NOTE(review): the file name says "ctis" while this comment says EudraCT -
# confirm which registry the export actually comes from.
# Read it into df2 and print the first rows as a quick sanity check.
df2 = pd.read_csv(filepath_or_buffer="ctis-studies.csv")
print(df2.head(n=5))

In [None]:
# ADDING IDs
#
# df1 rows are numbered 1..len(df1); df2 rows continue right after, so an id
# never appears in both frames.

first_df2_id = len(df1) + 1
df1['id'] = range(1, first_df2_id)
df2['id'] = range(first_df2_id, first_df2_id + len(df2))


In [None]:
# LOOKING FOR DUPLICATES
#
# Both title columns are lower-cased in place, then each df2 title is
# fuzzy-matched against every df1 title. A best match that passes the score
# cutoff of 95 is printed and recorded in `duplicates` as (df2 id, df1 id).

df1['Study Title'] = df1['Study Title'].str.lower()
df2['Title of the trial'] = df2['Title of the trial'].str.lower()
duplicates = []

for i in df2.index:
    trial_id = df2.at[i, 'id']
    trial_title = df2.at[i, 'Title of the trial']

    match = process.extractOne(trial_title, df1['Study Title'], score_cutoff=95)
    if not match:
        continue

    # The third element of the match is used below as a df1 row label.
    best_title, score, pos = match
    matched_id = df1.at[pos, 'id']
    matched_title = df1.at[pos, 'Study Title']
    report_lines = [
        "\nMATCH FOUND:",
        f'  df2 ID {trial_id}  ->  "{trial_title}"',
        f'  df1 ID {matched_id}  ->  "{matched_title}"',
        f"  Similarity score: {score}",
        "---",
    ]
    print("\n".join(report_lines))
    duplicates.append((trial_id, matched_id))

# TO BE IMPROVED...

In [None]:
# MERGING DATA: FINAL DF
#
# df2 rows whose id was recorded as a duplicate are dropped; the remaining
# df2 ids are stacked under the df1 ids to seed the final frame.

# IDs
ids_df2_to_exclude = [df2_id for df2_id, _ in duplicates]
df2_clean = df2.loc[~df2['id'].isin(ids_df2_to_exclude)]
df = pd.concat(
    [df1[['id']], df2_clean[['id']]],
    ignore_index=True,
)

In [None]:
# ADDING LOCATIONS
#
# Fills df['location'] from the frame each trial id belongs to:
#   * ids present in df1: every "|"-separated site of 'Locations' contributes
#     the text after its last comma; the distinct values are sorted and joined
#     with ", ".
#   * all other ids: the text before the first ":" of df2_clean's
#     'Location(s) and recruitment status'.
# A missing source value gives None.


def _countries_from_site_list(loc_string):
    """Return the sorted, ", "-joined distinct last comma-fields of each site, or None."""
    if pd.isna(loc_string):
        return None
    # Split on "|" to get the sites, keep what follows the last comma of each.
    # NOTE(review): a country name that itself contains a comma would be cut
    # at its last comma - verify against the export.
    countries = {site.strip().split(",")[-1].strip() for site in loc_string.split("|")}
    # A blank site (e.g. produced by a trailing "|") is not a country.
    countries.discard("")
    return ", ".join(sorted(countries)) if countries else None


def _country_before_colon(loc_string):
    """Return the text before the first ":" of loc_string, or None when missing."""
    if pd.isna(loc_string):
        return None
    # NOTE(review): only the first "name:" prefix is kept; if this field can
    # list several countries the others are dropped - confirm the format.
    return loc_string.strip().split(":")[0].strip()


ids_df1 = set(df1['id'])

# Build the id -> raw text lookups once, instead of scanning the source frame
# with a boolean mask for every row of df.
_df1_locations = dict(zip(df1['id'], df1['Locations']))
_df2_locations = dict(zip(df2_clean['id'], df2_clean['Location(s) and recruitment status']))

location_values = []
for trial_id in df['id']:
    if trial_id in ids_df1:
        location_values.append(_countries_from_site_list(_df1_locations[trial_id]))
    else:
        location_values.append(_country_before_colon(_df2_locations[trial_id]))

# dtype=object keeps None as None, as the row-by-row assignment did.
df['location'] = pd.Series(location_values, index=df.index, dtype=object)

In [None]:
# ADDING MEDICAL CONDITIONS
#
# df['condition'] is the stripped 'Conditions' text for ids present in df1 and
# the stripped 'Medical conditions' text of df2_clean for every other id.
# A missing source value gives None.

ids_df1 = set(df1['id'])

# Build the id -> raw text lookups once, instead of scanning the source frame
# with a boolean mask for every row of df.
_df1_conditions = dict(zip(df1['id'], df1['Conditions']))
_df2_conditions = dict(zip(df2_clean['id'], df2_clean['Medical conditions']))

condition_values = []
for trial_id in df['id']:
    if trial_id in ids_df1:
        cond_string = _df1_conditions[trial_id]
    else:
        cond_string = _df2_conditions[trial_id]
    condition_values.append(cond_string.strip() if pd.notna(cond_string) else None)

# dtype=object keeps None as None, as the row-by-row assignment did.
df['condition'] = pd.Series(condition_values, index=df.index, dtype=object)

In [None]:
# ADDING INTERVENTIONS
#
# df['intervention'] is the stripped 'Interventions' text for ids present in
# df1 and the stripped 'Product' text of df2_clean for every other id.
# A missing source value gives None.

ids_df1 = set(df1['id'])

# Build the id -> raw text lookups once, instead of scanning the source frame
# with a boolean mask for every row of df.
_df1_interventions = dict(zip(df1['id'], df1['Interventions']))
_df2_interventions = dict(zip(df2_clean['id'], df2_clean['Product']))

intervention_values = []
for trial_id in df['id']:
    if trial_id in ids_df1:
        intv_string = _df1_interventions[trial_id]
    else:
        intv_string = _df2_interventions[trial_id]
    intervention_values.append(intv_string.strip() if pd.notna(intv_string) else None)

# dtype=object keeps None as None, as the row-by-row assignment did.
df['intervention'] = pd.Series(intervention_values, index=df.index, dtype=object)

In [None]:
# ADDING TRIAL PHASES
#
# df['phase'] holds an integer code per trial: 1, 2, 3, 12 (phases I and II)
# or 23 (phases II and III). A missing source value gives None; a value that
# is present but not recognised gives None and prints an alert.

# df1 'Phases' text (after strip + upper-case) -> phase code.
_DF1_PHASE_CODES = {
    "PHASE1": 1,
    "PHASE2": 2,
    "PHASE3": 3,
    "PHASE1|PHASE2": 12,
    "PHASE2|PHASE3": 23,
}

# Captures the whole roman numeral that follows "phase". The trailing \b is
# what keeps "phase i" from also matching inside "phase ii", "phase iii" or
# "phase iv"; a plain substring test reports those as phase I, which coded
# every df2 phase II / III trial as 12 and phase IV as 1.
_DF2_PHASE_NUMERAL = re.compile(r"\bphase\s+(iv|iii|ii|i)\b")


def _phase_from_df1(trial_id, phase_string):
    """Return the phase code for a df1 'Phases' value, or None (alerting if unrecognised)."""
    if pd.isna(phase_string):
        return None
    phase_string = phase_string.strip().upper()
    if phase_string not in _DF1_PHASE_CODES:
        print(f"[ALERTA] Valor desconegut a df1 id {trial_id}: {phase_string}")
        return None
    return _DF1_PHASE_CODES[phase_string]


def _phase_from_df2(trial_id, phase_string):
    """Return the phase code for a df2 'Trial phase' value, or None (alerting if unrecognised)."""
    if pd.isna(phase_string):
        return None
    phase_string = phase_string.strip().lower()

    numerals = set(_DF2_PHASE_NUMERAL.findall(phase_string))
    found_phase1 = "i" in numerals
    found_phase2 = "ii" in numerals
    found_phase3 = "iii" in numerals

    # Combined phases are checked first so they win over the single ones.
    if found_phase1 and found_phase2:
        return 12
    if found_phase2 and found_phase3:
        return 23
    if found_phase1:
        return 1
    if found_phase2:
        return 2
    if found_phase3:
        return 3
    print(f"[ALERTA] Valor desconegut a df2 id {trial_id}: {phase_string}")
    return None


ids_df1 = set(df1['id'])

# Build the id -> raw text lookups once, instead of scanning the source frame
# with a boolean mask for every row of df.
_df1_phase_text = dict(zip(df1['id'], df1['Phases']))
_df2_phase_text = dict(zip(df2_clean['id'], df2_clean['Trial phase']))

phase_values = []
for trial_id in df['id']:
    if trial_id in ids_df1:
        phase_values.append(_phase_from_df1(trial_id, _df1_phase_text[trial_id]))
    else:
        phase_values.append(_phase_from_df2(trial_id, _df2_phase_text[trial_id]))

# dtype=object keeps the codes as ints next to None; letting pandas infer the
# dtype would turn the column into floats with NaN.
df['phase'] = pd.Series(phase_values, index=df.index, dtype=object)


In [None]:
# ADDING SPONSORS
#
# df['sponsor_type']: stripped 'Funder Type' (ids in df1) or stripped
#   'Sponsor type' of df2_clean (other ids); None when the source is missing.
# df['sponsor']: 'Sponsor' (ids in df1) or 'Sponsor/Co-Sponsors' of df2_clean
#   (other ids), stored as-is; None when the source is missing, blank, or the
#   id has no source row.

ids_df1 = set(df1['id'])

# Build the id -> raw value lookups once, instead of scanning the source frame
# with a boolean mask for every row of df.
_df1_sponsor_types = dict(zip(df1['id'], df1['Funder Type']))
_df2_sponsor_types = dict(zip(df2_clean['id'], df2_clean['Sponsor type']))

sponsor_type_values = []
for trial_id in df['id']:
    if trial_id in ids_df1:
        sponsor_type = _df1_sponsor_types[trial_id]
    else:
        sponsor_type = _df2_sponsor_types[trial_id]
    sponsor_type_values.append(sponsor_type.strip() if pd.notna(sponsor_type) else None)

# dtype=object keeps None as None, as the row-by-row assignment did.
df['sponsor_type'] = pd.Series(sponsor_type_values, index=df.index, dtype=object)

_df1_sponsors = dict(zip(df1['id'], df1['Sponsor']))
_df2_sponsors = dict(zip(df2_clean['id'], df2_clean['Sponsor/Co-Sponsors']))

sponsor_values = []
for trial_id in df['id']:
    # .get() mirrors the tolerance for an id without a source row: it yields
    # None, which the check below turns into a None sponsor.
    if trial_id in ids_df1:
        raw_sponsor = _df1_sponsors.get(trial_id)
    else:
        raw_sponsor = _df2_sponsors.get(trial_id)

    if pd.notna(raw_sponsor) and raw_sponsor.strip() != "":
        sponsor = raw_sponsor
    else:
        sponsor = None
    sponsor_values.append(sponsor)

df['sponsor'] = pd.Series(sponsor_values, index=df.index, dtype=object)


In [None]:
# FINAL DATA SET TO CSV
# Writes df as it stands at this point, without the index column.
df.to_csv(path_or_buf="final_dataset_studies.csv", index=False)

In [None]:
# SUMMARY OF INTERVENTION FIELD

# KEYWORD MAP
# Group name -> lowercase substrings that place an intervention in that group.
# Insertion order matters: the classifier returns the first group that matches.
intervention_map = {
    "checkpoint inhibitor": [
        "pd-1", "pd-l1", "ctla-4", "nivolumab", "pembrolizumab",
        "durvalumab", "atezolizumab", "ipilimumab", "cemiplimab",
    ],
    "cancer vaccine": ["vaccine", "mrna"],
    "adoptive cell therapy": ["car-t", "tcr", "nk cell", "cell therapy"],
    "oncolytic virus": ["oncolytic virus", "herpesvirus", "adenovirus"],
    "cytokine / immunomodulator": ["cytokine", "il-2", "interleukin", "ifn", "interferon"],
    "combination product": ["combination", "combo", "integrating"],
    "radiotherapy": ["radiation", "radiotherapy", "irradiation", "ccrt"],
    "biomarker / diagnostics": ["biomarker", "diagnostic", "marker", "assay"],
    "supportive therapy": [
        "rehabilitation", "physical activity", "nutrition",
        "quality-of-life", "psychosocial",
    ],
    "chemotherapy": ["chemo", "platinum", "paclitaxel", "cisplatin", "carboplatin"],
    "other": [],
}


# FUNCTION TO CLASSIFY
def classify_intervention(text):
    """Return the first group of intervention_map with a keyword contained in text.

    Matching is a case-insensitive substring test. Missing text (NaN/None)
    gives "unknown"; text that matches no keyword gives "other".
    """
    if pd.isna(text):
        return "unknown"
    haystack = text.lower()
    matching_groups = (
        group
        for group, keywords in intervention_map.items()
        if any(keyword in haystack for keyword in keywords)
    )
    return next(matching_groups, "other")

# APPLYING TO DATAFRAME
df['intervention_group'] = df['intervention'].apply(classify_intervention)

# Text table: one row per intervention group, then the number of distinct groups.
interv_counts = df['intervention_group'].value_counts()
total_unique = len(interv_counts)

rule = "-" * 45
table_lines = [f"{'Intervention Group':30} | {'Count'}", rule]
table_lines.extend(
    f"{name[:30]:30} | {count}" for name, count in interv_counts.items()
)
table_lines.append(rule)
table_lines.append(f"{'TOTAL UNIQUE':30} | {total_unique}")
table_str = "\n".join(table_lines) + "\n"
print(table_str)

# Same counts as a two-column frame for the bar chart.
interv_summary = interv_counts.reset_index()
interv_summary.columns = ['Intervention Group', 'Count']

plt.figure(figsize=(10, 6))
sns.barplot(
    x="Intervention Group",
    y="Count",
    data=interv_summary,
    palette="Blues_d",
)
plt.title("Distribution of Clinical Trials by Intervention Group", fontsize=14, weight="bold")
plt.xlabel("")
plt.ylabel("Number of Trials")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# SUMMARY OF CONDITIONS

# KEYWORD MAP
# Group name -> lowercase substrings that place a condition in that group.
# Insertion order matters: the classifier returns the first group that matches.
condition_map = {
    "gynecologic": [
        "cervical", "vulvar", "uterine", "ovarian", "endometrial", "gynaecological",
    ],
    "genitourinary": [
        "prostatic", "prostate", "bladder", "renal", "kidney", "testicular", "penile",
    ],
    "digestive": [
        "colorectal", "colon", "rectal", "gastric", "stomach",
        "pancreatic", "esophageal", "liver", "hepatocellular", "anal",
    ],
    "lung/thoracic": ["lung", "mesothelioma", "pleural", "thoracic"],
    "breast": ["breast"],
    "hematologic": ["leukemia", "lymphoma", "myeloma", "hematologic"],
    "skin": ["melanoma", "skin"],
    "brain/CNS": ["glioblastoma", "glioma", "cns", "brain"],
    "head/neck": ["head", "neck", "oropharyngeal", "nasopharyngeal", "oral"],
    "sarcoma": ["sarcoma"],
    "multiple": [
        "advanced cancer", "advanced malignancy", "solid tumor",
        "metastatic", "locally advanced",
    ],
    "other": [],
}


# FUNCTION TO CLASSIFY
def classify_condition(text):
    """Return the first group of condition_map with a keyword contained in text.

    Matching is a case-insensitive substring test. Missing text (NaN/None)
    gives "unknown"; text that matches no keyword gives "other".
    """
    if pd.isna(text):
        return "unknown"
    lowered = text.lower()
    for group in condition_map:
        for keyword in condition_map[group]:
            if keyword in lowered:
                return group
    return "other"

# APPLYING TO DATAFRAME
df['condition_group'] = df['condition'].apply(classify_condition)

# Trials per condition group, as a two-column frame used for both the
# printout and the bar chart.
summary_table = (
    df['condition_group']
    .value_counts()
    .reset_index()
)
summary_table.columns = ["Condition Group", "Count"]
print(summary_table)

plt.figure(figsize=(10, 6))
sns.barplot(
    x="Condition Group",
    y="Count",
    data=summary_table,
    palette="Blues_d",
)
plt.title("Distribution of Clinical Trials by Condition Group", fontsize=14, weight="bold")
plt.xlabel("")
plt.ylabel("Number of Trials")
plt.xticks(rotation=30, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# SUMMARY OF SPONSOR TYPE DISTRIBUTION

# Frequency of each raw sponsor_type value, as a two-column frame.
sponsors = (
    df['sponsor_type']
    .value_counts()
    .reset_index()
)
sponsors.columns = ['sponsor_type', 'count']

def clean_sponsor_type(s):
    """Return the first comma-separated entry of s, stripped of whitespace.

    Missing values (NaN/None) are reported as "Other".
    """
    if pd.isna(s):
        return "Other"
    # Only the text before the first comma is needed.
    first, _, _ = s.partition(",")
    return first.strip()

# First listed sponsor type per trial ("Other" when the type is missing).
df['sponsor_type_clean'] = df['sponsor_type'].map(clean_sponsor_type)

def regroup_category(s):
    """Map a cleaned sponsor type string onto a coarse sponsor group.

    Returns one of "Government", "Industry", "Hospital/Clinic",
    "Patient Organisation", "Network", "Academic" or "Other" (the fallback
    for anything not listed).
    """
    if s in ("NIH", "FED", "OTHER_GOV"):
        return "Government"
    if s in ("INDUSTRY", "Pharmaceutical company"):
        return "Industry"
    # Substring rule: any type mentioning "Hospital/Clinic" lands here.
    if "Hospital/Clinic" in s:
        return "Hospital/Clinic"

    exact_rules = (
        (("Patient organisation/association",), "Patient Organisation"),
        (("NETWORK",), "Network"),
        (("Educational Institution", "Laboratory/Research/Testing facility"), "Academic"),
    )
    for members, label in exact_rules:
        if s in members:
            return label

    # "OTHER" and every unlisted value share the same bucket.
    return "Other"

# Coarse sponsor group per trial.
df['sponsor_type_grouped'] = df['sponsor_type_clean'].apply(regroup_category)

# Text table: trials per sponsor group with their share of the total.
sponsor_group_counts = df['sponsor_type_grouped'].value_counts()
total_trials = sponsor_group_counts.sum()

rule = "-" * 55
table_lines = [f"{'Sponsor Type':25} | {'Count':5} | {'% of Total'}", rule]
for name, count in sponsor_group_counts.items():
    pct = round(count / total_trials * 100, 1)
    table_lines.append(f"{name:25} | {count:<5} | {pct:.1f}%")
table_lines.append(rule)
table_lines.append(f"{'TOTAL':25} | {total_trials:<5} | 100%")
table_str = "\n".join(table_lines) + "\n"
print(table_str)

# Horizontal bar chart of the same counts.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=sponsor_group_counts.values,
    y=sponsor_group_counts.index,
    palette="Blues_d",
)
plt.title("Grouped Sponsor Types", fontsize=14, weight="bold")
plt.ylabel("")
plt.xlabel("Number of Trials")
plt.tight_layout()
plt.show()


In [None]:

# SUMMARY OF SPONSORS
#
# Text table: one row per distinct sponsor value with its trial count and
# share, followed by the overall count and the number of distinct sponsors.

sponsor_counts = df['sponsor'].value_counts()

total_unique = sponsor_counts.shape[0]   # number of distinct sponsor values
total_counts = sponsor_counts.sum()      # number of trials with a sponsor value

rule = "-" * 80
table_str = (
    f"{'Sponsor':50} | {'Count':5} | {'% of Total'}\n"
    + rule + "\n"
)

for name, count in sponsor_counts.items():
    pct = round(count / total_counts * 100, 1)
    table_str += f"{name[:50]:50} | {count:<5} | {pct:.1f}%\n"

table_str += rule + "\n"
# The summed count is the 100% row; it used to be printed under the
# 'TOTAL UNIQUE' label while the distinct-sponsor count was never shown.
table_str += f"{'TOTAL':50} | {total_counts:<5} | 100%\n"
table_str += f"{'TOTAL UNIQUE':50} | {total_unique}\n"

print(table_str)


In [None]:
# SUMMARY OF COUNTRY DISTRIBUTION
#
# Every comma-separated name in df['location'] is counted once per row it
# appears in; rows without a location are skipped.

country_list = [
    name.strip()
    for loc in df['location']
    if pd.notna(loc)
    for name in loc.split(",")
]

countries_series = pd.Series(country_list)
countries_counts = countries_series.value_counts()

print("== Countries hosting trials ==")
print(countries_counts)

# PIE CHART (3% threshold)
# Names below the threshold are merged into a single "Others" slice.
countries_pct = countries_series.value_counts(normalize=True) * 100
threshold = 3
large_countries = countries_pct[countries_pct >= threshold]
small_countries = countries_pct[countries_pct < threshold]
others = small_countries.sum()
final_countries = large_countries.copy()
if others > 0:
    final_countries["Others"] = others

# plt comes from the notebook's import cell; it is not re-imported here.
plt.figure(figsize=(8, 8))
plt.pie(
    final_countries,
    labels=final_countries.index,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops={"edgecolor": "white"},
)
plt.title("Clinical Trials by Country", fontsize=14, weight="bold")
plt.tight_layout()
plt.show()


In [None]:
# SUMMARY OF PHASES DISTRIBUTION
#
# Phase codes are translated to display labels and counted; labels with no
# trial are shown with a count of 0.

# Phase code -> label. The insertion order is also the bar order of the chart.
phase_labels = {
    1: "Phase I",
    2: "Phase II",
    3: "Phase III",
    12: "Phase I & II",
    23: "Phase II & III",
}
phase_order = list(phase_labels.values())

df['phase_named'] = df['phase'].map(phase_labels)
phase_counts = (
    df['phase_named']
    .value_counts()
    .reindex(phase_order)
    .fillna(0)
)

plt.figure(figsize=(8, 5))
sns.barplot(
    y=phase_counts.values,
    x=phase_counts.index,
    palette="Purples_d",
)

plt.title("Clinical Trial Phase Distribution", fontsize=14, weight="bold")
plt.xlabel("")
plt.ylabel("Number of Trials")
plt.tight_layout()
plt.show()
