In [1]:
import os
import pandas as pd

In [2]:
# List of ROR to exclude from results
excluded_rors = [
    "02n6c9837", # Sanofi
    "02xnj2427"  # Hôpital de Ville-Évrard
]

cle_false = "SANS communication\nidentifiée"
cle_true = "AVEC communication\nidentifiée"

LAST_YEAR = 2023

In [3]:
# Clear previous results
os.system(f"rm -Rf ./../publipostage2")
os.system(f"mkdir -p ./../publipostage2")

0

In [4]:
# Load all national data about clinical trials
url = "https://storage.gra.cloud.ovh.net/v1/AUTH_32c5d10cb0fe4519b957064a111717e3/bso_dump/bso-clinical-trials.jsonl.gz"
df = pd.read_json(url, lines=True)

In [5]:
# Mapping to normalize sponsor name and match to ROR
mapping = pd.read_csv("https://raw.githubusercontent.com/dataesr/bso-clinical-trials/main/bsoclinicaltrials/server/main/bso-lead-sponsors-mapping.csv")

In [6]:
df_interventional_completed = df[(df.study_type=="Interventional") & (df.status_simplified=="Completed")]
df_interventional_completed_academic = df_interventional_completed[df_interventional_completed.lead_sponsor_type=="academique"]
df_interventional_academic = df[(df.study_type=="Interventional") & (df.lead_sponsor_type=="academique")]
mycols = ["lead_sponsor", "study_completion_year", "NCTId", "eudraCT", "CTIS", "study_type", "status_simplified",
          "acronym", "title", "has_results_or_publications", "has_results_or_publications_within_1y",
          "has_results_or_publications_within_3y", "intervention_type"]
df_interventional_completed_academic = df_interventional_completed_academic[mycols]
df_interventional_academic = df_interventional_academic[mycols]
dd = pd.merge(df_interventional_completed_academic, mapping, left_on="lead_sponsor", right_on="sponsor", indicator=True, how="inner")
dd_ror = dd[dd.ror.apply(lambda x:isinstance(x, str) and "ror.org/" in x)]
dd_ror["ror_simple"] = dd_ror.ror.apply(lambda x:x.split("/")[-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dd_ror["ror_simple"] = dd_ror.ror.apply(lambda x:x.split("/")[-1])


In [7]:
def get_percent(df_data):
    stats = {"results_last_10_years": {}, "results_last_10_years_drug": {}, "results_within_1_year_last_10_years": {}, "results_within_1_year_last_10_years_drug": {}}
    number_of_trials = len(df_data.index)
    data_results_last_10_years = pd.DataFrame(df_data[(df_data.study_completion_year > LAST_YEAR - 10) & (df_data.study_completion_year <= LAST_YEAR)].has_results_or_publications.value_counts(dropna=False))
    data_results_last_10_years_drug = pd.DataFrame(df_data[(df_data.study_completion_year > LAST_YEAR - 10) & (df_data.study_completion_year <= LAST_YEAR) & (df_data.intervention_type == "DRUG")].has_results_or_publications.value_counts(dropna=False))
    data_results_within_1_year_last_10_years = pd.DataFrame(df_data[(df_data.study_completion_year > LAST_YEAR - 10) & (df_data.study_completion_year <= LAST_YEAR)].has_results_or_publications_within_1y.value_counts(dropna=False))
    data_results_within_1_year_last_10_years_drug = pd.DataFrame(df_data[(df_data.study_completion_year > LAST_YEAR - 10) & (df_data.study_completion_year <= LAST_YEAR) & (df_data.intervention_type == "DRUG")].has_results_or_publications_within_1y.value_counts(dropna=False))
    try:
        stats["results_last_10_years"][cle_true] = data_results_last_10_years[data_results_last_10_years.index==True].values[0][0]
    except Exception as e:
        print(e)
        stats["results_last_10_years"][cle_true] = 0
    try:
        stats["results_last_10_years"][cle_false] = data_results_last_10_years[data_results_last_10_years.index==False].values[0][0]
    except Exception as e:
        print(e)
        stats["results_last_10_years"][cle_false] = 0
    try:
        stats["results_last_10_years_drug"][cle_true] = data_results_last_10_years_drug[data_results_last_10_years_drug.index==True].values[0][0]
    except Exception as e:
        print(e)
        stats["results_last_10_years_drug"][cle_true] = 0
    try:
        stats["results_last_10_years_drug"][cle_false] = data_results_last_10_years_drug[data_results_last_10_years_drug.index==False].values[0][0]
    except Exception as e:
        print(e)
        stats["results_last_10_years_drug"][cle_false] = 0
    try:
        stats["results_within_1_year_last_10_years"][cle_true] = data_results_within_1_year_last_10_years[data_results_within_1_year_last_10_years.index==True].values[0][0]
    except Exception as e:
        print(e)
        stats["results_within_1_year_last_10_years"][cle_true] = 0
    try:
        stats["results_within_1_year_last_10_years"][cle_false] = data_results_within_1_year_last_10_years[data_results_within_1_year_last_10_years.index==False].values[0][0]
    except Exception as e:
        print(e)
        stats["results_within_1_year_last_10_years"][cle_false] = 0
    try:
        stats["results_within_1_year_last_10_years_drug"][cle_true] = data_results_within_1_year_last_10_years_drug[data_results_within_1_year_last_10_years_drug.index==True].values[0][0]
    except Exception as e:
        print(e)
        stats["results_within_1_year_last_10_years_drug"][cle_true] = 0
    try:
        stats["results_within_1_year_last_10_years_drug"][cle_false] = data_results_within_1_year_last_10_years_drug[data_results_within_1_year_last_10_years_drug.index==False].values[0][0]
    except Exception as e:
        print(e)
        stats["results_within_1_year_last_10_years_drug"][cle_false] = 0
    try:
        results_last_10_years = int(round(100 * stats["results_last_10_years"][cle_true] / (stats["results_last_10_years"][cle_false] + stats["results_last_10_years"][cle_true]), 0))
    except ZeroDivisionError:
        results_last_10_years = 0
    try:
        results_last_10_years_drug = int(round(100 * stats["results_last_10_years_drug"][cle_true] / (stats["results_last_10_years_drug"][cle_false] + stats["results_last_10_years_drug"][cle_true]), 0))
    except ZeroDivisionError:
        results_last_10_years_drug = 0
    try:
        results_within_1_year_last_10_years = int(round(100 * stats["results_within_1_year_last_10_years"][cle_true] / (stats["results_within_1_year_last_10_years"][cle_false] + stats["results_within_1_year_last_10_years"][cle_true]), 0))
    except ZeroDivisionError:
        results_within_1_year_last_10_years = 0
    try:
        results_within_1_year_last_10_years_drug = int(round(100 * stats["results_within_1_year_last_10_years_drug"][cle_true] / (stats["results_within_1_year_last_10_years_drug"][cle_false] + stats["results_within_1_year_last_10_years_drug"][cle_true]), 0))
    except ZeroDivisionError:
        results_within_1_year_last_10_years_drug = 0
    return f"{number_of_trials}", f"{results_last_10_years} %", f"{results_last_10_years_drug} %", f"{results_within_1_year_last_10_years} %", f"{results_within_1_year_last_10_years_drug} %"

In [8]:
print("france")
indicators_france = get_percent(df_interventional_completed)
print(indicators_france)
print("academique")
print(get_percent(df_interventional_completed_academic))

france
('16781', '51 %', '76 %', '32 %', '53 %')
academique
('6715', '28 %', '44 %', '13 %', '20 %')


In [None]:
dd_perim = pd.merge(df_interventional_academic, mapping, left_on="lead_sponsor", right_on="sponsor", indicator=True, how="inner")
dd_perim_ror = dd_perim[dd_perim.ror.apply(lambda x:isinstance(x, str) and "ror.org/" in x)]
dd_perim_ror["ror_simple"] = dd_perim_ror.ror.apply(lambda x:x.split("/")[-1])
dd_perim_ror.ror_simple.value_counts().head(65)
df_perimetre = pd.DataFrame(dd_perim_ror.ror_simple.value_counts().head(64)).reset_index()
df_perimetre.columns = ["ror", "nb"]
df_perimetre = df_perimetre[df_perimetre.ror.apply(lambda x:x not in excluded_rors)]
df_name = dd_perim_ror[["ror_simple", "sponsor_normalized"]].drop_duplicates()
df_perimetre2 = pd.merge(df_perimetre, df_name, left_on="ror", right_on="ror_simple", how="left")
df_perimetre2[["ror", "nb", "sponsor_normalized"]].to_csv("../publipostage2/perimetre.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dd_perim_ror["ror_simple"] = dd_perim_ror.ror.apply(lambda x:x.split("/")[-1])


In [10]:
def get_status(row):
    if row.has_results_or_publications is False:
        return "\U00002B1B"
    if row.has_results_or_publications_within_1y is False and row.has_results_or_publications_within_3y is False:
        return "\U0001F7E5"
    if row.has_results_or_publications_within_1y is False and row.has_results_or_publications_within_3y is True:
        return "\U0001F7E7"
    if row.has_results_or_publications_within_1y is True and row.has_results_or_publications_within_3y is True:
        return "\U0001F7E9"

def get_status_label(row):
    if row.has_results_or_publications is False:
        return "noir"
    if row.has_results_or_publications_within_1y is False and row.has_results_or_publications_within_3y is False:
        return "rouge"
    if row.has_results_or_publications_within_1y is False and row.has_results_or_publications_within_3y is True:
        return "orange"
    if row.has_results_or_publications_within_1y is True and row.has_results_or_publications_within_3y is True:
        return "vert"

In [11]:
def clean_year(y):
    try:
        return str(int(y))
    except:
        return None

In [None]:
def make_data(ror):
    os.system(f"mkdir -p ./../publipostage2/{ror}")
    df_tmp = dd_ror[dd_ror.ror_simple == ror].sort_values(by="study_completion_year")
    sponsor_names = pd.DataFrame(df_tmp.lead_sponsor.value_counts()).reset_index()
    sponsor_names.columns = ["variant", "number_of_trials"]
    sponsor_names[["variant"]].to_csv(f"./../publipostage2/{ror}/liste_variantes_noms_{ror}.csv", index=False, header=False)
    df_tmp["status"] = df_tmp.apply(lambda row: get_status(row), axis=1)
    df_tmp["status_label"] = df_tmp.apply(lambda row: get_status_label(row), axis=1)
    global_stat = {"ror": ror}
    try:
        global_stat["sponsor_name"] = df_tmp.sponsor_normalized.values[0]
    except:
        print("skip " + ror)
        return
    global_stat["sponsor_article"] = "du"
    if global_stat["sponsor_name"][0] in ["A", "E", "I", "O", "U", "Y"]:
        global_stat["sponsor_article"] = "de l'"
    if global_stat["sponsor_name"].split(" ")[0].lower() in ["hospices", "hopitaux", "hôpitaux"]:
        global_stat["sponsor_article"] = "des"
    if global_stat["sponsor_name"].split(" ")[0].lower() in ["hopital", "hôpital"]:
        global_stat["sponsor_article"] = "de l'"
    if global_stat["sponsor_name"].split(" ")[0].lower() in ["clinique"]:
        global_stat["sponsor_article"] = "de la"
    global_stat["number_of_trials"], global_stat["10Y_indicator"], global_stat["10Ydrug_indicator"], global_stat["10Y_lastyear_indicator"], global_stat["10Y_lastyear_drug_indicator"] = get_percent(df_tmp)
    _, global_stat["10Y_indicator_france"], global_stat["10Ydrug_indicator_france"], global_stat["10Y_lastyear_indicator_france"], global_stat["10Y_lastyear_drug_indicator_france"] = indicators_france
    pd.DataFrame([global_stat]).to_csv(f"./../publipostage2/{ror}/indicators_{ror}.csv", index=False)
    df_tmp["study_completion_year"] = df_tmp["study_completion_year"].apply(lambda x:clean_year(x))
    df_liste_essais = df_tmp[["status", "status_label", "NCTId", "eudraCT", "CTIS", "study_completion_year",
                              "title", "acronym",
                              "has_results_or_publications_within_1y", "has_results_or_publications_within_3y", "has_results_or_publications", "intervention_type"]]
    df_liste_essais.columns = ["statut", "status_label", "NCTId", "eudraCT", "CTIS", "completion_year", 
                               "clinical_trial_title", "acronym", 
                               "results_1y", "results_3y", "results", "intervention_type"]
    df_liste_essais.to_csv(f"./../publipostage2/{ror}/liste_essais_cliniques_identifies_{ror}.csv", index=False)
    df_liste_essais.to_excel(f"./../publipostage2/{ror}/liste_essais_cliniques_identifies_{ror}.xlsx", index=False)
    return global_stat

In [None]:
global_data = []
rors_to_compute = df_perimetre.ror.to_list()
for ix, current_ror in enumerate(rors_to_compute):
    if current_ror in excluded_rors:
        continue
    d = make_data(current_ror)
    global_data.append(d)
# Write a file to expose the stats for each academic sponsor
pd.DataFrame(global_data).to_csv(f"./../publipostage2/indicators.csv", index=False)

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with