# Initial codes

This notebook illustrates how Bayesian inference can be used to infer the response rate of each group (basket) in a basket trial.

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Reload imported modules automatically before each cell runs, so that edits
# to imported source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys, os
from os.path import exists

sys.path.append('..')
sys.path.append('.')

In [3]:
import numpy as np
import pandas as pd
import arviz as az
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [4]:
from pyBasket.common import load_obj
from pyBasket.common import (
    GROUP_STATUS_EARLY_STOP_FUTILE,
    GROUP_STATUS_COMPLETED_INEFFECTIVE,
    GROUP_STATUS_COMPLETED_EFFECTIVE,
    MODEL_INDEPENDENT,
    MODEL_INDEPENDENT_BERN,
    MODEL_HIERARCHICAL_BERN,
    MODEL_BHM,
    MODEL_PYBASKET
)

In [5]:
def get_output_filenames(result_dir, scenario, with_clustering_info, n_clusters):
    """Build the path of the trial-results file for one experiment setting.

    The file name has the form
    ``scenario_<scenario>_clustering_<flag>[_ncluster_<n>]_trial_results.p``;
    the ``_ncluster_<n>`` part is only present when ``with_clustering_info``
    is truthy.

    Args:
        result_dir: directory that contains the result files.
        scenario: scenario identifier, formatted into the file name.
        with_clustering_info: whether the run used clustering information.
        n_clusters: number of clusters; ignored when clustering is off.

    Returns:
        ``result_dir`` joined with the generated file name.
    """
    name_parts = [f'scenario_{scenario}_clustering_{with_clustering_info}']
    if with_clustering_info:
        # the cluster count only appears in the name of clustered runs
        name_parts.append(f'_ncluster_{n_clusters}')
    name_parts.append('_trial_results.p')
    return os.path.join(result_dir, ''.join(name_parts))

In [6]:
# Directory containing the trial result files: ../scripts/results, made
# absolute relative to the current working directory.
out_dir = os.path.abspath(os.path.join('..', 'scripts', 'results'))
out_dir

'/Users/joewandy/Work/git/pyBasket/scripts/results'

In [7]:
# Grid of experiment settings whose result files are looked up by gather_data.
scenarios = range(0, 6)  # scenario ids 0..5
clustering = [False, True]  # runs without / with clustering information
n_clusters = [5, 10]  # cluster counts; only used for the clustering=True runs

In [8]:
def gather_data(scenarios, clustering, n_clusters, out_dir):
    """Load every available trial-results file for the given settings grid.

    For each scenario and clustering flag, one file is looked up when
    clustering is off (cluster count ``None``) and one per entry of
    ``n_clusters`` when it is on. Each candidate path is printed before the
    load is attempted; files that do not exist are skipped silently.

    Args:
        scenarios: iterable of scenario identifiers.
        clustering: iterable of booleans (clustering off / on).
        n_clusters: cluster counts to try when clustering is on.
        out_dir: directory containing the result files.

    Returns:
        dict mapping ``(scenario, clustering_flag, n_clusters_or_None)`` to
        the object returned by ``load_obj`` for that file.
    """
    loaded = {}

    for scenario in scenarios:
        for cl in clustering:
            # without clustering there is a single file and no cluster count
            cluster_counts = n_clusters if cl else [None]

            for nc in cluster_counts:
                try:
                    path = get_output_filenames(out_dir, scenario, cl, nc)
                    print(path)
                    # NOTE(review): load_obj presumably unpickles the file;
                    # only point this at trusted result files.
                    loaded[(scenario, cl, nc)] = load_obj(path)
                except FileNotFoundError:
                    # best effort: not every configuration has been run
                    continue

    return loaded

In [9]:
# Load all available result files, keyed by (scenario, clustering flag, number of clusters).
data = gather_data(scenarios, clustering, n_clusters, out_dir)

/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_0_clustering_False_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_0_clustering_True_ncluster_5_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_0_clustering_True_ncluster_10_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_1_clustering_False_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_1_clustering_True_ncluster_5_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_1_clustering_True_ncluster_10_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_2_clustering_False_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_2_clustering_True_ncluster_5_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_2_clustering_True_ncluster_10_trial_results.p
/Users/joewandy/Work/git/pyBasket/scripts/results/scenario_3_clustering_False_trial_results

In [10]:
def get_analysis(data, trial_idx, scenario, use_clustering, num_clusters, last_only=True):
    """Collect the per-analysis results of one trial of one experiment setting.

    Args:
        data: dict keyed by ``(scenario, use_clustering, num_clusters)`` whose
            values are indexable sequences of trial objects exposing ``.idfs``.
        trial_idx: index of the trial inside the selected sequence.
        scenario, use_clustering, num_clusters: components of the lookup key.
        last_only: when true, every non-empty entry of ``.idfs`` is reduced to
            its last element; empty entries are passed through unchanged.

    Returns:
        dict mapping analysis name to its (possibly reduced) entry, or ``None``
        (after printing a message) when the key is unknown or ``trial_idx`` is
        beyond the end of the sequence.
    """
    lookup = (scenario, use_clustering, num_clusters)
    if lookup not in data:
        print(f"No trial result found for key {lookup}")
        return None

    trials = data[lookup]
    if trial_idx >= len(trials):
        print(f"Trial index {trial_idx} out of range.")
        return None

    per_analysis = trials[trial_idx].idfs

    def pick(entry):
        # keep only the final element when asked to and there is one to keep
        return entry[-1] if (last_only and entry) else entry

    return {name: pick(per_analysis[name]) for name in per_analysis}

def get_analysis_by_name(analysis_results, analysis_name):
    """Return the entry stored under ``analysis_name``.

    Prints a message and returns ``None`` when ``analysis_name`` is not a key
    of ``analysis_results``.
    """
    if analysis_name in analysis_results:
        return analysis_results[analysis_name]

    print(f"No analysis results found for {analysis_name}")
    return None


def display_analysis(idfs):
    """Show analysis output with the notebook's ``display``.

    A list is displayed element by element, any other non-``None`` value is
    displayed as a single object, and ``None`` only prints a notice.
    """
    if idfs is None:
        print("No data to display.")
        return

    # normalise to a list so both cases share one display loop
    items = idfs if isinstance(idfs, list) else [idfs]
    for item in items:
        display(item)


In [11]:
def get_true_arr(scenario, n_trials=None):
    """Build the ground-truth effectiveness matrix for a scenario.

    In scenario ``k`` (0 to 5) the first ``k`` of the six baskets are truly
    effective (``True``) and the remaining ones are not (``False``). The same
    row is repeated once per trial.

    Args:
        scenario: scenario identifier, one of 0, 1, 2, 3, 4, 5.
        n_trials: number of rows (trials) to generate. When omitted, the
            notebook-level variable ``total`` is used, which keeps existing
            ``get_true_arr(scenario)`` calls working; pass it explicitly to
            avoid depending on that global.

    Returns:
        Boolean ``numpy`` array of shape ``(n_trials, 6)``.

    Raises:
        ValueError: if ``scenario`` is not one of the six known scenarios.
        NameError: if ``n_trials`` is omitted and ``total`` is not defined.
    """
    n_baskets = 6
    if scenario not in range(n_baskets):
        raise ValueError(
            f"Unknown scenario {scenario!r}; expected an integer from 0 to {n_baskets - 1}"
        )

    if n_trials is None:
        # backward-compatible fallback to the notebook-level global
        n_trials = total

    # the first `scenario` baskets are the truly effective ones
    y_true = np.arange(n_baskets) < scenario
    return np.tile(y_true, (n_trials, 1))

In [12]:
def get_pred_arr(data, total, analysis_name, scenario, use_clustering, num_clusters):
    """Build the predicted-effectiveness matrix for one analysis and setting.

    For each trial index in ``range(total)`` the result table of
    ``analysis_name`` is fetched, and its ``group_status`` column is compared
    with ``GROUP_STATUS_COMPLETED_EFFECTIVE``.

    Args:
        data: loaded results, as expected by ``get_analysis``.
        total: number of trials to read.
        analysis_name: name of the analysis whose table is used.
        scenario, use_clustering, num_clusters: select the experiment setting.

    Returns:
        ``numpy`` array with one row of booleans per trial.
    """

    def effective_mask(trial_idx):
        # fetch this trial's results, then the table of the requested analysis
        results = get_analysis(data, trial_idx, scenario, use_clustering, num_clusters)
        table = get_analysis_by_name(results, analysis_name)
        return (table['group_status'] == GROUP_STATUS_COMPLETED_EFFECTIVE).values

    return np.array([effective_mask(trial_idx) for trial_idx in range(total)])

|                   | Predicted Negative | Predicted Positive |
|-------------------|--------------------|--------------------|
| Actual Negative   |        TN          |        FP          |
| Actual Positive   |        FN          |        TP          |

In [35]:
def get_percent_reject(y_true_all, y_pred_all, basket_idx):
    """Percentage of trials with a false positive for one basket.

    A trial counts when the basket's true label is ``False`` and its predicted
    label is ``True``. The count is divided by the total number of trials
    (rows of ``y_true_all``) and scaled to a percentage.

    Args:
        y_true_all: 2-D array of true labels, indexed ``[trial, basket]``.
        y_pred_all: 2-D array of predicted labels, same layout.
        basket_idx: column (basket) to evaluate.

    Returns:
        The percentage, between 0 and 100.
    """
    truth_col = y_true_all[:, basket_idx]
    pred_col = y_pred_all[:, basket_idx]
    false_positive = np.logical_and(truth_col == False, pred_col == True)
    n_trials = y_true_all.shape[0]
    return false_positive.sum() / n_trials * 100

In [36]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    try:
        return numerator / denominator
    except ZeroDivisionError:
        return 0.0


def get_metrics(y_true_all, y_pred_all, basket_idx):
    """Compute binary classification metrics for one basket across trials.

    Args:
        y_true_all: 2-D array of true labels, indexed ``[trial, basket]``.
        y_pred_all: 2-D array of predicted labels, same layout.
        basket_idx: column (basket) to evaluate.

    Returns:
        Tuple ``(TN, FP, FN, TP, accuracy, precision, recall, specificity,
        f1, fpr)``. The counts are floats. Any ratio whose denominator is
        zero is reported as 0.0.

        The eighth element is ``TN / (TN + FP)``, i.e. the specificity
        (true-negative rate). It was previously labelled "sensitivity", but
        sensitivity is the same quantity as ``recall``. The value and its
        position in the tuple are unchanged.
    """
    cm = confusion_matrix(y_true_all[:, basket_idx], y_pred_all[:, basket_idx], labels=[False, True])

    # plain Python floats, so a zero denominator raises ZeroDivisionError
    # (handled in _safe_ratio) instead of silently producing nan
    TN, FP = float(cm[0][0]), float(cm[0][1])
    FN, TP = float(cm[1][0]), float(cm[1][1])

    accuracy = _safe_ratio(TN + TP, TN + FP + FN + TP)
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    specificity = _safe_ratio(TN, TN + FP)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)
    fpr = _safe_ratio(FP, FP + TN)

    return TN, FP, FN, TP, accuracy, precision, recall, specificity, f1, fpr

In [37]:
def analyze_rejections(data, total, n_scenario, n_baskets, analysis_names, use_clustering, num_clusters):
    """Tabulate the per-basket rejection percentage for each scenario and analysis.

    For every analysis name and every scenario in ``range(n_scenario)``, the
    true and predicted label matrices are built and ``get_percent_reject`` is
    evaluated for each basket in ``range(n_baskets)``.

    Args:
        data: loaded results, as expected by ``get_pred_arr``.
        total: number of trials to read per setting.
        n_scenario: number of scenarios to evaluate.
        n_baskets: number of baskets (columns of the result).
        analysis_names: analyses to compare.
        use_clustering, num_clusters: select which set of results is used.

    Returns:
        DataFrame indexed by ``(scenario, analysis_name)`` with one column per
        basket index, holding the value returned by ``get_percent_reject``.
    """
    records = []
    for analysis_name in analysis_names:
        for scenario_idx in range(n_scenario):
            truth = get_true_arr(scenario_idx)
            predicted = get_pred_arr(data, total, analysis_name, scenario_idx, use_clustering, num_clusters)

            # one long-format row per basket
            records.extend(
                [scenario_idx, analysis_name, basket_idx, get_percent_reject(truth, predicted, basket_idx)]
                for basket_idx in range(n_baskets)
            )

    long_df = pd.DataFrame(records, columns=['scenario', 'analysis_name', 'basket_idx', 'reject'])
    # long -> wide: one row per (scenario, analysis), one column per basket
    return long_df.pivot(index=['scenario', 'analysis_name'], columns='basket_idx', values='reject')


In [39]:
# Evaluation settings shared by the analysis cells below.
# NOTE: get_true_arr reads `total` as a global, so this cell must be run
# before analyze_rejections is called.
total = 500  # number of trials read per setting
n_scenario = 6  # scenarios 0..5 are evaluated
n_baskets = 6  # number of baskets (columns) per trial
# Analyses to compare; the commented-out entries are currently excluded.
analysis_names = [
    MODEL_INDEPENDENT,
    # MODEL_INDEPENDENT_BERN,
    # MODEL_HIERARCHICAL_BERN,
    MODEL_BHM,
    MODEL_PYBASKET
]

In [40]:
# Results generated without clustering information (no cluster count).
use_clustering = False
num_clusters = None
df = analyze_rejections(data, total, n_scenario, n_baskets, analysis_names, use_clustering, num_clusters)
df

Unnamed: 0_level_0,basket_idx,0,1,2,3,4,5
scenario,analysis_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,BHM,11.8,12.0,11.4,11.6,11.2,12.6
0,independent,7.4,9.8,10.6,9.8,9.8,11.0
0,pyBasket,7.8,10.0,11.2,9.4,9.0,11.2
1,BHM,0.0,28.6,28.4,27.2,28.6,28.8
1,independent,0.0,11.4,12.2,11.0,11.6,10.6
1,pyBasket,0.0,8.4,9.0,7.4,7.0,8.6
2,BHM,0.0,0.0,48.8,49.4,46.6,48.2
2,independent,0.0,0.0,12.6,15.4,12.8,10.8
2,pyBasket,0.0,0.0,6.6,7.6,5.0,4.0
3,BHM,0.0,0.0,0.0,61.4,59.6,61.0


In [41]:
# Results generated with clustering information, using 5 clusters.
use_clustering = True
num_clusters = 5
df = analyze_rejections(data, total, n_scenario, n_baskets, analysis_names, use_clustering, num_clusters)
df

Unnamed: 0_level_0,basket_idx,0,1,2,3,4,5
scenario,analysis_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,BHM,0.8,0.8,0.0,0.0,0.2,0.2
0,independent,4.8,2.8,4.4,4.2,3.8,3.2
0,pyBasket,7.2,4.4,6.6,8.2,6.0,6.0
1,BHM,0.0,0.2,1.4,0.4,0.6,1.0
1,independent,0.0,2.2,5.4,2.4,2.0,2.8
1,pyBasket,0.0,3.0,6.2,4.4,2.6,3.4
2,BHM,0.0,0.0,0.2,0.0,0.0,0.2
2,independent,0.0,0.0,0.4,0.4,0.0,0.8
2,pyBasket,0.0,0.0,0.6,0.8,0.2,1.2
3,BHM,0.0,0.0,0.0,42.6,42.4,41.6


In [42]:
# Results generated with clustering information, using 10 clusters.
use_clustering = True
num_clusters = 10
df = analyze_rejections(data, total, n_scenario, n_baskets, analysis_names, use_clustering, num_clusters)
df

Unnamed: 0_level_0,basket_idx,0,1,2,3,4,5
scenario,analysis_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,BHM,1.4,0.6,1.2,0.4,0.6,0.8
0,independent,7.4,7.8,5.2,5.6,5.6,5.4
0,pyBasket,8.8,8.8,7.4,6.8,8.2,6.8
1,BHM,0.0,0.0,0.2,0.0,0.2,0.2
1,independent,0.0,0.2,0.2,1.0,1.0,0.4
1,pyBasket,0.0,1.0,0.6,2.4,2.4,1.8
2,BHM,0.0,0.0,2.4,2.0,2.2,1.6
2,independent,0.0,0.0,4.6,5.4,3.4,2.6
2,pyBasket,0.0,0.0,4.0,4.8,3.6,2.6
3,BHM,0.0,0.0,0.0,0.6,0.8,0.6
