### This is a Tutorial Notebook to get started with CTO as soon as possible, without any frills
- We will be attempting to reproduce baseline result:
    - Training on CTO training labels and testing on TOP test split
- Prerequisites:
    - pip install all requirements as in the requirements.txt
    - Navigate to https://zenodo.org/doi/10.5281/zenodo.11535960 (this link always resolves to the latest version), and download the latest version of CTO! 
    I downloaded the v0.3 labeling.zip, and placed it in the parent directory.
    - Also git clone TOP for comparison purposes
        ```bash
        wget https://zenodo.org/records/11608615/files/labeling.zip -P ../
        git clone https://github.com/futianfan/clinical-trial-outcome-prediction ../clinical-trial-outcome-prediction
        ```

In [1]:
# loading the zipped data
import numpy as np
import pandas as pd
import os
import glob
import zipfile

train_data_mode = 'CTO' # in ['CTO', 'TOP']

# Locations of the two downloads described in the prerequisites above.
TOP_DATA_DIR = "../clinical-trial-outcome-prediction/data"
CTO_ZIP_PATH = '../labeling.zip'


def _read_top_split(split):
    """Concatenate the per-phase TOP CSV files of one split.

    Args:
        split: suffix of the TOP file names to load ('train', 'valid' or 'test').

    Returns:
        A DataFrame with the rows of every ``phase*<split>.csv`` file.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    pattern = f"{TOP_DATA_DIR}/phase*{split}.csv"
    # glob returns paths in arbitrary order; sort so the row order (and hence
    # which row survives a later keep-first de-duplication) is reproducible.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(
            f"No files match {pattern!r}; clone the TOP repository into "
            "../clinical-trial-outcome-prediction first"
        )
    return pd.concat(pd.read_csv(path) for path in paths)


def _read_zip_csv(archive, member):
    """Read one CSV member of an open ZipFile and close its handle afterwards."""
    with archive.open(member) as handle:
        return pd.read_csv(handle)


# we always test on supervised TOP labels
test_df = _read_top_split('test')

if train_data_mode == 'TOP':
    train_df = _read_top_split('train')
    valid_df = _read_top_split('valid')

elif train_data_mode == 'CTO':
    with zipfile.ZipFile(CTO_ZIP_PATH, 'r') as zip_ref:
        # zip_ref.namelist() lists every member if the paths below ever change
        train_df = _read_zip_csv(zip_ref, 'labeling/vs_top/train_dp.csv')
        valid_df = _read_zip_csv(zip_ref, 'labeling/vs_top/valid_dp.csv')

else:
    # Fail here rather than with a NameError on train_df in a later cell.
    raise ValueError(f"Unknown train_data_mode {train_data_mode!r}; expected 'CTO' or 'TOP'")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ============ preprocess by filling NAs and dropping duplicates ============
# ============ set features to phase + diseases + icdcodes + drugs + inclusion / exclusion criteria ============
FEATURE_COLUMNS = ['phase', 'diseases', 'icdcodes', 'drugs', 'criteria']


def _prepare_frame(df):
    """Return a cleaned copy of ``df`` with an added ``features`` text column.

    Missing values are replaced by '', only the first row per ``nctid`` is
    kept, and ``features`` is the space-separated concatenation of
    FEATURE_COLUMNS. The input frame is not modified.
    """
    cleaned = (
        df.fillna('')
        .drop_duplicates(subset=['nctid'])
        # the frames are concatenations of several files, so index labels repeat
        .reset_index(drop=True)
    )
    features = cleaned[FEATURE_COLUMNS[0]]
    for column in FEATURE_COLUMNS[1:]:
        features = features + ' ' + cleaned[column]
    return cleaned.assign(features=features)


# train on train + valid; one shared helper keeps train and test features identical
train_df = _prepare_frame(pd.concat([train_df, valid_df]))
test_df = _prepare_frame(test_df)

# featurize the data (the vocabulary is fit on the training text only)
tfidf = TfidfVectorizer(max_features=2048, stop_words='english')
X_train = tfidf.fit_transform(train_df['features'])
X_test = tfidf.transform(test_df['features'])

In [3]:
# ============ define evaluation ============
from sklearn.metrics import classification_report, f1_score, average_precision_score, roc_auc_score

def bootstrap_eval(y_true, y_pred, y_prob, num_samples=100, seed=None):
    """Bootstrap estimates of F1, average precision and ROC AUC.

    Draws ``num_samples`` resamples (with replacement, same size as the input)
    and scores each one.

    Args:
        y_true: array-like of binary ground-truth labels.
        y_pred: array-like of predicted labels, same length as ``y_true``.
        y_prob: array-like of positive-class scores, same length as ``y_true``.
        num_samples: number of bootstrap resamples to score.
        seed: optional integer seed for reproducible resampling. With the
            default ``None`` the global ``np.random`` state is used, as before.

    Returns:
        Tuple ``(f1_mean, f1_std, ap_mean, ap_std, roc_mean, roc_std)``.

    Raises:
        ValueError: if the inputs differ in length or ``y_true`` holds fewer
            than two classes (ROC AUC is undefined in that case).
    """
    # Positional indexing below requires arrays; a pandas Series would be
    # indexed by label instead.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_prob = np.asarray(y_prob)
    n = len(y_true)
    if not (n == len(y_pred) == len(y_prob)):
        raise ValueError('y_true, y_pred and y_prob must have the same length')
    if len(np.unique(y_true)) < 2:
        raise ValueError('bootstrap_eval needs at least two classes in y_true')

    rng = np.random if seed is None else np.random.RandomState(seed)

    f1s = []
    aps = []
    rocs = []
    while len(f1s) < num_samples:
        indices = rng.choice(n, n, replace=True)
        if len(np.unique(y_true[indices])) < 2:
            # ROC AUC is undefined on a single-class resample; draw again
            continue
        f1s.append(f1_score(y_true[indices], y_pred[indices]))
        aps.append(average_precision_score(y_true[indices], y_prob[indices]))
        rocs.append(roc_auc_score(y_true[indices], y_prob[indices]))
    return np.mean(f1s), np.std(f1s), np.mean(aps), np.std(aps), np.mean(rocs), np.std(rocs)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
# from xgboost import XGBClassifier # pip install xgboost
from sklearn.neural_network import MLPClassifier

def _make_model(model_name):
    """Return an unfitted classifier for one of the supported short names.

    Raises:
        ValueError: if ``model_name`` is not a known model.
    """
    if model_name == 'rf':
        return RandomForestClassifier(n_estimators=300, random_state=0, max_depth=10, n_jobs=4)
    if model_name == 'lr':
        return LogisticRegression(max_iter=1000, random_state=0)
    if model_name == 'svm':
        # LinearSVC has no predict_proba, so wrap it in a calibrator
        # SVC(kernel='linear', probability=True, random_state=0) performs worse than this
        return CalibratedClassifierCV(LinearSVC(dual="auto", max_iter=10000, random_state=0))
    if model_name == 'xgboost':
        # NOTE: needs the commented-out `from xgboost import XGBClassifier` import enabled
        return XGBClassifier(n_estimators=300, random_state=0, max_depth=10, n_jobs=4)
    if model_name == 'mlp':
        return MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=2000, random_state=0)
    raise ValueError('Unknown model name')


# Header matches the row layout printed below: phase, model, then mean/std pairs.
print('Phase, Model, F1, F1_std, AP, AP_std, ROC, ROC_std')
# for model_name in ['svm', 'xgboost', 'mlp', 'rf', 'lr', ]:
for model_name in ['svm', 'lr']: # use fastest models for testing
    model = _make_model(model_name)

    model.fit(X_train, train_df['label'])
    test_df['pred'] = model.predict(X_test)
    test_df['prob'] = model.predict_proba(X_test)[:, 1]

    for phase in ['phase 1', 'phase 2', 'phase 3']:
        # literal substring match on the lower-cased phase string
        in_phase = test_df['phase'].str.lower().str.contains(phase, regex=False)
        test_df_subset = test_df[in_phase]
        f1_mean, f1_std, ap_mean, ap_std, roc_mean, roc_std = bootstrap_eval(
            test_df_subset['label'].values,
            test_df_subset['pred'].values,
            test_df_subset['prob'].values,
        )
        print(f"{phase}, {model_name}, {f1_mean:.3f}, {f1_std:.3f}, {ap_mean:.3f}, {ap_std:.3f}, {roc_mean:.3f}, {roc_std:.3f}")

Model, Phase, F1, AP, ROC
phase 1, svm, 0.711, 0.015, 0.639, 0.027, 0.618, 0.023
phase 2, svm, 0.718, 0.009, 0.644, 0.017, 0.613, 0.015
phase 3, svm, 0.854, 0.007, 0.839, 0.013, 0.653, 0.018
phase 1, lr, 0.727, 0.016, 0.692, 0.025, 0.657, 0.020
phase 2, lr, 0.718, 0.010, 0.680, 0.017, 0.639, 0.013
phase 3, lr, 0.857, 0.008, 0.849, 0.014, 0.676, 0.018


In [5]:
# ================== Results: train on CTO, test on TOP ==================
# Phase, Model, F1 (mean), F1 (std), AP (mean), AP (std), ROC (mean), ROC (std)
# phase 1, svm, 0.715, 0.014, 0.648, 0.030, 0.624, 0.025
# phase 2, svm, 0.718, 0.011, 0.645, 0.019, 0.614, 0.015
# phase 3, svm, 0.854, 0.009, 0.841, 0.015, 0.656, 0.020
# phase 1, lr, 0.725, 0.015, 0.691, 0.029, 0.658, 0.023
# phase 2, lr, 0.716, 0.011, 0.679, 0.016, 0.641, 0.012
# phase 3, lr, 0.856, 0.008, 0.845, 0.015, 0.669, 0.020

# ================== Results: train on TOP, test on TOP ==================
# Phase, Model, F1 (mean), F1 (std), AP (mean), AP (std), ROC (mean), ROC (std)
# phase 1, svm, 0.627, 0.020, 0.642, 0.028, 0.590, 0.022
# phase 2, svm, 0.670, 0.013, 0.662, 0.019, 0.625, 0.013
# phase 3, svm, 0.812, 0.009, 0.873, 0.012, 0.699, 0.016
# phase 1, lr, 0.652, 0.018, 0.664, 0.027, 0.627, 0.020
# phase 2, lr, 0.674, 0.012, 0.697, 0.018, 0.648, 0.013
# phase 3, lr, 0.827, 0.010, 0.885, 0.012, 0.723, 0.015