In [None]:
from lfs import *
from tqdm import tqdm
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score, cohen_kappa_score, accuracy_score

# ---------------------------------------------------------------------------
# Sweep every labeling function (LF) over a grid of quantile thresholds and
# record, per (LF, phase, quantile), its vote counts, coverage and agreement
# with the HINT train/valid labels. The table is written to
# 'lf_each_thresh.csv'.
# ---------------------------------------------------------------------------

# HINT train + validation splits supply the reference labels.
hint_path = './clinical-trial-outcome-prediction/data/'
all_files = glob.glob(os.path.join(hint_path, "phase*train.csv")) + glob.glob(os.path.join(hint_path, "phase*valid.csv"))
hint = pd.concat((pd.read_csv(f) for f in all_files))
hint.rename(columns={'nctid': 'nct_id'}, inplace=True)
print(hint['label'].value_counts())

path = '../CTTI/'
# Hard-coded number of trials per phase, used as the coverage denominator.
# NOTE(review): presumably produced by the commented-out snippet below
# (row counts of studies.txt whose 'phase' contains '1' / '2' / '3') - confirm
# and refresh if the CTTI dump changes.
# study_df = pd.read_csv('../CTTI/studies.txt', sep='|')
# study_df.dropna(subset=['phase'], inplace=True)
# study_df[study_df['phase'].str.contains('1')].shape[0], study_df[study_df['phase'].str.contains('2')].shape[0], study_df[study_df['phase'].str.contains('3')].shape[0]
phase_1_sum, phase_2_sum, phase_3_sum = 60549, 76972, 44087
phase_totals = {'1': phase_1_sum, '2': phase_2_sum, '3': phase_3_sum}

# Columns of the result table: lf, phase, quantile, -1.0, 0.0, 1.0, prop, coverage, acc, ck
result_columns = ['lf', 'phase', 'quantile', '-1.0', '0.0', '1.0', 'positive_perc', 'coverage', 'acc', 'ck']
quantile_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9,]

funcs_all_name = ['num_sponsors', 'num_patients', 'patient_drop', 'sites', 'pvalues', 'update_more_recent', 'death_ae', 'serious_ae', 'all_ae', 'status', 'amendments', 'news_headlines']

# The labeled subset for a phase does not depend on the LF or the quantile,
# so compute it once instead of inside the innermost loop.
labels_by_phase = {phase: hint[hint['phase'].str.contains(phase)] for phase in ['1', '2', '3']}

# One dict per (LF, phase, quantile). Keeping typed values (instead of joining
# them into a comma-separated string and splitting it again) avoids stray
# whitespace in the fields and breakage if a value ever contains a comma.
output = []
for quantile in tqdm(quantile_list):
    # Same order as funcs_all_name. lf_status takes no quantile, so its rows
    # repeat for every quantile.
    funcs_all = [lf_num_sponsors(quantile=quantile), 
                 lf_num_patients(quantile=quantile), 
                 lf_patient_drop(quantile=quantile), 
                 lf_sites(quantile=quantile), 
                 lf_pvalues(quantile=quantile), 
                 lf_update_more_recent(quantile=quantile), 
                 lf_death_ae(quantile=quantile), 
                 lf_serious_ae(quantile=quantile), 
                 lf_all_ae(quantile=quantile), 
                 lf_status(), 
                 lf_amendments(quantile=quantile), 
                 lf_news_headlines(quantile=quantile)]
    assert len(funcs_all) == len(funcs_all_name), "LF list and LF name list are out of sync"

    for names, lf_frame in zip(funcs_all_name, funcs_all):
        for phase, labels_df in labels_by_phase.items():
            # Restrict the LF output to trials that have a HINT label in this phase.
            funcs = lf_frame[lf_frame['nct_id'].isin(labels_df['nct_id'])]
            value_counts = funcs['lf'].value_counts()
            value_dict = value_counts.to_dict()

            # Make sure all three vote values are present so lookups never fail.
            for key in [-1.0, 0.0, 1.0]:
                if key not in value_dict:
                    value_dict[key] = 0

            # Share of positive votes among the 0/1 votes. NaN when the LF cast
            # no 0/1 vote at all (previously a ZeroDivisionError).
            n_voted = value_dict[1.0] + value_dict[0.0]
            positive_perc = value_dict[1.0] / n_voted if n_voted else float('nan')

            # Votes other than -1 on labeled trials, divided by the hard-coded
            # total number of trials in the phase.
            len_all_trials = phase_totals[phase]
            coverage = sum(count for key, count in value_dict.items() if key != -1.0) / len_all_trials

            # Keep only labeled trials on which the LF produced a vote other than -1.
            combined = pd.merge(labels_df, funcs, on='nct_id', how='left')
            combined = combined[combined['lf'] != -1].dropna(subset=['lf'])

            if combined.empty:
                # Nothing to score: record NaN rather than calling the metrics
                # on empty inputs.
                acc = ck = float('nan')
            else:
                acc = accuracy_score(combined['label'], combined['lf'])
                ck = cohen_kappa_score(combined['label'], combined['lf'])

            output.append({
                'lf': names,
                'phase': phase,
                'quantile': quantile,
                '-1.0': value_dict[-1.0],
                '0.0': value_dict[0.0],
                '1.0': value_dict[1.0],
                'positive_perc': positive_perc,
                'coverage': coverage,
                'acc': acc,
                'ck': ck,
            })

df = pd.DataFrame(output, columns=result_columns)
df.to_csv('lf_each_thresh.csv', index=False)

In [None]:
# get labels based on best threshold
import glob
import os
import pandas as pd
from lfs import get_lfs

# get_lfs is pointed at the CTTI directory and at the per-threshold statistics
# file './lf_each_thresh.csv'; its two return values are unpacked here.
df_list, status_lf = get_lfs(path='../CTTI/', lf_each_thresh_path='./lf_each_thresh.csv')

# Reference labels: only the HINT test split is loaded.
# To load every split instead, swap in the line below.
# all_files = glob.glob(os.path.join(path, "phase*train.csv")) + glob.glob(os.path.join(path, "phase*valid.csv")) + glob.glob(os.path.join(path, "phase*test.csv"))
path = './clinical-trial-outcome-prediction/data/'
test_pattern = os.path.join(path, "phase*test.csv")
all_files = glob.glob(test_pattern)

# Stack the per-phase files and rename the id column to 'nct_id'.
hint = pd.concat([pd.read_csv(csv_file) for csv_file in all_files])
hint = hint.rename(columns={'nctid': 'nct_id'})
print(hint['label'].value_counts())

In [None]:
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score, accuracy_score, cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from snorkel.labeling.model import LabelModel, MajorityLabelVoter
import pandas as pd
# ---------------------------------------------------------------------------
# Fit one snorkel LabelModel per phase on the labeling-function votes, override
# its predictions with the status LF where that LF votes, and score the result
# against the HINT labels loaded in `hint`.
# ---------------------------------------------------------------------------

# Trials listed in this file are excluded from evaluation.
# NOTE(review): presumably trials whose status disagrees with the HINT label,
# going by the file name - confirm.
bad_top_test_df = pd.read_csv('./mismatched_status.csv').rename(columns={'nctid': 'nct_id'})

# Per-phase hyper-parameters, indexed in the same order as all_phases.
positive_props = [.4, .5, .5]   # assumed share of the positive class (class_balance)
lrs = [.01, .01, .01]           # LabelModel learning rates
all_combineds = []              # per phase: HINT rows joined with a usable prediction
all_combined_full = []          # per phase: predictions for every trial in df_list[i]
all_phases = ['1', '2', '3']


def _print_metrics(tag, y_true, y_pred):
    """Print one row: tag, accuracy, F1, PR-AUC, ROC-AUC, Cohen's kappa.

    All metrics are computed from the predicted labels in ``y_pred`` (not from
    probability scores), exactly as passed in.
    """
    print(tag, ',', accuracy_score(y_true, y_pred), ',',
          f1_score(y_true, y_pred), ',',
          average_precision_score(y_true, y_pred), ',',
          roc_auc_score(y_true, y_pred), ',',
          cohen_kappa_score(y_true, y_pred))


# Status-LF votes other than -1, as an nct_id -> vote lookup. This does not
# depend on the phase, so it is built once rather than on every iteration.
status_subset = status_lf[status_lf['lf'] != -1]
status_subset_dict = dict(zip(status_subset['nct_id'], status_subset['lf']))

print("phase, acc, f1, prauc, rocauc, kappa")
for i, phase in enumerate(all_phases):
    df2 = df_list[i].copy()
    # Every column after the first is used as an LF vote column.
    L = df2.iloc[:, 1:].values.astype('int')

    label_model = LabelModel(verbose=False, cardinality=2)
    # label_model = MajorityLabelVoter(cardinality=2)

    positive_prop = positive_props[i]
    label_model.fit(L, class_balance=[1 - positive_prop, positive_prop], seed=0, lr=lrs[i], n_epochs=200)
    pred = label_model.predict(L)
    df2['pred'] = pred.astype('int')

    df2 = df2.sort_values('nct_id')
    # Where the status LF voted, its vote replaces the label-model prediction.
    df2['pred'] = df2.apply(lambda x: status_subset_dict.get(x['nct_id'], x['pred']), axis=1)

    all_combined_full.append(df2.copy())

    # Evaluate on HINT trials of this phase, minus the excluded trials.
    hint_subset = hint[hint['phase'].str.contains(phase)]
    hint_subset = hint_subset[~hint_subset['nct_id'].isin(bad_top_test_df['nct_id'])]
    combined = pd.merge(hint_subset, df2, on='nct_id', how='left')
    # Drop trials without a prediction (no match in df2) and predictions of -1.
    combined = combined.dropna(subset=['pred'])
    combined = combined[combined['pred'] != -1]
    # print(phase, hint_subset.shape, combined.shape)
    _print_metrics(phase, combined['label'], combined['pred'])

    all_combineds.append(combined)

# Pooled metrics over the three per-phase evaluation frames.
combined = pd.concat(all_combineds)
_print_metrics('all', combined['label'], combined['pred'])


In [None]:
# Relative frequency of each value among the reference labels and among the
# predictions in the pooled evaluation frame.
for column in ('label', 'pred'):
    counts = combined[column].value_counts()
    print(counts / counts.sum())

In [None]:
# Saving step, kept disabled; change the file prefix to match the label model
# used ('dp' or 'rf').
# all_combineds[0].to_csv('dp_weakpred_phase1.csv', index=False)
# all_combineds[1].to_csv('dp_weakpred_phase2.csv', index=False)
# all_combineds[2].to_csv('dp_weakpred_phase3.csv', index=False)

# Note from the original author: these files contain all predictions for all
# phases, not just the phase in the file name, so filter by phase before use.
weakpred_files = (
    'dp_weakpred_phase1.csv',
    'dp_weakpred_phase2.csv',
    'dp_weakpred_phase3.csv',
    'rf_weakpred_phase1.csv',
    'rf_weakpred_phase2.csv',
    'rf_weakpred_phase3.csv',
)

# Print the (rows, columns) shape of each saved prediction file.
for csv_name in weakpred_files:
    print(pd.read_csv(csv_name).shape)