In [None]:
!python -m pip install multiple-hypothesis-testing

In [None]:
import sys

# Folder on the mounted Google Drive that is added to sys.path
# (presumably where the project modules imported by later cells live - verify).
_COLAB_PROJECT_DIR = '/content/gdrive/Othercomputers/My Laptop/projects/RUNI/Thesis'

try:
    from google.colab import drive
except ImportError:
    # Not running on Google Colab: there is nothing to mount, carry on silently.
    pass
else:
    try:
        drive.mount('/content/gdrive/')
    except Exception as mount_error:
        # Still best-effort (the notebook keeps running), but the failure is
        # reported instead of being swallowed, so a later import error can be
        # traced back to the missing mount.
        print(f'Google Drive mount failed ({mount_error!r}); '
              f'{_COLAB_PROJECT_DIR} was not added to sys.path.')
    else:
        # Debug aid: import glob; print(glob.glob(_COLAB_PROJECT_DIR + '/*'))
        # Guard against duplicate entries when the cell is re-run.
        if _COLAB_PROJECT_DIR not in sys.path:
            sys.path.append(_COLAB_PROJECT_DIR)

In [None]:
from Synthetic_Data_Generators import Multi_Class_Normal_Population as Data_Generator
from Synthetic_Data_Generators import signal_2_noise_roc
from Higher_Criticism import Higher_Criticism
# Shared Higher_Criticism instance; the Monte Carlo cell calls its
# monte_carlo_statistics(...) method. The meaning of use_import and gamma is
# defined in the Higher_Criticism module, which is not shown in this notebook.
higher_criticism = Higher_Criticism(use_import=False, gamma=1.0)

In [None]:
import scipy
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import functools
from sklearn import metrics

# Render all figure text in a monospace font.
plt.rcParams.update({'font.family': 'monospace'})


In [None]:
# Experiment grid: for every (N, r, beta) combination, compare Monte Carlo
# statistics of a signal population against those of a pure-noise population
# of the same N, and store one AUC per statistic.
#
# NOTE(review): no random seed is set in this cell; whether a re-run reproduces
# the same numbers depends on how Data_Generator draws its samples - confirm.
monte_carlo=10000
N_range = [1000, 3000, 10000, 30000]
r_range = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
beta_range = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Statistics returned by monte_carlo_statistics that are not scored here.
_skipped_keys = ('nums_rejected', 'first_drawdown')
# One AUC per (N, r, beta) grid point.
_grid_shape = (len(N_range), len(r_range), len(beta_range))

# Maps statistic name -> float32 array of AUC values indexed [ind_N, ind_r, ind_beta].
collect_results = {}
for ind_N, N in enumerate(N_range):
    # The pure-noise baseline depends only on N, so it is computed once per N.
    noise_generator = Data_Generator(**Data_Generator.params_pure_noise(N = N))
    noise_monte_carlo = higher_criticism.monte_carlo_statistics(monte_carlo=monte_carlo, data_generator=noise_generator)
    for ind_r, r in enumerate(r_range):
        for ind_beta, beta in enumerate(beta_range):
            signal_generator = Data_Generator(**Data_Generator.params_from_N_r_beta(N=N, r=r, beta=beta))
            # (sic) variable name kept unchanged: notebook globals may be read by other cells.
            singal_monte_carlo = higher_criticism.monte_carlo_statistics(monte_carlo=monte_carlo, data_generator=signal_generator)
            for key, noise_values in noise_monte_carlo.items():
                if key in _skipped_keys:
                    continue
                if key not in collect_results:
                    # NaN-filled rather than np.empty: if the run is interrupted
                    # or a grid point is never reached, the missing entries are
                    # visibly NaN instead of uninitialised memory that looks
                    # like a valid AUC.
                    collect_results[key] = np.full(shape=_grid_shape, fill_value=np.nan, dtype=np.float32)
                signal_values = singal_monte_carlo[key]
                # signal_2_noise_roc returns a 3-tuple; only its first element
                # is kept and stored as the AUC.
                auc, _, _ = signal_2_noise_roc(signal_values=signal_values, noise_values=noise_values)
                collect_results[key][ind_N, ind_r, ind_beta] = auc



In [None]:
# Plot, for every (mu, fraction) signal configuration, one ROC curve per
# statistic comparing the signal runs against the pure-noise baseline.
#
# NOTE(review): `collected_data` and `baseline_noise` are not defined in any
# cell shown in this notebook, and `N` / `monte_carlo` are globals left behind
# by the Monte Carlo cell (`N` is whatever value its loop ended on). Confirm
# they describe the data being plotted before trusting the figure titles.


def cmp_signal_values(t1, t2):
    """Compare two (label, value) tuples for sorting.

    Values are compared numerically. Two values that differ by at most 1e-6
    relative to the smaller magnitude are treated as tied, and the tie is
    broken by label so that label 0 sorts before label 1.
    """
    v1, v2 = t1[1], t2[1]
    diff_v12 = v1 - v2
    if abs(diff_v12) <= 1e-6*min(abs(v1),abs(v2)):
        return t1[0] - t2[0]
    return 1 if v1 > v2 else -1


def _signal_vs_noise_roc(signal_values, baseline_values):
    """Return (fpr, tpr, auc) for separating signal (label 1) from noise (label 0).

    Samples are ordered from most to least signal-like, with near-ties
    resolved by cmp_signal_values (noise first, i.e. the pessimistic order).
    """
    # Flip the sign when signal tends to be larger, so that "more signal-like"
    # always means "smaller" and therefore sorts first.
    sort_v_factor = -1 if np.mean(signal_values) > np.mean(baseline_values) else 1
    roc_values_tuples = [(0, v*sort_v_factor) for v in baseline_values]
    roc_values_tuples += [(1, v*sort_v_factor) for v in signal_values]
    sorted_tuples = sorted(roc_values_tuples, key = functools.cmp_to_key(cmp_signal_values))
    sorted_labels = [roc_tuple[0] for roc_tuple in sorted_tuples]
    # Score each sample by its rank (first = highest score). Previously the
    # labels were paired with the values of the *reversed* list, i.e. with the
    # values of other samples; that only works when all values are distinct
    # and mis-groups samples whenever ties occur. Strictly decreasing rank
    # scores keep the ordering decided by the comparator in every case.
    rank_scores = np.arange(len(sorted_labels), 0, -1)
    fpr, tpr, _ = metrics.roc_curve(y_true=sorted_labels, y_score=rank_scores)
    return fpr, tpr, metrics.auc(fpr, tpr)


# Maps (mu, fraction, statistic name) -> AUC of the corresponding ROC curve.
auc_results = {}
for mu, fraction, singal_res in collected_data:
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    for key, baseline_values in baseline_noise.items():
        if key in ['nums_rejected', 'first_drawdown']:
            continue
        signal_values = singal_res[key]
        fpr, tpr, roc_auc = _signal_vs_noise_roc(signal_values, baseline_values)
        auc_results[(mu, fraction, key)] = roc_auc
        ax.plot(fpr, tpr, label=key + f' AUC={roc_auc:.2f}')
    ax.set_title(f'ROC curves for classification of samples of {N} tests using {monte_carlo} monte carlo runs.\n'\
                 + f'Each sample is either pure noise or signal with mu={mu:.2f} fraction={fraction:.2f}')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.legend()
    plt.show()
