In [74]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import json
from sklearn.metrics import roc_auc_score
from scipy.special import softmax

### Loading our Data

In [4]:
# Read the raw WBC records from disk and parse them into `wbc_data`.
# NOTE(review): absolute, user-specific path — the notebook only runs on a
# machine where this file exists.
with open('/home/colin/wbc_data_2021.json') as data_file:
    wbc_data = json.loads(data_file.read())

In [86]:
# Width of the one-hot encoding applied to each record's 'discrete' value.
N_DISCRETE_CLASSES = 9
# Built once; row i is the one-hot vector for class index i.
ONE_HOT_ROWS = np.eye(N_DISCRETE_CLASSES)


def summarize_orders(orders, featurize):
    """Average a per-record feature vector over all records of each order.

    Parameters
    ----------
    orders : dict
        Maps order id -> dict of records. Each record is read for its
        'label' entry and for whatever `featurize` accesses.
    featurize : callable
        Maps one record to a numeric vector; vectors within an order are
        averaged element-wise (``np.mean(..., axis=0)``).

    Returns
    -------
    dict
        order id -> {'diff': mean feature vector, 'label': label of the
        first record of that order}.

    Raises
    ------
    IndexError
        If an order contains no records (there is no first record to take
        the label from).
    """
    summary = {}
    for order_id, order_records in orders.items():
        records = list(order_records.values())
        summary[order_id] = {
            'diff': np.mean([featurize(record) for record in records], axis=0),
            # Label comes from the first record only — presumably every record
            # of an order carries the same label; TODO confirm.
            'label': records[0]['label'],
        }
    return summary


# Discrete features: mean of one-hot rows per order
# (assumes 'discrete' is an integer class index in [0, 9) — TODO confirm).
normalized_diff_counts = summarize_orders(
    wbc_data, lambda record: ONE_HOT_ROWS[record['discrete']])

# Continuous features: element-wise mean of each record's 'continous' vector
# (the key is spelled this way in the data as read here; do not "fix" it in code).
normalized_diff_counts_cont = summarize_orders(
    wbc_data, lambda record: record['continous'])

### Creating k-fold splits

In [87]:
from sklearn.model_selection import StratifiedKFold

In [93]:
# Pick which per-order feature set feeds the classifiers.
#   discrete=True  -> one-hot-averaged features (normalized_diff_counts)
#   discrete=False -> averaged continuous features (normalized_diff_counts_cont)
discrete = False
feature_source = normalized_diff_counts if discrete else normalized_diff_counts_cont

# `order_ids` is needed by the cross-validation cell in BOTH modes; it used to
# be assigned only in the continuous branch, so discrete=True either raised a
# NameError or silently reused ids from a previous run.
order_ids = np.array(list(feature_source.keys()))
# Rows of x_data / y_data are in the same order as order_ids (same dict iteration).
x_data = np.array([entry['diff'] for entry in feature_source.values()])
y_data = np.array([entry['label'] for entry in feature_source.values()])

In [89]:
# Models to evaluate; `names` holds the display label for the classifier at the
# same position (the two lists are zipped together by the evaluation loop).
gp_kernel = 1.0 * RBF(1.0)
classifiers = [GaussianProcessClassifier(kernel=gp_kernel)]
names = ["Gaussian Process"]

In [94]:
# Stratified 6-fold cross-validation of every (classifier, name) pair.
#   stats[name]          -> per-fold accuracies / ROC AUCs and their means
#   predictions[order_id] -> out-of-fold P(class 1) for that order
# NOTE(review): `predictions` is keyed by order id only, so when more than one
# probabilistic classifier is evaluated, later models overwrite earlier ones.
stats = {}
predictions = {}
for classifier, name in zip(classifiers, names):

    # Fixed seed: every classifier is scored on the same folds.
    folder = StratifiedKFold(n_splits=6, shuffle=True, random_state=1)
    aucs = []
    accs = []

    for train_split, test_split in folder.split(x_data, y_data):
        x_train = x_data[train_split]
        y_train = y_data[train_split]
        x_test  = x_data[test_split]
        y_test  = y_data[test_split]
        test_ids = order_ids[test_split]
        classifier.fit(x_train, y_train)
        test_acc = classifier.score(x_test, y_test)
        # Select the scoring method by capability rather than by display name:
        # the old hard-coded name list crashed for any unlisted estimator that
        # lacks decision_function and broke whenever a label was renamed.
        if hasattr(classifier, 'predict_proba'):
            # Column 1 = probability of the second class in classifier.classes_.
            preds = classifier.predict_proba(x_test)[:, 1]
            for pred, test_id in zip(preds, test_ids):
                # Plain str/float keep the dict JSON-friendly and its repr clean.
                predictions[str(test_id)] = float(pred)
        else:
            # Margin-style scores (e.g. SVMs without probability estimates);
            # these are used for AUC only and are not stored in `predictions`.
            preds = classifier.decision_function(x_test)
        test_auc = roc_auc_score(y_test, preds)
        accs.append(test_acc)
        aucs.append(test_auc)
    stats[name] = {
        'accs': accs,
        'mean_acc': np.mean(accs),
        'aucs': aucs,
        'mean_auc': np.mean(aucs)
    }
    print(name)

Gaussian Process


In [96]:
# Persist the out-of-fold predictions (order id -> probability) as JSON.
# NOTE(review): absolute, user-specific output path.
with open('/home/colin/gwbc_preds.json', 'w') as preds_file:
    json.dump(predictions, preds_file)

In [95]:
# Inspect the order-id -> predicted-probability mapping filled in by the
# cross-validation loop. NOTE(review): this prints the whole dict; consider
# showing only a few entries.
predictions

{'10051880741': 0.6139959697049733,
 '10049481303': 0.9458008831501274,
 '10050819299': 0.2597395894612191,
 '10051439308': 0.3709068173921253,
 '10049403339': 0.6763236692418104,
 '10049366433': 0.7839420007494482,
 '10050899804': 0.2400931517432241,
 '10049496932': 0.9557694813481703,
 '10051884777': 0.6520359961507438,
 '10049376321': 0.846478496053237,
 '10049401861': 0.7706570866956781,
 '10049315083': 0.508282354937192,
 '10049502585': 0.49312387611346153,
 '10049404369': 0.7803692831255375,
 '10050320686': 0.25249193989446894,
 '10049466756': 0.14350137015526343,
 '10050990507': 0.934824101797517,
 '10049464868': 0.09641418305125171,
 '10015882182': 0.05744774202571534,
 '10049492138': 0.49282744846268933,
 '10049306422': 0.3458424446262143,
 '10051259950': 0.7234735629325542,
 '10049488897': 0.8471384172408989,
 '10051254648': 0.3606195635865106,
 '10051439228': 0.4983838897367239,
 '10049385053': 0.5766242942227393,
 '10051880114': 0.1451948766332407,
 '10049467117': 0.0808922

In [73]:
# Per-classifier fold accuracies / AUCs and their means.
# NOTE(review): the saved output under this cell lists classifiers that are not
# in the current `classifiers` list, so it presumably comes from an earlier
# run — re-run to refresh.
stats

{'Linear SVM': {'accs': [0.5897435897435898,
   0.8717948717948718,
   0.6923076923076923,
   0.717948717948718,
   0.6923076923076923,
   0.6923076923076923],
  'mean_acc': 0.7094017094017095,
  'aucs': [0.7421052631578947,
   0.9578947368421052,
   0.7566137566137566,
   0.8306878306878307,
   0.8359788359788359,
   0.7671957671957672],
  'mean_auc': 0.8150793650793652},
 'RBF SVM': {'accs': [0.717948717948718,
   0.7948717948717948,
   0.6923076923076923,
   0.6666666666666666,
   0.6666666666666666,
   0.7435897435897436],
  'mean_acc': 0.7136752136752137,
  'aucs': [0.7052631578947369,
   0.8236842105263158,
   0.6772486772486772,
   0.7777777777777778,
   0.664021164021164,
   0.7486772486772487],
  'mean_auc': 0.73277870602432},
 'Gaussian Process': {'accs': [0.6923076923076923,
   0.9487179487179487,
   0.7435897435897436,
   0.717948717948718,
   0.7948717948717948,
   0.6923076923076923],
  'mean_acc': 0.7649572649572649,
  'aucs': [0.7605263157894737,
   0.9447368421052631,


In [66]:
# Per-classifier fold accuracies / AUCs and their means.
# NOTE(review): the saved output under this cell lists classifiers that are not
# in the current `classifiers` list, so it presumably comes from an earlier
# run — re-run to refresh.
stats

{'Linear SVM': {'accs': [0.5128205128205128,
   0.5128205128205128,
   0.5384615384615384,
   0.5384615384615384,
   0.5384615384615384,
   0.5384615384615384],
  'mean_acc': 0.5299145299145298,
  'aucs': [0.7315789473684211,
   0.7157894736842105,
   0.6137566137566137,
   0.5555555555555556,
   0.6296296296296295,
   0.6587301587301587],
  'mean_auc': 0.6508400631207649},
 'RBF SVM': {'accs': [0.7435897435897436,
   0.6666666666666666,
   0.6666666666666666,
   0.6153846153846154,
   0.6410256410256411,
   0.6153846153846154],
  'mean_acc': 0.6581196581196581,
  'aucs': [0.7947368421052631,
   0.768421052631579,
   0.6984126984126984,
   0.5873015873015873,
   0.7486772486772486,
   0.7328042328042328],
  'mean_auc': 0.7217256103221015},
 'Gaussian Process': {'accs': [0.7948717948717948,
   0.6923076923076923,
   0.7692307692307693,
   0.6153846153846154,
   0.7435897435897436,
   0.6923076923076923],
  'mean_acc': 0.717948717948718,
  'aucs': [0.8657894736842106,
   0.82894736842105

In [42]:
# Show the estimator currently bound to `classifier` (left over from the
# cross-validation loop). The previous source `classifier.` had a dangling
# attribute dot, which is a SyntaxError on re-run.
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [36]:
# Mean fold accuracy of the most recently evaluated classifier
# (`accs` is left over from the cross-validation loop).
np.asarray(accs).mean()

0.6581196581196581

In [37]:
# Mean fold ROC AUC of the most recently evaluated classifier
# (`aucs` is left over from the cross-validation loop).
np.asarray(aucs).mean()

0.7217256103221015

0.7947368421052631

In [26]:
# Continuous scores for the last evaluated fold (`classifier` and `x_test` are
# left over from the cross-validation loop). Not every estimator exposes
# decision_function — GaussianProcessClassifier, the only model currently in
# `classifiers`, does not — so fall back to P(class 1) instead of raising.
(classifier.decision_function(x_test)
 if hasattr(classifier, 'decision_function')
 else classifier.predict_proba(x_test)[:, 1])

array([ 0.8386783 ,  1.2551983 ,  0.74524375,  0.80074106,  0.64063606,
        0.60723565,  0.76313235,  1.2668413 , -0.38123883,  1.27054067,
        0.59130328,  1.01130377,  0.52817078,  0.69674998,  0.7252097 ,
       -1.03380609,  0.90316381, -1.10123188, -0.73089965,  0.82258331,
        0.94020794,  0.80325237,  1.26169926,  0.2431773 ,  1.00280836,
        0.1503732 ,  0.45508818, -1.1919378 ,  0.96311038, -0.79684875,
       -1.14048102,  0.85524406, -0.51163491,  0.49241312, -1.37375542,
        1.08652685,  0.66493067, -1.25669196, -1.3562428 ])

In [28]:
# Hard class predictions for the last evaluated fold (`classifier` and `x_test`
# are left over from the cross-validation loop).
classifier.predict(x_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0])