In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
import warnings

from AdaSample import AdaSample

In [2]:
HideFrac = 0.8
TrainFrac = 0.6
SampFrac = 1.0 #Subsampling of training data when preforming Adasampling

dat = datasets.load_breast_cancer()
X = dat['data']
y = dat['target']
N = X.shape[0]

In [3]:
y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [4]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SVM", SVC(probability=True)),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, samp_frac=SampFrac)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["PU_single"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["PU_ensemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Naive_clf"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["uNeg_clf"] = uNeg_clf.score(X_test, y_test)
pd.DataFrame.from_dict(clf_res)

100%|██████████| 25/25 [00:00<00:00, 369.92it/s]
100%|██████████| 10/10 [00:00<00:00, 578.41it/s]
  0%|          | 0/25 [00:00<?, ?it/s]

SGD_lasso
Training AdaSamples..
Training 10 Classifiers
RF
Training AdaSamples..


100%|██████████| 25/25 [00:00<00:00, 61.39it/s]
100%|██████████| 10/10 [00:00<00:00, 62.31it/s]


Training 10 Classifiers


 12%|█▏        | 3/25 [00:00<00:00, 24.30it/s]

SVM
Training AdaSamples..


100%|██████████| 25/25 [00:01<00:00, 25.00it/s]
 30%|███       | 3/10 [00:00<00:00, 20.37it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 20.86it/s]
  8%|▊         | 2/25 [00:00<00:01, 17.04it/s]

XGB
Training AdaSamples..


100%|██████████| 25/25 [00:01<00:00, 17.62it/s]
 20%|██        | 2/10 [00:00<00:00, 16.45it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 16.24it/s]
100%|██████████| 25/25 [00:00<00:00, 879.11it/s]
100%|██████████| 10/10 [00:00<00:00, 1080.95it/s]

NB
Training AdaSamples..
Training 10 Classifiers





Unnamed: 0,SGD_lasso,RF,SVM,XGB,NB
Naive_clf,0.929825,0.960526,0.587719,0.960526,0.947368
PU_ensemble,0.916667,0.802632,0.421053,0.842105,0.951754
PU_single,0.837719,0.75,0.421053,0.850877,0.951754
uNeg_clf,0.719298,0.600877,0.416667,0.627193,0.947368


In [5]:
HideFrac = 0.2

y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [6]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SVM", SVC(probability=True)),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, samp_frac=SampFrac)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["PU_single"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["PU_ensemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Naive_clf"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["uNeg_clf"] = uNeg_clf.score(X_test, y_test)
pd.DataFrame.from_dict(clf_res)

100%|██████████| 25/25 [00:00<00:00, 505.29it/s]
100%|██████████| 10/10 [00:00<00:00, 665.92it/s]
  0%|          | 0/25 [00:00<?, ?it/s]

SGD_lasso
Training AdaSamples..
Training 10 Classifiers
RF
Training AdaSamples..


100%|██████████| 25/25 [00:00<00:00, 73.98it/s]
100%|██████████| 10/10 [00:00<00:00, 57.22it/s]


Training 10 Classifiers


 16%|█▌        | 4/25 [00:00<00:00, 33.27it/s]

SVM
Training AdaSamples..


100%|██████████| 25/25 [00:00<00:00, 34.87it/s]
 40%|████      | 4/10 [00:00<00:00, 35.39it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 35.75it/s]
 12%|█▏        | 3/25 [00:00<00:01, 20.82it/s]

XGB
Training AdaSamples..


100%|██████████| 25/25 [00:00<00:00, 26.68it/s]
 30%|███       | 3/10 [00:00<00:00, 24.01it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 24.27it/s]
100%|██████████| 25/25 [00:00<00:00, 898.11it/s]
100%|██████████| 10/10 [00:00<00:00, 1045.70it/s]

NB
Training AdaSamples..
Training 10 Classifiers





Unnamed: 0,SGD_lasso,RF,SVM,XGB,NB
Naive_clf,0.925439,0.97807,0.614035,0.969298,0.95614
PU_ensemble,0.850877,0.907895,0.578947,0.929825,0.95614
PU_single,0.855263,0.885965,0.469298,0.934211,0.95614
uNeg_clf,0.425439,0.916667,0.596491,0.907895,0.95614
