In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
import warnings

from AdaSample import AdaSample

In [2]:
HideFrac = 0.8
TrainFrac = 0.6
SampFrac = 1.0 #Subsampling of training data when preforming Adasampling

dat = datasets.load_breast_cancer()
X = dat['data']
y = dat['target']
N = X.shape[0]

In [3]:
y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [4]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SVM", SVC(probability=True)),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, sampleFactor=SampFrac)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["AdaSingle"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["AdaEnsemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Original"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["Baseline"] = uNeg_clf.score(X_test, y_test)
pd.DataFrame.from_dict(clf_res)

100%|██████████| 25/25 [00:00<00:00, 538.12it/s]
100%|██████████| 10/10 [00:00<00:00, 624.89it/s]
 24%|██▍       | 6/25 [00:00<00:00, 53.22it/s]

SGD_lasso
Training AdaSamples..
Training 10 Classifiers
RF
Training AdaSamples..


100%|██████████| 25/25 [00:00<00:00, 50.72it/s]
100%|██████████| 10/10 [00:00<00:00, 51.44it/s]


Training 10 Classifiers


  8%|▊         | 2/25 [00:00<00:01, 15.61it/s]

SVM
Training AdaSamples..


100%|██████████| 25/25 [00:01<00:00, 15.81it/s]
 20%|██        | 2/10 [00:00<00:00, 17.85it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 17.90it/s]
  8%|▊         | 2/25 [00:00<00:01, 12.51it/s]

XGB
Training AdaSamples..


100%|██████████| 25/25 [00:01<00:00, 12.80it/s]
 20%|██        | 2/10 [00:00<00:00, 13.53it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 13.19it/s]
100%|██████████| 25/25 [00:00<00:00, 705.12it/s]
100%|██████████| 10/10 [00:00<00:00, 902.56it/s]


NB
Training AdaSamples..
Training 10 Classifiers


Unnamed: 0,SGD_lasso,RF,SVM,XGB,NB
AdaEnsemble,0.45614,0.763158,0.412281,0.798246,0.947368
AdaSingle,0.447368,0.723684,0.403509,0.815789,0.938596
Baseline,0.574561,0.539474,0.394737,0.570175,0.942982
Original,0.872807,0.960526,0.609649,0.973684,0.947368


In [5]:
HideFrac = 0.2

y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [6]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SVM", SVC(probability=True)),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, sampleFactor=SampFrac)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["AdaSingle"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["AdaEnsemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Original"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["Baseline"] = uNeg_clf.score(X_test, y_test)
pd.DataFrame.from_dict(clf_res)

100%|██████████| 25/25 [00:00<00:00, 589.31it/s]
100%|██████████| 10/10 [00:00<00:00, 586.09it/s]
  0%|          | 0/25 [00:00<?, ?it/s]

SGD_lasso
Training AdaSamples..
Training 10 Classifiers
RF
Training AdaSamples..


100%|██████████| 25/25 [00:00<00:00, 62.87it/s]
100%|██████████| 10/10 [00:00<00:00, 67.56it/s]
  0%|          | 0/25 [00:00<?, ?it/s]

Training 10 Classifiers
SVM
Training AdaSamples..


100%|██████████| 25/25 [00:01<00:00, 24.07it/s]
 30%|███       | 3/10 [00:00<00:00, 27.62it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 25.33it/s]
  8%|▊         | 2/25 [00:00<00:01, 14.67it/s]

XGB
Training AdaSamples..


100%|██████████| 25/25 [00:01<00:00, 19.40it/s]
 30%|███       | 3/10 [00:00<00:00, 20.62it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 21.71it/s]
100%|██████████| 25/25 [00:00<00:00, 784.62it/s]
100%|██████████| 10/10 [00:00<00:00, 913.12it/s]

NB
Training AdaSamples..
Training 10 Classifiers





Unnamed: 0,SGD_lasso,RF,SVM,XGB,NB
AdaEnsemble,0.903509,0.95614,0.447368,0.925439,0.916667
AdaSingle,0.903509,0.938596,0.385965,0.929825,0.934211
Baseline,0.859649,0.894737,0.47807,0.929825,0.912281
Original,0.649123,0.969298,0.644737,0.947368,0.938596
