In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
import warnings

from AdaSample import AdaSample

In [2]:
HideFrac = 0.8
TrainFrac = 0.6
N = 200000

SampFrac = 0.1 #Subsampling of training data when preforming Adasampling
NBoosts = 50 #Number of AdaSampling rounds

X, y = datasets.make_classification(N, 10, 5)

In [3]:
y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [4]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SGD_SVM", SGDClassifier(loss="modified_huber")),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, sampleFactor=SampFrac, n_rounds=NBoosts)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["AdaSingle"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["AdaEnsemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Original"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["Baseline"] = uNeg_clf.score(X_test, y_test)
        
pd.DataFrame.from_dict(clf_res)

  6%|▌         | 3/50 [00:00<00:01, 25.95it/s]

SGD_lasso
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 31.99it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:03<00:00,  3.17it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

RF
Training AdaSamples..


100%|██████████| 50/50 [00:31<00:00,  1.58it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [01:03<00:00,  6.35s/it]
 12%|█▏        | 6/50 [00:00<00:00, 55.04it/s]

SGD_SVM
Training AdaSamples..


100%|██████████| 50/50 [00:00<00:00, 54.92it/s]
 10%|█         | 1/10 [00:00<00:01,  5.30it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:01<00:00,  5.40it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

XGB
Training AdaSamples..


100%|██████████| 50/50 [01:11<00:00,  1.42s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [02:02<00:00, 12.27s/it]
  6%|▌         | 3/50 [00:00<00:01, 25.18it/s]

NB
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 25.97it/s]
 10%|█         | 1/10 [00:00<00:00,  9.55it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 10.41it/s]


Unnamed: 0,SGD_lasso,RF,SGD_SVM,XGB,NB
AdaEnsemble,0.7146,0.939462,0.781175,0.899438,0.669825
AdaSingle,0.714625,0.927488,0.780963,0.901275,0.669913
Baseline,0.610363,0.621788,0.533225,0.509563,0.660575
Original,0.74935,0.970463,0.676162,0.926162,0.691887


In [5]:
HideFrac=0.2

y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [6]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SGD_SVM", SGDClassifier(loss="modified_huber")),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, sampleFactor=SampFrac, n_rounds=NBoosts)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["AdaSingle"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["AdaEnsemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Original"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["Baseline"] = uNeg_clf.score(X_test, y_test)

pd.DataFrame.from_dict(clf_res)

  8%|▊         | 4/50 [00:00<00:01, 32.25it/s]

SGD_lasso
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 40.21it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:02<00:00,  4.30it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

RF
Training AdaSamples..


100%|██████████| 50/50 [00:23<00:00,  2.12it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]
 14%|█▍        | 7/50 [00:00<00:00, 61.72it/s]

SGD_SVM
Training AdaSamples..


100%|██████████| 50/50 [00:00<00:00, 62.61it/s]
 10%|█         | 1/10 [00:00<00:01,  7.03it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:01<00:00,  6.95it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

XGB
Training AdaSamples..


100%|██████████| 50/50 [00:57<00:00,  1.15s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [01:41<00:00, 10.19s/it]
  6%|▌         | 3/50 [00:00<00:01, 28.91it/s]

NB
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 29.47it/s]
 20%|██        | 2/10 [00:00<00:00, 13.69it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 13.68it/s]


Unnamed: 0,SGD_lasso,RF,SGD_SVM,XGB,NB
AdaEnsemble,0.727137,0.960425,0.783537,0.907763,0.666275
AdaSingle,0.727087,0.955113,0.782212,0.907488,0.666375
Baseline,0.7373,0.933737,0.545837,0.914713,0.764387
Original,0.728313,0.970975,0.63455,0.927075,0.689675
