In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
import warnings

from AdaSample import AdaSample

In [2]:
HideFrac = 0.8
TrainFrac = 0.6
N = 200000

SampFrac = 0.1 #Subsampling of training data when preforming Adasampling
NBoosts = 50 #Number of AdaSampling rounds

X, y = datasets.make_classification(N, 10, 5)

In [3]:
y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [4]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SGD_SVM", SGDClassifier(loss="modified_huber")),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, samp_frac=SampFrac, n_boost=NBoosts)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["PU_single"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["PU_ensemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Naive_clf"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["uNeg_clf"] = uNeg_clf.score(X_test, y_test)
        
pd.DataFrame.from_dict(clf_res)

  6%|▌         | 3/50 [00:00<00:01, 28.97it/s]

SGD_lasso
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 36.37it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:03<00:00,  3.09it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

RF
Training AdaSamples..


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [01:01<00:00,  6.12s/it]
 14%|█▍        | 7/50 [00:00<00:00, 66.29it/s]

SGD_SVM
Training AdaSamples..


100%|██████████| 50/50 [00:00<00:00, 68.52it/s]
 10%|█         | 1/10 [00:00<00:01,  5.14it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:01<00:00,  5.12it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

XGB
Training AdaSamples..


100%|██████████| 50/50 [01:00<00:00,  1.21s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [01:59<00:00, 11.92s/it]
  6%|▌         | 3/50 [00:00<00:01, 29.66it/s]

NB
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 29.86it/s]
 20%|██        | 2/10 [00:00<00:00, 11.18it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 11.13it/s]


Unnamed: 0,SGD_lasso,RF,SGD_SVM,XGB,NB
Naive_clf,0.91185,0.9712,0.8897,0.955788,0.885613
PU_ensemble,0.887525,0.93155,0.846988,0.941588,0.866175
PU_single,0.887687,0.8831,0.847562,0.940813,0.866413
uNeg_clf,0.52185,0.616237,0.79445,0.504687,0.7663


In [5]:
HideFrac=0.2

y_PU = np.copy(y)
Ps = np.where(y_PU == 1)[0]
y_PU[np.random.choice(Ps, int(np.floor(len(Ps) * HideFrac)))] = 0

inds = np.random.permutation(X.shape[0])
train_inds, test_inds = inds[:int(N * TrainFrac)], inds[int(N * TrainFrac):]

X_train, X_test = X[train_inds, :], X[test_inds, :]
y_train, y_test = y[train_inds], y[test_inds]
y_PU_train, y_PU_test = y_PU[train_inds], y_PU[test_inds]

In [6]:
clf_res = {}
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for name, clf in [("SGD_lasso", SGDClassifier(loss="log", penalty="l1")),
                      ("RF", RandomForestClassifier()),
                      ("SGD_SVM", SGDClassifier(loss="modified_huber")),
                      ("XGB", XGBClassifier()),
                      ("NB", GaussianNB())]:
        print(name)
        clf_res[name] = {}
        ada = AdaSample(clone(clf))
        ada.fit(X_train, y_PU_train, C=10, samp_frac=SampFrac, n_boost=NBoosts)
        probas = ada.predict_proba(X_test, single=True)[:, 1]
        clf_res[name]["PU_single"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        probas = ada.predict_proba(X_test)[:, 1]
        clf_res[name]["PU_ensemble"] = accuracy_score(y_test, (probas > 0.5).astype(np.int))

        naive_clf = clone(clf)
        naive_clf.fit(X_train, y_train)
        clf_res[name]["Naive_clf"] = naive_clf.score(X_test, y_test)

        uNeg_clf = clone(clf)
        uNeg_clf.fit(X_train, y_PU_train)
        clf_res[name]["uNeg_clf"] = uNeg_clf.score(X_test, y_test)

pd.DataFrame.from_dict(clf_res)

 10%|█         | 5/50 [00:00<00:01, 42.98it/s]

SGD_lasso
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 44.70it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:02<00:00,  4.16it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

RF
Training AdaSamples..


100%|██████████| 50/50 [00:17<00:00,  2.79it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:43<00:00,  4.32s/it]
 16%|█▌        | 8/50 [00:00<00:00, 76.69it/s]

SGD_SVM
Training AdaSamples..


100%|██████████| 50/50 [00:00<00:00, 74.53it/s]
 10%|█         | 1/10 [00:00<00:01,  6.99it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:01<00:00,  6.93it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

XGB
Training AdaSamples..


100%|██████████| 50/50 [00:55<00:00,  1.11s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 Classifiers


100%|██████████| 10/10 [01:39<00:00,  9.98s/it]
  8%|▊         | 4/50 [00:00<00:01, 32.25it/s]

NB
Training AdaSamples..


100%|██████████| 50/50 [00:01<00:00, 31.48it/s]
 20%|██        | 2/10 [00:00<00:00, 13.49it/s]

Training 10 Classifiers


100%|██████████| 10/10 [00:00<00:00, 13.35it/s]


Unnamed: 0,SGD_lasso,RF,SGD_SVM,XGB,NB
Naive_clf,0.911288,0.970063,0.864925,0.9549,0.883875
PU_ensemble,0.9089,0.963163,0.903687,0.94835,0.866962
PU_single,0.909,0.9615,0.904262,0.947887,0.8672
uNeg_clf,0.9,0.934375,0.596575,0.952562,0.869775
