# Data

In [731]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [738]:
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [740]:
p_num = 5

N_train = len(newsgroups_train.data)
train_target = newsgroups_train.target.copy()
train_target[train_target == 0] = 20
train_target[train_target <= p_num] = 1
train_target[train_target != 1] = -1

In [743]:
N_test = len(newsgroups_test.data)
test_target = newsgroups_test.target.copy()
test_target[test_target  == 0] = 20
test_target[test_target  <= p_num] = 1
test_target[test_target != 1] = -1

## Unigram

In [950]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [744]:
vectorizer = TfidfVectorizer(
    min_df=2, max_df=0.95, stop_words='english', max_features=10000) #, norm=None) #, use_idf=False) 
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

In [745]:
vectors_train.shape

(11314, 10000)

In [746]:
clf = LinearSVC()
clf.fit(vectors_train, train_target)
clf.score(vectors_test, test_target)

0.9462294211364843

In [9]:
np.mean(train_target[:2000])

0.137

In [10]:
a = np.expand_dims(vectors_train.todense(), 1)

In [11]:
np.concatenate([a, a], axis=1).shape

(11314, 2, 10000)

## Bigram

In [9]:
from sklearn.svm import LinearSVC

In [24]:
vectorizer_bi = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    ngram_range=(2, 2),
    stop_words='english',
    max_features=10000)
vectors_train_bi = vectorizer_bi.fit_transform(newsgroups_train.data)
vectors_test_bi = vectorizer_bi.transform(newsgroups_test.data)

In [25]:
vectors_train_bi.shape

(11314, 10000)

In [11]:
clf = LinearSVC()
clf.fit(vectors_train_bi, train_target)
clf.score(vectors_test_bi, test_target)

0.8591343600637281

## Trigram

In [26]:
vectorizer_tri = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    ngram_range=(3, 3),
    stop_words='english',
    max_features=10000)
vectors_train_tri = vectorizer_tri.fit_transform(newsgroups_train.data)
vectors_test_tri = vectorizer_tri.transform(newsgroups_test.data)

In [13]:
vectors_train_tri.shape

(11314, 10000)

In [14]:
clf = LinearSVC()
clf.fit(vectors_train_tri, train_target)
clf.score(vectors_test_tri, test_target)

0.8170472650026553

## Elmo

In [16]:
cd ..

/media/yu-guan/DATA/works/ens/M1/Intern/Codes/pu_biased_n


In [17]:
import h5py
from sklearn import preprocessing

In [18]:
elmo_train_f = h5py.File(
    'data/20newsgroups/20newsgroups_elmo_mmm_train.hdf5', 'r')
elmo_test_f = h5py.File(
    'data/20newsgroups/20newsgroups_elmo_mmm_test.hdf5', 'r')

In [19]:
train_data = elmo_train_f['data'][:]
test_data = elmo_test_f['data'][:]

In [20]:
train_data_p = preprocessing.scale(train_data)
test_data_p = preprocessing.scale(test_data)

## Glove

In [71]:
elmo_train_gf = h5py.File(
    'data/20newsgroups/20newsgroups_glove_mmm_train.hdf5', 'r')
elmo_test_gf = h5py.File(
    'data/20newsgroups/20newsgroups_glove_mmm_test.hdf5', 'r')

In [74]:
train_data_g = elmo_train_gf['data'][:]
test_data_g = elmo_test_gf['data'][:]

In [77]:
train_data_gp = preprocessing.scale(train_data_g)
test_data_gp = preprocessing.scale(test_data_g)

# Pytorch

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import sklearn.metrics

import sys

In [13]:
dtype = torch.cuda.FloatTensor


class Classifier(object):

    def __init__(self, model, lr=5e-3, weight_decay=1e-2):
        self.model = model
        self.lr = lr
        self.weight_decay = weight_decay
        self.test_accuracies = []
        self.init_optimizer()

    def init_optimizer(self):
        self.optimizer = optim.Adam(
            self.model.parameters(),
            lr=self.lr, weight_decay=self.weight_decay)

    def train(self, training_set, test_set,
              batch_size, num_epochs,
              test_interval=1, print_interval=1):

        self.init_optimizer()
        self.test(test_set, True)

        train_loader = torch.utils.data.DataLoader(
            training_set, batch_size=batch_size,
            shuffle=True, num_workers=1)

        for epoch in range(num_epochs):

            total_loss = self.train_step(train_loader)

            if (epoch+1) % test_interval == 0 or epoch+1 == num_epochs:

                to_print = (epoch+1) % print_interval == 0
                if to_print:
                    sys.stdout.write('Epoch: {}  '.format(epoch))
                    print('Train Loss: {:.6f}'.format(total_loss))
                self.test(test_set, to_print)

    def train_step(self, train_loader, convex=True):
        self.model.train()
        total_loss = 0
        for x, target in train_loader:
            self.optimizer.zero_grad()
            loss = self.compute_loss(x, target, convex)
            total_loss += loss.item()
            loss = loss
            loss.backward()
            self.optimizer.step()
        return total_loss

    def basic_loss(self, fx, convex=True):
        if convex:
            negative_logistic = nn.LogSigmoid()
            return -negative_logistic(fx)
        else:
            sigmoid = nn.Sigmoid()
            return sigmoid(-fx)

    def compute_loss(self, x, target, convex=True):
        fx = self.model(x.type(dtype))
        target = target.type(dtype)
        loss = torch.sum(self.basic_loss(fx * target, convex))
        return loss.cpu()

    def test(self, test_set, to_print=True):
        self.model.eval()
        x = test_set.tensors[0].type(dtype)
        target = test_set.tensors[1].numpy()
        target[target == -1] = 0
        output = self.model(x)
        pred = torch.sign(output)
        pred = pred.detach().cpu().numpy().reshape(-1)
        pred[pred == -1] = 0
        accuracy = sklearn.metrics.accuracy_score(target, pred)
        f1_score = sklearn.metrics.f1_score(target, pred)
        self.test_accuracies.append(accuracy)
        if to_print:
            print('Test set: Accuracy: {:.2f}%'
                  .format(accuracy*100), flush=True)
            print('Test set: F1 Score: {:.2f}%'
                  .format(f1_score*100), flush=True)

## unigram

In [11]:
vectors_train.shape

(11314, 130107)

In [45]:
class Linear(nn.Module):

    def __init__(self, num_classes=1):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(130107, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

In [22]:
training_set = torch.utils.data.TensorDataset(
    torch.tensor(vectors_train[:2000].toarray()), torch.tensor(train_target[:2000]).unsqueeze(1))
test_set = torch.utils.data.TensorDataset(
    torch.tensor(vectors_test[:2000].toarray()), torch.tensor(test_target[:2000]).unsqueeze(1))

In [29]:
model = Linear().cuda()
cls = Classifier(model, lr=1e-2, weight_decay=1e-4)

In [30]:
# 1-gram
cls.train(training_set, test_set, batch_size=120, num_epochs=30)

Test set: Accuracy: 974.0/2000 (48.70%)
Epoch: 0  Train Loss: 1305.074692
Test set: Accuracy: 1615.0/2000 (80.75%)
Epoch: 1  Train Loss: 1051.899849
Test set: Accuracy: 1660.0/2000 (83.00%)
Epoch: 2  Train Loss: 872.567608
Test set: Accuracy: 1683.0/2000 (84.15%)
Epoch: 3  Train Loss: 738.338737
Test set: Accuracy: 1717.0/2000 (85.85%)
Epoch: 4  Train Loss: 634.672615
Test set: Accuracy: 1718.0/2000 (85.90%)
Epoch: 5  Train Loss: 553.611063
Test set: Accuracy: 1733.0/2000 (86.65%)
Epoch: 6  Train Loss: 487.804068
Test set: Accuracy: 1734.0/2000 (86.70%)
Epoch: 7  Train Loss: 433.927530
Test set: Accuracy: 1746.0/2000 (87.30%)
Epoch: 8  Train Loss: 389.136547
Test set: Accuracy: 1747.0/2000 (87.35%)
Epoch: 9  Train Loss: 350.918127
Test set: Accuracy: 1759.0/2000 (87.95%)
Epoch: 10  Train Loss: 318.576545
Test set: Accuracy: 1764.0/2000 (88.20%)
Epoch: 11  Train Loss: 290.558525
Test set: Accuracy: 1760.0/2000 (88.00%)
Epoch: 12  Train Loss: 266.098560
Test set: Accuracy: 1770.0/2000 (8

## Elmo

In [67]:
class Net(nn.Module):

    def __init__(self, num_classes=1):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(9216, 300)
        self.fc2 = nn.Linear(300, 300)
        self.fc3 = nn.Linear(300, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [64]:
training_set = torch.utils.data.TensorDataset(
    torch.tensor(train_data_p[:2000]), torch.tensor(train_target[:2000]).unsqueeze(1))
test_set = torch.utils.data.TensorDataset(
    torch.tensor(test_data_p[:2000]), torch.tensor(test_target[:2000]).unsqueeze(1))

In [68]:
model = Net().cuda()
cls = Classifier(model, lr=5e-3, weight_decay=1e-2)

In [70]:
# elmo
cls.train(training_set, test_set, batch_size=120, num_epochs=30)

Test set: Accuracy: 1496.0/2000 (74.80%)
Epoch: 0  Train Loss: 1009.933940
Test set: Accuracy: 1724.0/2000 (86.20%)
Epoch: 1  Train Loss: 207.716928
Test set: Accuracy: 1754.0/2000 (87.70%)
Epoch: 2  Train Loss: 88.482116
Test set: Accuracy: 1742.0/2000 (87.10%)
Epoch: 3  Train Loss: 42.204642
Test set: Accuracy: 1763.0/2000 (88.15%)
Epoch: 4  Train Loss: 15.729638
Test set: Accuracy: 1766.0/2000 (88.30%)
Epoch: 5  Train Loss: 74.469606
Test set: Accuracy: 1736.0/2000 (86.80%)
Epoch: 6  Train Loss: 138.549006
Test set: Accuracy: 1747.0/2000 (87.35%)
Epoch: 7  Train Loss: 248.145907
Test set: Accuracy: 1715.0/2000 (85.75%)
Epoch: 8  Train Loss: 50.093974
Test set: Accuracy: 1764.0/2000 (88.20%)
Epoch: 9  Train Loss: 48.535688
Test set: Accuracy: 1777.0/2000 (88.85%)
Epoch: 10  Train Loss: 32.726519
Test set: Accuracy: 1744.0/2000 (87.20%)
Epoch: 11  Train Loss: 27.642034
Test set: Accuracy: 1772.0/2000 (88.60%)
Epoch: 12  Train Loss: 4.739286
Test set: Accuracy: 1767.0/2000 (88.35%)
Epo

## Glove

In [85]:
class Net(nn.Module):

    def __init__(self, num_classes=1):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(900, 300)
        self.fc2 = nn.Linear(300, 300)
        self.fc3 = nn.Linear(300, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [86]:
training_set = torch.utils.data.TensorDataset(
    torch.tensor(train_data_gp[:2000]), torch.tensor(train_target[:2000]).unsqueeze(1))
test_set = torch.utils.data.TensorDataset(
    torch.tensor(test_data_gp[:2000]), torch.tensor(test_target[:2000]).unsqueeze(1))

In [89]:
model = Net().cuda()
cls = Classifier(model, lr=1e-3, weight_decay=1e-2)

In [90]:
# glove
cls.train(training_set, test_set, batch_size=120, num_epochs=30)

Test set: Accuracy: 957.0/2000 (47.85%)
Epoch: 0  Train Loss: 1029.039371
Test set: Accuracy: 1645.0/2000 (82.25%)
Epoch: 1  Train Loss: 581.739660
Test set: Accuracy: 1662.0/2000 (83.10%)
Epoch: 2  Train Loss: 373.271939
Test set: Accuracy: 1653.0/2000 (82.65%)
Epoch: 3  Train Loss: 212.011153
Test set: Accuracy: 1641.0/2000 (82.05%)
Epoch: 4  Train Loss: 172.560154
Test set: Accuracy: 1645.0/2000 (82.25%)
Epoch: 5  Train Loss: 129.949013
Test set: Accuracy: 1672.0/2000 (83.60%)
Epoch: 6  Train Loss: 58.449028
Test set: Accuracy: 1660.0/2000 (83.00%)
Epoch: 7  Train Loss: 35.118408
Test set: Accuracy: 1667.0/2000 (83.35%)
Epoch: 8  Train Loss: 19.493456
Test set: Accuracy: 1675.0/2000 (83.75%)
Epoch: 9  Train Loss: 24.963661
Test set: Accuracy: 1678.0/2000 (83.90%)
Epoch: 10  Train Loss: 18.041544
Test set: Accuracy: 1672.0/2000 (83.60%)
Epoch: 11  Train Loss: 11.603492
Test set: Accuracy: 1676.0/2000 (83.80%)
Epoch: 12  Train Loss: 9.577533
Test set: Accuracy: 1683.0/2000 (84.15%)
Ep

# bN

In [571]:
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target

In [780]:
num_classes = 20

# positive_classes = [i for i in range(11)]
positive_classes = [1, 2, 3, 4, 5]
negative_classes = None
# neg_ps = [0] * 11 + [0.25] * 4 + [0] * 5
# neg_ps = [0] * 11 + [0.025] * 4 + [0.5] + [0.112, 0.116, 0.096, 0.076]
neg_ps = [0] * 16 + [0.28, 0.29, 0.24, 0.19]

In [573]:
priors = []
for i in range(num_classes):
    priors.append(
        (np.sum(train_labels == i).item() + np.sum(test_labels == i).item())
        / (len(train_labels) + len(test_labels)))

In [574]:
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

In [781]:
def pick_u_data(labels, n):
    if negative_classes is None:
        selected_u = np.random.choice(len(labels), n, replace=False)
    else:
        u_idxs = np.zeros_like(labels)
        for i in range(num_classes):
            if i in positive_classes or i in negative_classes:
                u_idxs[(labels == i).numpy().astype(bool)] = 1
        u_idxs = np.argwhere(u_idxs == 1).reshape(-1)
        selected_u = np.random.choice(u_idxs, n, replace=False)
    return selected_u

In [576]:
def pick_p_data(labels, n):
    p_idxs = np.zeros_like(labels)
    for i in range(num_classes):
        if i in positive_classes:
            p_idxs[(labels == i).numpy().astype(bool)] = 1
    p_idxs = np.argwhere(p_idxs == 1).reshape(-1)
    selected_p = np.random.choice(p_idxs, n, replace=False)
    return selected_p

In [577]:
def pick_n_data(labels, n):
    n_idxs = np.zeros_like(labels)
    for i in range(num_classes):
        if negative_classes is None:
            if i not in positive_classes:
                n_idxs[(labels == i).numpy().astype(bool)] = 1
        else:
            if i in negative_classes:
                n_idxs[(labels == i).numpy().astype(bool)] = 1
    n_idxs = np.argwhere(n_idxs == 1).reshape(-1)
    selected_n = np.random.choice(n_idxs, n, replace=False)
    return selected_n

In [578]:
def pick_sn_data(labels, n):
    neg_nums = np.random.multinomial(n, neg_ps)
    print('numbers in each subclass', neg_nums)
    selected_sn = []
    for i in range(num_classes):
        if neg_nums[i] != 0:
            idxs = np.argwhere(labels == i).reshape(-1)
            selected = np.random.choice(idxs, neg_nums[i], replace=False)
            selected_sn.extend(selected)
    selected_sn = np.array(selected_sn)
    return selected_sn

In [195]:
u_idxs_v = pick_u_data(train_labels, 1200)
p_idxs_v = pick_p_data(train_labels, 100)

In [580]:
priors

[0.04239626445930171,
 0.05162899288973787,
 0.052265732781492096,
 0.05210654780855354,
 0.051098376313276024,
 0.05242491775443065,
 0.05173511620503025,
 0.05253104106972302,
 0.052849411015600124,
 0.052743287700307756,
 0.05300859598853868,
 0.0525841027273692,
 0.05221267112384591,
 0.05253104106972302,
 0.052371856096784464,
 0.05290247267324631,
 0.04828610845802823,
 0.04987795818741377,
 0.04112278467579327,
 0.0333227210018041]

In [874]:
n_p = 500
n_n = 500
u_idxs = pick_u_data(train_labels, 6000)
p_idxs = pick_p_data(train_labels, n_p)
n_idxs = pick_n_data(train_labels, n_n)
sn_idxs = pick_sn_data(train_labels, n_n)

numbers in each subclass [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 140 152
 107 101]


In [875]:
tr1_u = vectors_train[u_idxs]
tr1_p = vectors_train[p_idxs]
tr1_n = vectors_train[n_idxs]
tr1_sn = vectors_train[sn_idxs]

tr2_p = vectors_train_bi[p_idxs]
tr2_n = vectors_train_bi[n_idxs]
tr2_sn = vectors_train_bi[sn_idxs]

tr3_p = vectors_train_tri[p_idxs]
tr3_n = vectors_train_tri[n_idxs]
tr3_sn = vectors_train_tri[sn_idxs]

In [878]:
tr_l = np.concatenate([np.ones(tr1_p.shape[0]), -np.ones(tr1_sn.shape[0])])

In [79]:
tr1_uv = vectors_train[u_idxs_v]
tr1_pv = vectors_train[p_idxs_v]

In [270]:
tre_p = train_data_p[p_idxs]
tre_n = train_data_p[n_idxs]
tre_sn = train_data_p[sn_idxs]

In [144]:
tr1_p.shape

(1000, 56123)

In [145]:
tr1_sn.shape

(1000, 56123)

In [20]:
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target
p_idxs = train_labels <= 5
sn_idxs = np.logical_and(5 < train_labels, train_labels <= 10)

In [None]:
tr_l = np.concatenate([np.ones(tre_p.shape[0]), -np.ones(tre_sn.shape[0])])

In [324]:
tr_l = np.concatenate([np.ones(n), -np.ones(n)])

# CBS

In [30]:
from sklearn import preprocessing

In [31]:
def center(p_vectors, n_vectors, alpha=16, beta=4):
    p_nor = preprocessing.normalize(p_vectors)
    n_nor = preprocessing.normalize(n_vectors)
    return alpha*np.mean(p_nor, axis=0) - beta*np.mean(n_nor, axis=0)

In [32]:
def sim_cos(vecs, c):
    vecs_nor = preprocessing.normalize(vecs)
    c_nor = preprocessing.normalize(c[None, :])[0]
    return vecs_nor @ c_nor

def sim_gow(vecs, c):
    vecs_nor = preprocessing.normalize(vecs)
    c_nor = preprocessing.normalize(c[None, :])[0]
    return 1 - np.mean(np.abs(vecs_nor-c_nor), axis=1)

def sim_lor(vecs, c):
    return 1 - np.sum(np.log(1+np.abs(vecs-c)), axis=1)

def sim_dice(vecs, c, epsilon=1e-8):
    vecs_sum = np.sum(vecs**2, axis=1)
    c_sum = np.sum(c**2)
    return 2*vecs@c/(vecs_sum+c_sum+epsilon)

def sim_jac(vecs, c, epsilon=1e-8):
    vecs_sum = np.sum(vecs**2, axis=1)
    c_sum = np.sum(c**2)
    vecs_c = vecs @ c
    return vecs_c/(vecs_sum+c_sum-vecs_c+epsilon)

In [33]:
def sims(vecs, c, epsilon=1e-8):
    sims_cos = sim_cos(vecs, c)
    sims_gow = sim_gow(vecs, c)
    sims_lor = sim_lor(vecs, c)
    sims_dice = sim_dice(vecs, c, epsilon)
    sims_jac = sim_jac(vecs, c, epsilon)
    return np.vstack([sims_cos, sims_gow, sims_lor, sims_dice, sims_jac]).T

In [34]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import chi2
from sklearn.metrics import f1_score
import scipy.sparse

In [876]:
vectors_tr1 = scipy.sparse.vstack([tr1_p, tr1_sn])

In [879]:
f_score1 = chi2(vectors_tr1, tr_l)
# f_score1 = mutual_info_classif(vectors_tr1, tr_l, discrete_features=True)

In [943]:
n_select = 50

In [944]:
s1_idxs = np.argsort(-f_score1[0])[:n_select]
vectors_tr_in1 = vectors_tr1[:, s1_idxs]
vectors_te_in1 = vectors_test[:, s1_idxs][:2000]

In [945]:
clf = LinearSVC()
clf.fit(vectors_tr_in1, tr_l)
print(clf.score(vectors_te_in1, test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(vectors_te_in1))

0.7965


0.8475084301236419

In [946]:
clf = LinearSVC()
clf.fit(vectors_tr1, tr_l)
print(clf.score(vectors_test[:2000], test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(vectors_test[:2000]))

0.732


0.7757322175732217

In [951]:
tr1_s = vectors_tr_in1.toarray()
te1_s = vectors_te_in1.toarray()

In [952]:
c1 = center(tr1_s[:n], tr1_s[n:], alpha=8, beta=4)
cbs_tr1 = preprocessing.scale(sims(tr1_s, c1))
cbs_te1 = preprocessing.scale(sims(te1_s, c1))

In [953]:
clf = SVC(max_iter=10000)
clf.fit(cbs_tr1, tr_l)
print(clf.score(cbs_te1, test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(cbs_te1))

0.786


0.8381240544629349

In [887]:
from sklearn.ensemble import RandomForestClassifier

In [888]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(vectors_tr1, tr_l)
print(clf.score(vectors_test[:2000], test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(vectors_test[:2000]))

0.7005


0.7477894736842106

In [889]:
vectors_tr2 = scipy.sparse.vstack([tr2_p, tr2_sn])

In [890]:
f_score2 = chi2(vectors_tr2, tr_l)
s2_idxs = np.argsort(-f_score2[0])[:n_select]
vectors_tr_in2 = vectors_tr2[:, s2_idxs]
vectors_te_in2 = vectors_test_bi[:, s2_idxs][:2000]

In [891]:
vectors_tr3 = scipy.sparse.vstack([tr3_p, tr3_sn])

In [892]:
f_score3 = chi2(vectors_tr3, tr_l)
s3_idxs = np.argsort(-f_score3[0])[:n_select]
vectors_tr_in3 = vectors_tr2[:, s3_idxs]
vectors_te_in3 = vectors_test_bi[:, s3_idxs][:2000]

In [893]:
tr2_s = vectors_tr_in2.toarray()
te2_s = vectors_te_in2.toarray()

In [894]:
c2 = center(tr2_s[:n], tr2_s[n:], alpha=4, beta=2)
cbs_tr2 = sims(tr2_s, c2)
cbs_te2 = sims(te2_s, c2)

In [895]:
clf = LinearSVC(max_iter=5000)
clf.fit(cbs_tr2, tr_l)
print(clf.score(cbs_te2, test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(cbs_te2))

0.434


0.3887688984881209

In [896]:
tr3_s = vectors_tr_in3.toarray()
te3_s = vectors_te_in3.toarray()

In [897]:
c3 = center(tr3_s[:n], tr3_s[n:], alpha=4, beta=2)
cbs_tr3 = sims(tr3_s, c3)
cbs_te3 = sims(te3_s, c3)

In [898]:
clf = LinearSVC(max_iter=5000)
clf.fit(cbs_tr3, tr_l)
print(clf.score(cbs_te3, test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(cbs_te3))

0.368


0.24851367419738402

In [899]:
cbs_tr = np.hstack([cbs_tr1, cbs_tr2, cbs_tr3])
cbs_te = np.hstack([cbs_te1, cbs_te2, cbs_te3])

In [900]:
clf = LinearSVC(max_iter=5000)
clf.fit(cbs_tr, tr_l)
print(clf.score(cbs_te, test_target[:2000]))
f1_score(-test_target[:2000], -clf.predict(cbs_te))

0.7735


0.8210193599367838

## elmo

In [315]:
n_select = 1000

In [271]:
vectors_tre = np.vstack([tre_p, tre_sn])

In [273]:
# f_scoree = chi2(vectors_tre, tr_l)
f_scoree = mutual_info_classif(vectors_tre, tr_l, discrete_features=True)

In [316]:
s1_idxs = np.argsort(-f_scoree)[:n_select]
vectors_tr_ine = vectors_tre[:, s1_idxs]
vectors_te_ine = test_data_p[:, s1_idxs][:2000]

In [317]:
clf = LinearSVC()
clf.fit(vectors_tr_ine, tr_l)
print(clf.score(vectors_te_ine, test_target[:2000]))
f1_score(clf.predict(vectors_te_ine), test_target[:2000])

0.746


0.7814113597246127

In [272]:
clf = LinearSVC()
clf.fit(vectors_tre, tr_l)
print(clf.score(test_data_p[:2000], test_target[:2000]))
f1_score(clf.predict(test_data_p[:2000]), test_target[:2000])

0.835


0.8547535211267606

In [299]:
c = center(vectors_tre[:n], vectors_tre[n:], alpha=10, beta=4)
cbs_tr = sims(vectors_tre, c)
cbs_te = sims(test_data_p[:2000], c)

In [289]:
clf = LinearSVC()
clf.fit(cbs_tr, tr_l)
print(clf.score(cbs_te, test_target[:2000]))
f1_score(clf.predict(cbs_te), test_target[:2000])

0.574




0.7293519695044473

In [318]:
tr1_s = vectors_tr_ine
te1_s = vectors_te_ine

In [319]:
c1 = center(tr1_s[:n], tr1_s[n:], alpha=10, beta=6)
cbs_tr1 = sims(tr1_s, c1)
cbs_te1 = sims(te1_s, c1)

In [320]:
clf = LinearSVC(max_iter=5000)
clf.fit(cbs_tr1, tr_l)
print(clf.score(cbs_te1, test_target[:2000]))
f1_score(test_target[:2000], clf.predict(cbs_te1))

0.574




0.7293519695044473

## pytorch

In [326]:
class Net(nn.Module):

    def __init__(self, d_input):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(d_input, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

In [330]:
vectors_tr_p = preprocessing.scale(cbs_tr)
vectors_te_p = preprocessing.scale(cbs_te)

In [336]:
training_set = torch.utils.data.TensorDataset(
    torch.tensor(cbs_tr), torch.tensor(tr_l).unsqueeze(1))
test_set = torch.utils.data.TensorDataset(
    torch.tensor(cbs_te), torch.tensor(test_target[:2000]).unsqueeze(1))

In [337]:
model = Net(15).cuda()
cls = Classifier(model, lr=5e-3, weight_decay=1e-4)

In [338]:
cls.train(training_set, test_set, batch_size=120, num_epochs=80)

Test set: Accuracy: 57.40%
Test set: F1 Score: 72.94%
Epoch: 0  Train Loss: 1381.562126
Test set: Accuracy: 49.15%
Test set: F1 Score: 24.05%
Epoch: 1  Train Loss: 1353.100014
Test set: Accuracy: 57.25%
Test set: F1 Score: 43.19%
Epoch: 2  Train Loss: 1333.811615
Test set: Accuracy: 67.95%
Test set: F1 Score: 77.98%
Epoch: 3  Train Loss: 1319.838493
Test set: Accuracy: 60.70%
Test set: F1 Score: 74.46%
Epoch: 4  Train Loss: 1287.703045
Test set: Accuracy: 62.95%
Test set: F1 Score: 55.76%
Epoch: 5  Train Loss: 1268.075882
Test set: Accuracy: 72.30%
Test set: F1 Score: 71.44%
Epoch: 6  Train Loss: 1248.862343
Test set: Accuracy: 67.45%
Test set: F1 Score: 64.01%
Epoch: 7  Train Loss: 1229.775620
Test set: Accuracy: 80.25%
Test set: F1 Score: 82.59%
Epoch: 8  Train Loss: 1212.279110
Test set: Accuracy: 80.25%
Test set: F1 Score: 82.50%
Epoch: 9  Train Loss: 1195.412617
Test set: Accuracy: 67.70%
Test set: F1 Score: 64.43%
Epoch: 10  Train Loss: 1180.469841
Test set: Accuracy: 66.90%
Test

In [102]:
vectors_tr_p = preprocessing.scale(vectors_tr_in1.toarray())
vectors_te_p = preprocessing.scale(vectors_te_in1.toarray())

In [331]:
training_set = torch.utils.data.TensorDataset(
    torch.tensor(vectors_tr_p), torch.tensor(tr_l).unsqueeze(1))
test_set = torch.utils.data.TensorDataset(
    torch.tensor(vectors_te_p), torch.tensor(test_target[:2000]).unsqueeze(1))

In [96]:
vectors_tr_p.shape

(6390, 600)

In [99]:
model = Net(600).cuda()
cls = Classifier(model, lr=1e-3, weight_decay=1e-4)

In [100]:
cls.train(training_set, test_set, batch_size=100, num_epochs=80)

Test set: Accuracy: 58.25%
Test set: F1 Score: 49.55%
Epoch: 0  Train Loss: 2673.447735
Test set: Accuracy: 70.55%
Test set: F1 Score: 66.63%
Epoch: 1  Train Loss: 1522.633986
Test set: Accuracy: 71.45%
Test set: F1 Score: 67.50%
Epoch: 2  Train Loss: 1203.305039
Test set: Accuracy: 71.35%
Test set: F1 Score: 67.39%
Epoch: 3  Train Loss: 1039.373862
Test set: Accuracy: 71.30%
Test set: F1 Score: 67.31%
Epoch: 4  Train Loss: 936.379956
Test set: Accuracy: 71.05%
Test set: F1 Score: 67.12%
Epoch: 5  Train Loss: 863.410553
Test set: Accuracy: 70.85%
Test set: F1 Score: 66.89%
Epoch: 6  Train Loss: 810.162971
Test set: Accuracy: 70.50%
Test set: F1 Score: 66.55%
Epoch: 7  Train Loss: 767.677536
Test set: Accuracy: 70.20%
Test set: F1 Score: 66.33%
Epoch: 8  Train Loss: 734.302266
Test set: Accuracy: 70.05%
Test set: F1 Score: 66.22%
Epoch: 9  Train Loss: 705.376602
Test set: Accuracy: 69.85%
Test set: F1 Score: 66.07%
Epoch: 10  Train Loss: 681.746154
Test set: Accuracy: 70.05%
Test set: F

## PU

In [137]:
cd pu_biased_n/

/media/yu-guan/DATA/works/ens/M1/Intern/Codes/pu_biased_n


In [138]:
%run newsgroups/cbs.py

In [140]:
cbs_p1

array([[ 4.03007367e-02,  9.70770569e-01, -1.78919024e+01,
         1.28286632e-02,  6.45574085e-03],
       [ 2.50667159e-01,  9.71536787e-01, -1.75724533e+01,
         1.75214777e-01,  9.60193974e-02],
       [ 2.02123627e-01,  9.71443745e-01, -1.75493282e+01,
         1.13634864e-01,  6.02401211e-02],
       ...,
       [ 2.62283139e-01,  9.71864258e-01, -1.78045876e+01,
         1.46390508e-01,  7.89759163e-02],
       [ 3.04537705e-01,  9.72420796e-01, -1.74254894e+01,
         1.91455920e-01,  1.05861904e-01],
       [ 1.67413298e-01,  9.72481346e-01, -1.80181480e+01,
         1.35276394e-01,  7.25450105e-02]])

In [142]:
a = []
a.append(generate_cbs_features(tr1_ps, tr1_bns, alpha=8))
a.append(generate_cbs_features(tr1_ps, tr1_bns, alpha=10))

In [150]:
np.swapaxes(np.array(a), 0, 1).shape

(2, 2, 500, 5)

In [80]:
tr1_ps = tr1_p.toarray()[:, s1_idxs]
tr1_bns = tr1_sn.toarray()[:, s1_idxs]
tr1_us = tr1_u.toarray()[:, s1_idxs]
te1_s = vectors_test.toarray()[:, s1_idxs]

In [83]:
tr1_pvs = tr1_pv.toarray()[:, s1_idxs]
tr1_uvs = tr1_uv.toarray()[:, s1_idxs]

In [81]:
c1 = center(tr1_ps, tr1_bns, alpha=8, beta=4)
cbs_p1 = sims(tr1_ps, c1)
cbs_u1 = sims(tr1_us, c1)
cbs_te1 = sims(te1_s, c1)

In [84]:
cbs_pv1 = sims(tr1_pvs, c1)
cbs_uv1 = sims(tr1_uvs, c1)

In [92]:
p_set = torch.utils.data.TensorDataset(
    torch.tensor(cbs_p1))
u_set = torch.utils.data.TensorDataset(
    torch.tensor(cbs_u1))
test_set = torch.utils.data.TensorDataset(
    torch.tensor(cbs_te1), torch.tensor(test_target).unsqueeze(1))

In [95]:
from training import PUClassifier

In [96]:
class Net(nn.Module):

    def __init__(self, d_input):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(d_input, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x

In [103]:
model = Net(5).cuda()
pi = 0.56
learning_rate_cls = 1e-3
weight_decay = 1e-4
milestones = [200]
lr_d = 0.1
non_negative = True
nn_threshold = 0
nn_rate = 1
validation_momentum = 0
start_validation_epoch = 0

p_batch_size = 10
u_batch_size = 120
p_validation = torch.tensor(cbs_pv1)
u_validation = torch.tensor(cbs_uv1)
cls_training_epochs = 50
convex_epochs = 50

In [107]:
import settings

In [108]:
import importlib
importlib.reload(settings)

<module 'settings' from '/media/yu-guan/DATA/works/ens/M1/Intern/Codes/pu_biased_n/settings.py'>

In [109]:
cls = PUClassifier(
    model,
    pi=pi,
    lr=learning_rate_cls,
    weight_decay=weight_decay,
    milestones=milestones,
    lr_d=lr_d,
    nn=non_negative,
    nn_threshold=nn_threshold,
    nn_rate=nn_rate,
    validation_momentum=validation_momentum,
    start_validation_epoch=start_validation_epoch)
cls.train(
    p_set,
    u_set,
    test_set,
    p_batch_size,
    u_batch_size,
    p_validation,
    u_validation,
    cls_training_epochs,
    convex_epochs=convex_epochs)

Test set: Accuracy: 79.90%
Test set: Balanced Accuracy: 79.15%
Test set: Auc Score: 79.15%
Test set: Precision: 80.55%
Test set: Recall Score: 84.91%
Test set: F1 Score: 82.67%
Test set: False Positive Rate: 26.60%
Validation Loss: 0.5629252195358276
Epoch: 0  Train Loss: 0.427440
Test set: Accuracy: 79.54%
Test set: Balanced Accuracy: 78.48%
Test set: Auc Score: 78.48%
Test set: Precision: 79.10%
Test set: Recall Score: 86.67%
Test set: F1 Score: 82.71%
Test set: False Positive Rate: 29.71%
Validation Loss: 0.5658384561538696
Epoch: 1  Train Loss: 0.435949
Test set: Accuracy: 79.93%
Test set: Balanced Accuracy: 79.26%
Test set: Auc Score: 79.26%
Test set: Precision: 80.89%
Test set: Recall Score: 84.39%
Test set: F1 Score: 82.60%
Test set: False Positive Rate: 25.87%
Validation Loss: 0.563031017780304
Epoch: 2  Train Loss: 0.427794
Test set: Accuracy: 79.39%
Test set: Balanced Accuracy: 78.29%
Test set: Auc Score: 78.29%
Test set: Precision: 78.83%
Test set: Recall Score: 86.84%
Test 

Test set: Balanced Accuracy: 77.63%
Test set: Auc Score: 77.63%
Test set: Precision: 77.74%
Test set: Recall Score: 87.94%
Test set: F1 Score: 82.53%
Test set: False Positive Rate: 32.67%
Validation Loss: 0.5654245615005493
Epoch: 29  Train Loss: 0.376186
Test set: Accuracy: 79.38%
Test set: Balanced Accuracy: 78.27%
Test set: Auc Score: 78.27%
Test set: Precision: 78.80%
Test set: Recall Score: 86.86%
Test set: F1 Score: 82.63%
Test set: False Positive Rate: 30.32%
Validation Loss: 0.5700991153717041
Epoch: 30  Train Loss: 0.398848
Test set: Accuracy: 79.87%
Test set: Balanced Accuracy: 79.23%
Test set: Auc Score: 79.23%
Test set: Precision: 80.93%
Test set: Recall Score: 84.20%
Test set: F1 Score: 82.53%
Test set: False Positive Rate: 25.75%
Validation Loss: 0.5689064264297485
Epoch: 31  Train Loss: 0.367507
Test set: Accuracy: 79.89%
Test set: Balanced Accuracy: 79.12%
Test set: Auc Score: 79.12%
Test set: Precision: 80.48%
Test set: Recall Score: 85.00%
Test set: F1 Score: 82.68%
T