In [1]:
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix, f1_score, recall_score, precision_score

$$\text{Evaluate Results}$$

In [2]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print(f"f1: {f1 * 100.0:.2f}%") 
    rec = recall_score(y_test, y_predict, average='binary')
    print(f"recall: {rec * 100.0:.2f}%") 
    prc = precision_score(y_test, y_predict, average='binary')
    print(f"precision: {prc * 100.0:.2f}%" )

In [3]:
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
columns = ['sepal length in cm', 
           'sepal width in cm', 
           'petal length in cm', 
           'petal width in cm',
           'class']

In [4]:
df = pd.read_csv(path, names = columns)
df.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
df['class'].value_counts()

Iris-virginica     50
Iris-versicolor    50
Iris-setosa        50
Name: class, dtype: int64

In [6]:
df['class'] = df['class'].apply(lambda x: 1 if x == 'Iris-virginica' else 0)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['class'], axis=1), df['class'], test_size = 0.33, random_state = 42)

In [8]:
clf = GradientBoostingClassifier(random_state = 42)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

evaluate_results(y_test, y_test_pred)

Classification results:
f1: 96.77%
recall: 93.75%
precision: 100.00%


In [9]:
f_list, r_list, p_list = [], [], []

f_list.append(f1_score(y_test, y_test_pred))
r_list.append(recall_score(y_test, y_test_pred))
p_list.append(precision_score(y_test, y_test_pred))


In [11]:
mod_data = X_train.copy()
mod_data['label'] = y_train

pos_indx = mod_data.loc[mod_data['label'] == 1, :].index
pos_indx = np.random.permutation(pos_indx)

known_labels_ratio = 0.3
pos_sample_len = int(known_labels_ratio * len(pos_indx))

print(f'Using {pos_sample_len}/{len(pos_indx)} as positives and unlabeling the rest')
pos_sample = pos_indx[:pos_sample_len]

Using 10/34 as positives and unlabeling the rest


In [12]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    90
 1    10
Name: class_test, dtype: int64


In [13]:
mod_data = mod_data.sample(frac = 1)

data_N = mod_data.loc[mod_data['class_test'] == -1]
data_P = mod_data.loc[mod_data['class_test'] == 1]

neg_sample = data_N.iloc[:data_P.shape[0]]
pos_sample = data_P.copy()
#sample_test = data_N.iloc[data_P.shape[0]:]
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

In [14]:
clf_rns = GradientBoostingClassifier(random_state = 42)

sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

clf_rns.fit(sample_train.drop(columns=['class_test', 'label']), 
          sample_train['class_test'])

y_test_pred = clf_rns.predict(X_test)
evaluate_results(y_test, y_test_pred)


Classification results:
f1: 85.71%
recall: 75.00%
precision: 100.00%


In [15]:
f_list.append(f1_score(y_test, y_test_pred))
r_list.append(recall_score(y_test, y_test_pred))
p_list.append(precision_score(y_test, y_test_pred))

In [16]:
results = pd.DataFrame({'f1': f_list, 'recall': r_list, 'precision': p_list}, columns = ['f1', 'recall', 'precision'] ,index = ['Supervised-learning', 'PU-learning'])
results

Unnamed: 0,f1,recall,precision
Supervised-learning,0.967742,0.9375,1.0
PU-learning,0.857143,0.75,1.0
