# Positive and Unlabeled Learning
### simulating unlabeled data in the breast cancer data set, then building a classifer using propensity

Uses Selected at Random Assumption and propensity. Adapted from "Machine Learning from Weak Supervision"
https://mitpress.mit.edu/9780262047074/machine-learning-from-weak-supervision/

# Load Libraries

In [1]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

# Propensity Ops functions
from propensity_ops import estimate_propensity_scores, compute_class_weights, train_pu_model, calculate_optimal_threshold

# Load Data

In [3]:
### subgroup will be defined as (df['mean radius'] > df['mean radius'].median()) & (df['mean texture'] > df['mean texture'].median())
### we will be using mean radius and mean texture as the features to define the subgroup specifically where the mean exceeds the median



# Load the breast cancer dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Create a boolean mask for the subgroup
# Adjust the conditions as needed to define your subgroup
subgroup_mask = (df['mean radius'] > df['mean radius'].median()) & (df['mean texture'] > df['mean texture'].median())

# Create a new column 'subgroup' to track the subgroup
df['subgroup'] = subgroup_mask

# Mask some of the labels from the subgroup
# Here we randomly mask 50% of the subgroup labels

In [4]:

mask = np.random.rand(len(df)) < 0.5
df.loc[subgroup_mask, 'target'] = df.loc[subgroup_mask, 'target'].mask(mask[subgroup_mask])

# Set all other labels outside the subgroup to 0
df.loc[~subgroup_mask, 'target'] = 0

# Hold back the true labels for all instances
df['true_labels_all'] = data.target

# Hold back the true labels for just the subgroup
df['true_labels_subgroup'] = df['target'].where(subgroup_mask, np.nan)

In [5]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,subgroup,true_labels_all,true_labels_subgroup
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0,False,0,
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0,False,0,
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,,True,0,
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0,False,0,
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0,False,0,


# Setup X, Y training and validation

In [None]:
# make a copy of the original DF

df_bak = df.copy()


# Define your features and target
X = df.drop(['target', 'subgroup', 'true_labels_all', 'true_labels_subgroup'], axis=1)
y = df['target'].fillna(0)

# Initialize the scaler
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

# Split the data and store the indices
indices = np.arange(len(X))
train_indices, test_indices, X_train, X_test, y_train, y_test = train_test_split(indices, X_scaled, y, test_size=0.33, random_state=42)

# Train Model

In [None]:
# Estimate propensity scores
propensity_scores = estimate_propensity_scores(X_train, y_train)

# Compute class weights
class_weights = compute_class_weights(y_train, propensity_scores)

# Train the PU model
pu_model = train_pu_model(X_train, y_train, class_weights)

# make proba preds

In [13]:
y_probs=pu_model.predict_proba(X_test)[:,1:]

In [14]:
from sklearn.metrics import confusion_matrix, classification_report
df_test=df_bak.loc[test_indices]#['subgroup']
df_sub_test=df_test[df_test['subgroup']]

In [35]:
df_test

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,subgroup,true_labels_all,true_labels_subgroup
204,12.47,18.60,81.09,481.9,0.09965,0.10580,0.08005,0.03821,0.1925,0.06373,...,0.1426,0.2378,0.2671,0.10150,0.3014,0.08750,0.0,False,1,
70,18.94,21.31,123.60,1130.0,0.09009,0.10290,0.10800,0.07951,0.1582,0.05461,...,0.1193,0.2336,0.2687,0.17890,0.2551,0.06589,,True,0,
131,15.46,19.48,101.70,748.9,0.10920,0.12230,0.14660,0.08087,0.1931,0.05796,...,0.1546,0.2394,0.3791,0.15140,0.2837,0.08019,0.0,True,0,0.0
431,12.40,17.68,81.47,467.8,0.10540,0.13160,0.07741,0.02799,0.1811,0.07102,...,0.1450,0.2629,0.2403,0.07370,0.2556,0.09359,0.0,False,1,
540,11.54,14.44,74.65,402.9,0.09984,0.11200,0.06737,0.02594,0.1818,0.06782,...,0.1345,0.2118,0.1797,0.06918,0.2329,0.08134,0.0,False,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,16.11,18.05,105.10,813.0,0.09721,0.11370,0.09447,0.05943,0.1861,0.06248,...,0.1314,0.2236,0.2802,0.12160,0.2792,0.08158,0.0,False,0,
498,18.49,17.52,121.30,1068.0,0.10120,0.13170,0.14910,0.09183,0.1832,0.06697,...,0.1412,0.3089,0.3533,0.16630,0.2510,0.09445,0.0,False,0,
7,13.71,20.83,90.20,577.9,0.11890,0.16450,0.09366,0.05985,0.2196,0.07451,...,0.1654,0.3682,0.2678,0.15560,0.3196,0.11510,,True,0,
541,14.47,24.99,95.81,656.4,0.08837,0.12300,0.10090,0.03890,0.1872,0.06341,...,0.1340,0.4202,0.4040,0.12050,0.3187,0.10230,1.0,True,1,1.0


## Find Optimal Threshold

In [15]:
optimal_threshold = calculate_optimal_threshold(y_test, y_probs)
print("Optimal threshold:", optimal_threshold)
y_preds=(y_probs>=optimal_threshold).astype(int)

Optimal threshold: 0.009736701184927221


In [16]:
y_true_sub=df_sub_test['true_labels_all']

In [17]:
y_preds_sub0=y_preds[df_test['subgroup']].astype(int)

# Confusion Matrix using Optimal Threshold

In [18]:
confusion_matrix(y_true_sub, y_preds_sub0)

array([[39, 15],
       [ 0, 10]], dtype=int64)

In [19]:
print(classification_report(y_true_sub, y_preds_sub0))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        54
           1       0.40      1.00      0.57        10

    accuracy                           0.77        64
   macro avg       0.70      0.86      0.71        64
weighted avg       0.91      0.77      0.80        64



# train out of box LR model

In [44]:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train.values.reshape(-1, 1))

y_lr_proba=clf.predict_proba(X_test)[:,1:][df_test['subgroup']]
#clf.score(X_test, y_test.values.reshape(-1, 1))


  y = column_or_1d(y, warn=True)


# AUC for PU learning model

In [43]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true_sub,y_probs[df_test['subgroup']])

0.9111111111111111

# AUC for Standard LR model

In [45]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true_sub,y_lr_proba)

0.9055555555555556

# get Optimal Threshold for LR Model

In [46]:
# Get the test instances that belong to the subgroup
X_test_subgroup = X_test[df_test['subgroup']]
y_test_subgroup = y_test[df_test['subgroup']]

# Get the predicted probabilities for the subgroup
y_probs_subgroup = clf.predict_proba(X_test_subgroup)[:, 1:]

# Calculate the optimal threshold for the subgroup
optimal_threshold_subgroup = calculate_optimal_threshold(y_test_subgroup, y_probs_subgroup)
print("Optimal threshold for subgroup:", optimal_threshold_subgroup)

# Get the predictions for the subgroup using the new threshold
y_preds_subgroup = (y_probs_subgroup >= optimal_threshold_subgroup).astype(int)

Optimal threshold for subgroup: 0.008305951046409661


# confusion Matrix for LR model

In [47]:
confusion_matrix(y_test_subgroup, y_preds_subgroup)

array([[39, 19],
       [ 0,  6]], dtype=int64)

# confusion Matrix for PU Learning model

In [18]:
confusion_matrix(y_true_sub, y_preds_sub0)

array([[39, 15],
       [ 0, 10]], dtype=int64)

#  Brief Discussion

The SAR propensity model modestly outperforms standard logistic regression, based on AUC, which translates naturally to Confusion Matrix when using optimal thresholds. However, it outperforms in an interesting and important way. The detection of positives is superior by a significant margin (~40%). 