# Positive and Unlabeled Learning
### simulating unlabeled data in the breast cancer data set, then building a classifer using propensity

Uses Selected at Random Assumption and propensity. Adapted from "Machine Learning from Weak Supervision"
https://mitpress.mit.edu/9780262047074/machine-learning-from-weak-supervision/

# Load Libraries

In [41]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,roc_auc_score


# Propensity Ops functions
from propensity_ops import estimate_propensity_scores, compute_class_weights, train_pu_model, calculate_optimal_threshold

# Set the random seed
np.random.seed(42)

# functions

In [42]:
import pandas as pd

def determine_column_type(df):
    categorical_columns = []
    continuous_columns = []

    for column in df.columns:
        if df[column].dtype == 'object':
            categorical_columns.append(column)
        elif pd.api.types.is_numeric_dtype(df[column]):
            unique_values = df[column].nunique()
            if unique_values < 10:  # Adjust this threshold as needed
                categorical_columns.append(column)
            else:
                continuous_columns.append(column)
        else:
            # Handle other data types if necessary
            pass

    return categorical_columns, continuous_columns






# Load Data

In [43]:
data=pd.read_csv('malware.csv', sep ="|")
categorical_columns, continuous_columns = determine_column_type(data)

df_categorical = data[categorical_columns]
df_continuous = data[continuous_columns]

# scale data

In [44]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the continuous DataFrame
df_continuous_scaled = pd.DataFrame(scaler.fit_transform(df_continuous), columns=df_continuous.columns)


In [45]:
y= data['legitimate']

# split 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X, y =df_continuous_scaled ,data['legitimate']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [46]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary sets (80% training, 20% temporary)
X_train, X_temp, y_train, y_temp = train_test_split(df_continuous_scaled, y, test_size=0.2, random_state=42)

# Split the temporary set into validation and testing sets (50% validation, 50% testing)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Testing set size:", X_test.shape)

Training set size: (110437, 49)
Validation set size: (13805, 49)
Testing set size: (13805, 49)


In [47]:
y_train.value_counts()

0    77474
1    32963
Name: legitimate, dtype: int64

# hide 80 % of the labels so unlabeled and negative labels are grouped together

In [48]:
mask = np.random.rand(len(X_train)) < 0.75
y_train = y_train.values
y_train[mask]=0

In [49]:

maskv = np.random.rand(len(X_val)) < 0.75
y_val = y_val
y_val[maskv]=0

In [50]:
pd.Series(y_train).value_counts()

0    102371
1      8066
dtype: int64

### subgroup will be defined as (df['mean radius'] > df['mean radius'].median()) & (df['mean texture'] > df['mean texture'].median())
### we will be using mean radius and mean texture as the features to define the subgroup specifically where the mean exceeds the median



# Load the breast cancer dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Create a boolean mask for the subgroup
# Adjust the conditions as needed to define your subgroup
subgroup_mask = (df['mean radius'] > df['mean radius'].median()) & (df['mean texture'] > df['mean texture'].median())

# Create a new column 'subgroup' to track the subgroup
df['subgroup'] = subgroup_mask

# Mask some of the labels from the subgroup
# Here we randomly mask 50% of the subgroup labels


mask = np.random.rand(len(df)) < 0.5
df.loc[subgroup_mask, 'target'] = df.loc[subgroup_mask, 'target'].mask(mask[subgroup_mask])

# Set all other labels outside the subgroup to 0
df.loc[~subgroup_mask, 'target'] = 0

# Hold back the true labels for all instances
df['true_labels_all'] = data.target

# Hold back the true labels for just the subgroup
df['true_labels_subgroup'] = df['target'].where(subgroup_mask, np.nan)

df.head()

# Setup X, Y training and validation

# make a copy of the original DF

df_bak = df.copy()


# Define your features and target
X = df.drop(['target', 'subgroup', 'true_labels_all', 'true_labels_subgroup'], axis=1)
y = df['target'].fillna(0)

# Initialize the scaler
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

# Split the data and store the indices
indices = np.arange(len(X))
train_indices, test_indices, X_train, X_test, y_train, y_test = train_test_split(indices, X_scaled, y, test_size=0.33, random_state=42)

# Train Model

In [51]:
# Estimate propensity scores
propensity_scores = estimate_propensity_scores(X_train, y_train)

# Compute class weights
class_weights = compute_class_weights(y_train, propensity_scores)

# Train the PU model
pu_model = train_pu_model(X_train, y_train, class_weights)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
y_pred=pu_model.predict(X_test)

In [53]:
f1_score(y_test, y_pred),accuracy_score(y_test, y_pred)

(0.06643274853801169, 0.7109018471568272)

In [54]:
confusion_matrix(y_test, y_pred)

array([[9672,    0],
       [3991,  142]], dtype=int64)

In [55]:
y_probsv=pu_model.predict_proba(X_val)[:,1]

In [56]:
y_probs=pu_model.predict_proba(X_test)[:,1]


# Function to find the optimal threshold
def find_optimal_threshold(y_true, y_probs):
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1_scores = [f1_score(y_true, y_probs >= t) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold, max(f1_scores)

# Find the optimal threshold
optimal_threshold, best_f1_score = find_optimal_threshold(y_test, y_probs)
print("Optimal threshold:", optimal_threshold)
print("Best F1 score:", best_f1_score)

# Apply the optimal threshold to make final predictions
y_preds = (y_probs >= optimal_threshold).astype(int)

In [57]:


# Function to find the optimal threshold
def find_optimal_threshold(y_true, y_probs):
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1_scores = [f1_score(y_true, y_probs >= t) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold, max(f1_scores)

# Find the optimal threshold
optimal_threshold, best_f1_score = find_optimal_threshold(y_val, y_probsv)
print("Optimal threshold:", optimal_threshold)
print("Best F1 score:", best_f1_score)

# Apply the optimal threshold to make final predictions
y_preds = (y_probs >= optimal_threshold).astype(int)

Optimal threshold: 0.08
Best F1 score: 0.36597049243150026


In [58]:
f1_score(y_test,y_preds)

0.9283451133317085

In [61]:
accuracy_score(y_test,y_preds)

0.9574067366896052

# AUC

In [59]:
y_probs=pu_model.predict_proba(X_test)[:,1:]

In [60]:
roc_auc_score(y_test,y_probs)

0.9853668510047535

# make proba preds

In [26]:
y_probs=pu_model.predict_proba(X_test)[:,1:]

In [27]:
from sklearn.metrics import confusion_matrix, classification_report
df_test=df_bak.loc[test_indices]#['subgroup']
df_sub_test=df_test[df_test['subgroup']]

NameError: name 'df_bak' is not defined

In [28]:
df_test

NameError: name 'df_test' is not defined

## Find Optimal Threshold

In [29]:
optimal_threshold = calculate_optimal_threshold(y_test, y_probs)
print("Optimal threshold:", optimal_threshold)
y_preds=(y_probs>=optimal_threshold).astype(int)

Optimal threshold: 0.050754192309385875


In [30]:
y_true_sub=df_sub_test['true_labels_all']

NameError: name 'df_sub_test' is not defined

In [31]:
y_preds_sub0=y_preds[df_test['subgroup']].astype(int)

NameError: name 'df_test' is not defined

# Confusion Matrix using Optimal Threshold

In [32]:
confusion_matrix(y_true_sub, y_preds_sub0)

NameError: name 'y_true_sub' is not defined

In [33]:
print(classification_report(y_true_sub, y_preds_sub0))

NameError: name 'y_true_sub' is not defined

# train out of box LR model

In [34]:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train, y_train.values.reshape(-1, 1))

y_lr_proba=clf.predict_proba(X_test)[:,1:][df_test['subgroup']]
#clf.score(X_test, y_test.values.reshape(-1, 1))


AttributeError: 'numpy.ndarray' object has no attribute 'values'

# AUC for PU learning model

In [43]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true_sub,y_probs[df_test['subgroup']])

0.9111111111111111

# AUC for Standard LR model

In [45]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true_sub,y_lr_proba)

0.9055555555555556

# get Optimal Threshold for LR Model

In [35]:
# Get the test instances that belong to the subgroup
X_test_subgroup = X_test[df_test['subgroup']]
y_test_subgroup = y_test[df_test['subgroup']]

# Get the predicted probabilities for the subgroup
y_probs_subgroup = clf.predict_proba(X_test_subgroup)[:, 1:]

# Calculate the optimal threshold for the subgroup
optimal_threshold_subgroup = calculate_optimal_threshold(y_test_subgroup, y_probs_subgroup)
print("Optimal threshold for subgroup:", optimal_threshold_subgroup)

# Get the predictions for the subgroup using the new threshold
y_preds_subgroup = (y_probs_subgroup >= optimal_threshold_subgroup).astype(int)

NameError: name 'df_test' is not defined

# confusion Matrix for LR model

In [47]:
confusion_matrix(y_test_subgroup, y_preds_subgroup)

array([[39, 19],
       [ 0,  6]], dtype=int64)

# confusion Matrix for PU Learning model

In [18]:
confusion_matrix(y_true_sub, y_preds_sub0)

array([[39, 15],
       [ 0, 10]], dtype=int64)

#  Brief Discussion

The SAR propensity model modestly outperforms standard logistic regression, based on AUC, which translates naturally to Confusion Matrix when using optimal thresholds. However, it outperforms in an interesting and important way. The detection of positives is superior by a significant margin (~40%). 