# Positive and Unlabeled Learning
### simulating unlabeled data in the breast cancer data set, then building a classifer using propensity

Uses Selected at Random Assumption and propensity. Adapted from "Machine Learning from Weak Supervision"
https://mitpress.mit.edu/9780262047074/machine-learning-from-weak-supervision/

# Load Libraries

In [41]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score,roc_auc_score


# Propensity Ops functions
from propensity_ops import estimate_propensity_scores, compute_class_weights, train_pu_model, calculate_optimal_threshold

# Set the random seed
np.random.seed(42)

# functions

In [42]:
import pandas as pd

def determine_column_type(df):
    categorical_columns = []
    continuous_columns = []

    for column in df.columns:
        if df[column].dtype == 'object':
            categorical_columns.append(column)
        elif pd.api.types.is_numeric_dtype(df[column]):
            unique_values = df[column].nunique()
            if unique_values < 10:  # Adjust this threshold as needed
                categorical_columns.append(column)
            else:
                continuous_columns.append(column)
        else:
            # Handle other data types if necessary
            pass

    return categorical_columns, continuous_columns






# Load Data

In [43]:
data=pd.read_csv('malware.csv', sep ="|")
categorical_columns, continuous_columns = determine_column_type(data)

df_categorical = data[categorical_columns]
df_continuous = data[continuous_columns]

# scale data

In [44]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the continuous DataFrame
df_continuous_scaled = pd.DataFrame(scaler.fit_transform(df_continuous), columns=df_continuous.columns)


In [45]:
y= data['legitimate']

# split 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X, y =df_continuous_scaled ,data['legitimate']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [46]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary sets (80% training, 20% temporary)
X_train, X_temp, y_train, y_temp = train_test_split(df_continuous_scaled, y, test_size=0.2, random_state=42)

# Split the temporary set into validation and testing sets (50% validation, 50% testing)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Testing set size:", X_test.shape)

Training set size: (110437, 49)
Validation set size: (13805, 49)
Testing set size: (13805, 49)


In [47]:
y_train.value_counts()

0    77474
1    32963
Name: legitimate, dtype: int64

# hide 80 % of the labels so unlabeled and negative labels are grouped together

In [48]:
mask = np.random.rand(len(X_train)) < 0.75
y_train = y_train.values
y_train[mask]=0

In [49]:

maskv = np.random.rand(len(X_val)) < 0.75
y_val = y_val
y_val[maskv]=0

In [50]:
pd.Series(y_train).value_counts()

0    102371
1      8066
dtype: int64

# Train Model

In [51]:
# Estimate propensity scores
propensity_scores = estimate_propensity_scores(X_train, y_train)

# Compute class weights
class_weights = compute_class_weights(y_train, propensity_scores)

# Train the PU model
pu_model = train_pu_model(X_train, y_train, class_weights)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
y_pred=pu_model.predict(X_test)

In [53]:
f1_score(y_test, y_pred),accuracy_score(y_test, y_pred)

(0.06643274853801169, 0.7109018471568272)

In [54]:
confusion_matrix(y_test, y_pred)

array([[9672,    0],
       [3991,  142]], dtype=int64)

In [55]:
y_probsv=pu_model.predict_proba(X_val)[:,1]

In [56]:
y_probs=pu_model.predict_proba(X_test)[:,1]


# Function to find the optimal threshold


# Find the optimal threshold


# Apply the optimal threshold to make final predictions


In [57]:


# Function to find the optimal threshold
def find_optimal_threshold(y_true, y_probs):
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1_scores = [f1_score(y_true, y_probs >= t) for t in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold, max(f1_scores)

# Find the optimal threshold
optimal_threshold, best_f1_score = find_optimal_threshold(y_val, y_probsv)
print("Optimal threshold:", optimal_threshold)
print("Best F1 score:", best_f1_score)

# Apply the optimal threshold to make final predictions
y_preds = (y_probs >= optimal_threshold).astype(int)

Optimal threshold: 0.08
Best F1 score: 0.36597049243150026


In [58]:
f1_score(y_test,y_preds)

0.9283451133317085

In [61]:
accuracy_score(y_test,y_preds)

0.9574067366896052

# AUC

In [59]:
y_probs=pu_model.predict_proba(X_test)[:,1:]

In [60]:
roc_auc_score(y_test,y_probs)

0.9853668510047535