In [1]:
import numpy as np 
import pandas as pd
from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
# Breast Cancer (tabular) dataset shipped with sklearn's datasets module
cancer_df = datasets.load_breast_cancer()

# Seeded generator so the labeled/unlabeled split is reproducible across runs
random = np.random.RandomState(42)

# Draw one uniform number per sample; a draw below 0.3 flags that sample as
# "unlabeled" (unlabeled points are denoted with label == -1)
rnd_unlabeled_points = random.rand(cancer_df.target.shape[0]) < 0.3

# Two independent copies of the targets: `labels_orig` keeps the original
# labels, `labels` is the working copy
labels_orig = cancer_df.target.copy()
labels = cancer_df.target.copy()


In [14]:
# Number of samples in the dataset (length of the target vector)
len(cancer_df.target)

569

In [17]:
# Boolean mask over the samples: True marks a sample selected to be unlabeled
rnd_unlabeled_points

array([False, False, False, False,  True,  True,  True, False, False,
       False,  True, False, False,  True,  True,  True, False, False,
       False,  True, False,  True,  True, False, False, False,  True,
       False, False,  True, False,  True,  True, False, False, False,
       False,  True, False, False,  True, False,  True, False,  True,
       False, False, False, False,  True, False, False, False, False,
       False, False,  True,  True,  True, False, False,  True, False,
       False,  True, False,  True, False,  True, False, False,  True,
        True, False, False, False, False,  True, False,  True, False,
       False, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False,  True,
        True,  True, False, False, False, False,  True, False, False,
        True,  True,  True,  True, False, False, False, False, False,
        True, False, False, False, False, False,  True,  True, False,
       False, False,

In [3]:
# Working copy of the targets, displayed before any entry is set to -1
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [4]:
# Overwrite the selected entries of the working copy with -1 (the "unlabeled"
# marker). This mutates `labels` in place; `labels_orig` is a separate copy and
# is not affected.
labels[rnd_unlabeled_points] = -1

In [5]:
# Working copy after masking: entries selected by rnd_unlabeled_points are now -1
labels

array([ 0,  0,  0,  0, -1, -1, -1,  0,  0,  0, -1,  0,  0, -1, -1, -1,  0,
        0,  0, -1,  1, -1, -1,  0,  0,  0, -1,  0,  0, -1,  0, -1, -1,  0,
        0,  0,  0, -1,  0,  0, -1,  0, -1,  0, -1,  0,  1,  0,  1, -1,  1,
        1,  1,  0,  0,  1, -1, -1, -1,  1,  1, -1,  0,  1, -1,  0, -1,  1,
       -1,  1,  0, -1, -1,  0,  1,  0,  1, -1,  0, -1,  1,  1,  0, -1,  1,
        0,  0,  0,  1,  1, -1,  0,  1,  1,  0,  0,  1,  1, -1, -1, -1,  1,
        1,  1,  1, -1,  1,  1, -1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  0,
        0,  1,  0,  0, -1, -1,  1,  0,  0, -1,  0,  1, -1, -1,  1,  0,  0,
        1,  1,  0,  1,  1, -1,  1,  1, -1, -1,  0,  1, -1, -1,  1, -1, -1,
        1,  1, -1,  0,  1, -1,  1,  1,  0,  0,  1, -1,  1,  1, -1, -1,  1,
        1, -1,  0, -1,  1, -1,  1,  0,  1, -1,  0, -1,  0,  1, -1,  1,  0,
        1,  1, -1, -1,  1,  1,  0,  0,  1,  0,  0,  0,  0,  1, -1, -1,  0,
        1, -1, -1,  0, -1, -1,  0,  1,  0, -1,  0, -1,  1,  1,  0,  0,  1,
        1, -1,  0, -1, -1

In [6]:
X = cancer_df.data
unlabeled = labels[labels == -1]

# Specify model. With kernel='knn' the affinity graph is built from the
# n_neighbors nearest neighbours; gamma is only read by the 'rbf' kernel, so it
# is not passed here (passing it had no effect on the fitted model).
model = LabelPropagation(kernel='knn', n_neighbors=5, max_iter=2000)

# Fit on every sample; entries of `labels` equal to -1 are the unlabeled points
model.fit(X, labels)

# Make predictions on originally unlabeled data and fetch their held-back truth
predicted_labels = model.predict(X[rnd_unlabeled_points])
true_labels = labels_orig[rnd_unlabeled_points]

# Display classification report and confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=model.classes_)
print(classification_report(true_labels, predicted_labels))
print("Confusion matrix")
print(cm)

              precision    recall  f1-score   support

           0       0.97      0.88      0.92        74
           1       0.91      0.98      0.94        95

    accuracy                           0.93       169
   macro avg       0.94      0.93      0.93       169
weighted avg       0.94      0.93      0.93       169

Confusion matrix
[[65  9]
 [ 2 93]]


In [7]:
# The entries of `labels` that equal -1, as selected by `labels[labels==-1]`
unlabeled 

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])

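The RBF kernel has a parameter gamma (γ) that determines the complexity of the decision boundary: a larger γ makes the similarity between two points decay faster with distance, giving a more local, more complex boundary, while a smaller γ gives a smoother one. By tuning this parameter, we can adjust the trade-off between bias and variance. Note that γ is only used when `kernel='rbf'`; the KNN kernel is controlled by `n_neighbors` instead.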

In [10]:
import warnings
warnings.filterwarnings("ignore") 

In [12]:
# Define the parameter grid for the hyperparameters to tune.
# `gamma` is only read by the RBF kernel and `n_neighbors` only by the KNN
# kernel, so each kernel gets its own sub-grid. Crossing all three lists in a
# single dict would refit identical models many times over.
param_grid = [
    {'kernel': ['rbf'],
     'gamma': [0.1, 0.5, 1.0, 5.0, 10, 20, 30, 50, 100]},
    {'kernel': ['knn'],
     'n_neighbors': [4, 5, 6, 7, 8]},
]


def labeled_accuracy(estimator, X_val, y_val):
    """Accuracy on the labeled part of a validation fold.

    Validation targets still contain -1 for unlabeled samples. The fitted model
    only predicts real classes, so with plain 'accuracy' each of those samples
    would be counted as a misclassification. They are excluded here.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict``.
    X_val : array of validation features.
    y_val : array of validation targets, with -1 marking unlabeled samples.

    Returns
    -------
    float
        Fraction of labeled validation samples predicted correctly, or NaN if
        the fold contains no labeled sample.
    """
    y_val = np.asarray(y_val)
    known = y_val != -1
    if not known.any():
        return np.nan
    return accuracy_score(y_val[known], estimator.predict(X_val[known]))


# Specify LPA model
model = LabelPropagation(max_iter=4000)

# Create GridSearchCV object (scoring callable has signature (estimator, X, y))
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid, cv=5,
                           scoring=labeled_accuracy)

# Fit the GridSearchCV object to training data (-1 entries act as unlabeled)
grid_search.fit(X, labels)

# Use the best model to make predictions on the unlabeled data points
y_pred = grid_search.best_estimator_.predict(X[rnd_unlabeled_points])

# Calculate the accuracy of the predictions against the held-back true labels
accuracy = accuracy_score(true_labels, y_pred)


In [13]:
# Report the accuracy on the originally unlabeled points as a percentage
print("Accuracy: {:.2f}%".format(100 * accuracy))

Accuracy: 92.31%
