In [None]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier

# Load the digits dataset that ships with scikit-learn.
data = load_digits()

# Hold out half of the samples for testing. The fixed seed keeps the
# partition identical from one run to the next.
split = train_test_split(data.data, data.target,
                         test_size=0.5, random_state=1234)
X_train, X_test, y_train, y_test = split

print(X_train.shape)

In [None]:
unlabeled = 798 # Number of unlabeled instances.

# Slice assignment silently clamps an out-of-range bound, so validate the
# count explicitly. At least one instance must keep its label, otherwise
# there is nothing left to train on.
if not 0 <= unlabeled < len(y_train):
    raise ValueError(
        f"unlabeled must be in [0, {len(y_train) - 1}], got {unlabeled}"
    )

# Set first n instances as unlabeled (marked with -1).
# That is, the last instances will be labeled.
# NOTE: this overwrites y_train in place; the true labels of the first
# n instances are no longer available after this cell runs.
y_train[:unlabeled] = -1

# Check that exactly the first n instances are unlabeled.
assert (y_train[:unlabeled] == -1).all()
assert (y_train[unlabeled:] != -1).all()

# Print a compact summary instead of dumping the whole label array.
print(f"unlabeled: {unlabeled}, labeled: {len(y_train) - unlabeled}")

In [None]:
# Base learner wrapped by the self-training classifier: a 50-tree random
# forest with a fixed seed.
rf = RandomForestClassifier(random_state = 123, n_estimators = 50)

# Self-training wrapper around the forest; verbose output is enabled so the
# fitting progress is printed.
ss_model = SelfTrainingClassifier(rf, verbose = True, threshold = 0.95)

# Fit on the full training set, where unlabeled instances carry the label -1.
ss_model.fit(X_train, y_train)

In [None]:
# Evaluate the self-trained model on the held-out test split.
y_pred = ss_model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

### Not using unlabeled instances

In [None]:
# Baseline: train the same kind of random forest on the labeled instances
# only, ignoring the unlabeled ones entirely.
labeled_count = X_train.shape[0] - unlabeled

# Guard against a count that leaves no labeled rows (or an invalid negative
# `unlabeled`), which would otherwise produce a meaningless baseline.
if not 0 < labeled_count <= X_train.shape[0]:
    raise ValueError(
        f"expected between 1 and {X_train.shape[0]} labeled instances, "
        f"got {labeled_count}"
    )

print(labeled_count)

# Discard first n instances since we only want labeled instances.
# Slice from `unlabeled` onward instead of using a negative index:
# `X_train[-labeled_count:]` selects every row when labeled_count is 0
# (because -0 == 0), which would include the instances marked -1.
X_train_labeled = X_train[unlabeled:].copy()
y_train_labeled = y_train[unlabeled:].copy()

# Same hyperparameters and seed as the forest used for self-training.
rf = RandomForestClassifier(n_estimators = 50, random_state = 123)

rf.fit(X_train_labeled, y_train_labeled)

# Evaluate on the same held-out test split.
y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))