In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Using Naive Bayes for semi-supervised classification:

In [2]:
X, y = load_digits(return_X_y=True)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

In [16]:
X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, 
                                                          y_train, 
                                                          train_size=0.10)

In [17]:
print('for training the naive based:', X_label.shape, y_label.shape)
print('for producing new data set:', X_unlabel.shape)
print('for evaluating:', X_test.shape, y_test.shape) 

for training the naive based: (116, 64) (116,)
for producing new data set: (1052, 64)
for evaluating: (629, 64) (629,)


### 1) Fitting on labeled data:

In [18]:
clf1 = GaussianNB()
clf1.fit(X=X_label, y=y_label)

### 2) Predicting for unlabeled data:

In [19]:
y_pred_unlabeled = clf1.predict(X_unlabel)

### 3) Constructing new training data set:

In [20]:
X_semi_train = np.concatenate([X_label, X_unlabel], axis=0)
y_semi_train = np.concatenate([y_label, y_pred_unlabeled], axis=0)

print(X_semi_train.shape, y_semi_train.shape)

(1168, 64) (1168,)


### 4) Training a classifier on the new data set:

In [21]:
clf2 = GaussianNB()
clf2.fit(X=X_semi_train, y=y_semi_train)

In [22]:
clf3 = Pipeline(steps=[('dimension reduction', PCA(n_components=0.95)),
                       ('classifier', RFC(n_estimators=50))])

clf3.fit(X_semi_train, y_semi_train)

### 5) Evaluating the semi-supervised model:

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
print(accuracy_score(y_true=y_test, y_pred=clf2.predict(X_test)))

0.6836248012718601


In [25]:
print(accuracy_score(y_true=y_test, y_pred=clf3.predict(X_test)))

0.8744038155802861


In [26]:
clf4 = Pipeline(steps=[('dimension reduction', PCA(n_components=0.95)),
                       ('classifier', RFC(n_estimators=50))])

clf4.fit(X_label, y_label)

print(accuracy_score(y_true=y_test, y_pred=clf4.predict(X_test)))

0.7901430842607313
