# Objective

Demonstrate how self-training can be used to improve ML classification accuracy when the majority of the available data is unlabeled.

20% of data set aside as test set

20% of data kept as labeled train set

60% of data designated as 'unlabeled' - labels removed


# Imports

In [85]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Obtain Data

In [56]:
# Load data: read the heart dataset from a CSV in the working directory,
# then preview the first rows to check that the columns parsed as expected.

df = pd.read_csv('heart_data.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [57]:
# Randomly permute all rows (frac=1 keeps every row; the fixed seed makes the
# ordering reproducible), then renumber the index from 0 so that positional
# and label-based row numbers agree again.

df = df.sample(frac=1, random_state=5)
df = df.reset_index(drop=True)

In [58]:
# Preview the first rows after shuffling.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0
1,57,1,0,152,274,0,1,88,1,1.2,1,1,3,0
2,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0
3,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0
4,40,1,3,140,199,0,1,178,1,1.4,2,0,3,1


In [59]:
# Split boundaries, as positional row offsets into the shuffled frame:
#   [0, test_ind)              -> 20% held-out test set
#   [test_ind, train_ind)      -> 20% labeled training set
#   [train_ind, unlabeled_ind) -> all remaining rows, treated as unlabeled
n_rows = len(df)
test_ind = round(n_rows * 0.2)
train_ind = test_ind + round(n_rows * 0.2)
# The unlabeled pool is "everything that is left". Deriving its end from
# round(n_rows * 0.6) - 1 depends on how three independent roundings add up
# and silently drops the final row for some sizes (e.g. 100 rows -> end 99).
unlabeled_ind = n_rows

In [60]:
# Carve the shuffled frame into three consecutive row ranges delimited by the
# split boundaries: test first, then labeled train, then the unlabeled pool.
cut_points = [0, test_ind, train_ind, unlabeled_ind]
test, train, unlabeled = (
    df.iloc[start:stop] for start, stop in zip(cut_points, cut_points[1:])
)

In [69]:
# Preview the held-out test split.
test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0
1,57,1,0,152,274,0,1,88,1,1.2,1,1,3,0
2,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0
3,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0
4,40,1,3,140,199,0,1,178,1,1.4,2,0,3,1


In [70]:
# Preview the labeled training split.
train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
61,55,0,0,180,327,0,2,117,1,3.4,1,0,2,0
62,43,0,0,132,341,1,0,136,1,3.0,1,0,3,0
63,53,1,2,130,246,1,0,173,0,0.0,2,3,2,1
64,46,0,0,138,243,0,0,152,1,0.0,1,0,2,1
65,48,0,2,130,275,0,1,139,0,0.2,2,0,2,1


In [71]:
# Preview the unlabeled pool (the 'target' column is still present here; it is
# dropped when the feature matrix is built).
unlabeled.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
122,51,0,2,140,308,0,0,142,0,1.5,2,1,2,1
123,66,0,0,178,228,1,1,165,1,1.0,1,2,3,0
124,66,1,0,120,302,0,0,151,0,0.4,1,0,2,1
125,50,1,0,144,200,0,0,126,1,0.9,1,0,3,0
126,57,1,0,110,201,0,1,126,1,1.5,1,0,1,1


In [82]:
# Separate the predictors from the 'target' label for each split.
# The unlabeled pool keeps only its features: its labels are deliberately
# never read, which is what makes it "unlabeled" for this experiment.
X_test, y_test = test.drop(columns='target'), test['target']
X_train, y_train = train.drop(columns='target'), train['target']
X_unlabeled = unlabeled.drop(columns='target')

In [83]:
# Baseline classifier: logistic regression fitted on the labeled train split
# only, then used to predict labels for the held-out test split.
# NOTE(review): max_iter=1000 is presumably raised above the library default
# so the solver converges on these unscaled features - confirm.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_hat_test = clf.predict(X_test)

In [84]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_hat_test)

0.8688524590163934

In [89]:
# Per-class breakdown of the test predictions: rows correspond to true labels
# and columns to predicted labels (scikit-learn convention).
confusion_matrix(y_test, y_hat_test)

array([[24,  6],
       [ 2, 29]], dtype=int64)

In [90]:
# Score the unlabeled pool with the fitted classifier. predict_proba yields
# one row per sample and one column per class, so each row sums to 1.
u_preds = clf.predict_proba(X_unlabeled)

In [91]:
# Display the class-probability matrix computed for the unlabeled rows.
u_preds

array([[5.76109273e-02, 9.42389073e-01],
       [8.08238825e-01, 1.91761175e-01],
       [8.35988409e-01, 1.64011591e-01],
       [4.56060963e-01, 5.43939037e-01],
       [8.98685684e-01, 1.01314316e-01],
       [4.33264140e-01, 5.66735860e-01],
       [8.25426826e-01, 1.74573174e-01],
       [4.97616251e-01, 5.02383749e-01],
       [5.46514067e-01, 4.53485933e-01],
       [1.72168481e-02, 9.82783152e-01],
       [8.84392868e-01, 1.15607132e-01],
       [2.12696865e-01, 7.87303135e-01],
       [3.46941954e-02, 9.65305805e-01],
       [1.99409969e-01, 8.00590031e-01],
       [3.26069357e-01, 6.73930643e-01],
       [6.03058579e-02, 9.39694142e-01],
       [4.90802604e-02, 9.50919740e-01],
       [9.94001361e-01, 5.99863915e-03],
       [1.30524067e-01, 8.69475933e-01],
       [9.32925743e-01, 6.70742568e-02],
       [1.40112875e-01, 8.59887125e-01],
       [1.99761834e-01, 8.00238166e-01],
       [6.97196122e-02, 9.30280388e-01],
       [9.02971330e-01, 9.70286698e-02],
       [1.418085