# Objective

Demonstrate how self-training can be used to improve ML classification accuracy when the majority of the available data is unlabeled.

20% of data set aside as test set

20% of data kept as labeled train set

60% of data designated as 'unlabeled' - labels removed


# Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Obtain Data

In [2]:
# Read the heart dataset from the CSV file and preview its first rows

heart_csv = 'heart_data.csv'
df = pd.read_csv(heart_csv)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Randomly reorder every row (fixed seed for reproducibility),
# then renumber the index from 0 so positions and labels line up

shuffled = df.sample(frac=1, random_state=5)
df = shuffled.reset_index(drop=True)

In [4]:
# Preview the first rows of the shuffled frame
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0
1,57,1,0,152,274,0,1,88,1,1.2,1,1,3,0
2,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0
3,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0
4,40,1,3,140,199,0,1,178,1,1.4,2,0,3,1


In [5]:
# Split boundaries, expressed as row positions in the shuffled frame:
#   [0, test_ind)                -> test set (about 20% of rows)
#   [test_ind, train_ind)        -> labeled train set (about 20% of rows)
#   [train_ind, unlabeled_ind)   -> 'unlabeled' pool (all remaining rows, about 60%)
test_ind = round(len(df)*0.2)
train_ind = test_ind + round(len(df)*0.2)
# End the unlabeled pool at the last row. The previous formula,
# train_ind + round(len(df)*0.6) - 1, only reached the end of the frame when the
# three rounded sizes happened to overshoot by one; for other row counts it
# silently dropped trailing rows (e.g. 100 rows -> 99, losing the final row).
unlabeled_ind = len(df)

In [6]:
# Cut the shuffled frame into three consecutive row ranges using the boundaries above
test, train, unlabeled = (
    df.iloc[:test_ind],
    df.iloc[test_ind:train_ind],
    df.iloc[train_ind:unlabeled_ind],
)

In [7]:
# Preview the first rows of the test split
test.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57,0,1,130,236,0,0,174,0,0.0,1,1,2,0
1,57,1,0,152,274,0,1,88,1,1.2,1,1,3,0
2,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0
3,61,0,0,145,307,0,0,146,1,1.0,1,0,3,0
4,40,1,3,140,199,0,1,178,1,1.4,2,0,3,1


In [8]:
# Preview the first rows of the labeled train split
train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
61,55,0,0,180,327,0,2,117,1,3.4,1,0,2,0
62,43,0,0,132,341,1,0,136,1,3.0,1,0,3,0
63,53,1,2,130,246,1,0,173,0,0.0,2,3,2,1
64,46,0,0,138,243,0,0,152,1,0.0,1,0,2,1
65,48,0,2,130,275,0,1,139,0,0.2,2,0,2,1


In [9]:
# Preview the first rows of the unlabeled split (the target column is still present here)
unlabeled.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
122,51,0,2,140,308,0,0,142,0,1.5,2,1,2,1
123,66,0,0,178,228,1,1,165,1,1.0,1,2,3,0
124,66,1,0,120,302,0,0,151,0,0.4,1,0,2,1
125,50,1,0,144,200,0,0,126,1,0.9,1,0,3,0
126,57,1,0,110,201,0,1,126,1,1.5,1,0,1,1


In [10]:
# Labeled train split: known labels and the matching feature matrix
y_train = train['target']
X_train = train.drop(columns='target')

# Unlabeled pool: features only -- the target column is dropped and no labels are kept
X_unlabeled = unlabeled.drop(columns='target')

# Held-out test split: labels and features
y_test = test['target']
X_test = test.drop(columns='target')

In [11]:
# Baseline model: logistic regression fitted on the labeled train split only

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the held-out test rows
y_hat_test = clf.predict(X_test)

In [12]:
# Accuracy of the baseline model on the test split (true labels vs. predictions)
accuracy_score(y_test, y_hat_test)

0.8688524590163934

In [13]:
# Confusion matrix of true vs. predicted labels on the test split
confusion_matrix(y_test, y_hat_test)

array([[24,  6],
       [ 2, 29]], dtype=int64)

In [24]:
# Score the unlabeled pool with the baseline model:
# predicted class labels, and the per-class probabilities behind them
u_preds = clf.predict(X_unlabeled)
u_probs = clf.predict_proba(X_unlabeled)

In [25]:
# Confidence proxy: absolute gap between the first two probability columns
# (a larger gap means the model leans more strongly toward one class)
p_first, p_second = u_probs[:, 0], u_probs[:, 1]
prob_diff = np.abs(p_first - p_second)

In [28]:
# Inspect the row labels carried by the unlabeled feature matrix
X_unlabeled.index

RangeIndex(start=122, stop=303, step=1)

In [29]:
# Start from an empty frame; the prediction columns are attached afterwards
df_unlab_preds = pd.DataFrame(data=[])

In [31]:
# Attach the predicted label and its confidence gap, in that column order
df_unlab_preds = df_unlab_preds.assign(preds=u_preds, prob_diff=prob_diff)

In [33]:
# Reuse the row labels of X_unlabeled so each prediction carries the label of the row it was made for
df_unlab_preds.index = X_unlabeled.index

In [35]:
# Display the prediction table (predicted label and probability gap per unlabeled row)
df_unlab_preds

Unnamed: 0,preds,prob_diff
122,1,0.884778
123,0,0.616478
124,0,0.671977
125,1,0.087878
126,0,0.797371
...,...,...
298,1,0.980697
299,1,0.444145
300,1,0.937678
301,1,0.785090


In [58]:
# Ten unlabeled rows with the widest probability gap, largest gap first
ranked = df_unlab_preds.sort_values('prob_diff', ascending=False)
ranked.head(10)

Unnamed: 0,preds,prob_diff
161,0,0.999372
193,1,0.998358
220,0,0.996095
265,1,0.99174
268,0,0.990607
256,0,0.990589
279,1,0.990174
251,1,0.98833
180,1,0.988181
226,1,0.988064


In [70]:
# Look up the full original rows (including the true `target`) for the two
# predictions with the widest probability gap. The row labels are taken from the
# ranking itself instead of being typed in by hand, so this cell stays correct
# if the shuffle seed, the split or the model changes.
top_conf_idx = df_unlab_preds.sort_values(by='prob_diff', ascending=False).index[:2]
df.loc[top_conf_idx]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
161,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
193,34,1,3,118,182,0,0,174,0,0.0,2,0,2,1
