In [256]:
import pandas as pd
import numpy as np

In [257]:
def e_dist(x, y):

    sqd = np.sum(np.square(x-y), axis=1)
    cd = np.power(sqd, 0.5)

    return cd

In [258]:
def g_csfy(k, slab):

    kns = slab[:k]
    m_s = np.count_nonzero(kns == 'M')
    w_s = np.count_nonzero(kns == 'W')
    prob = max(m_s, w_s)/k

    return 'M' if m_s > w_s else 'W', prob

In [259]:
def knn_main(x, k, df_train):
    
    y = df_train.drop(['g'], axis=1).values

    labels = df_train["g"].values

    e_d = e_dist(x, y)

    clab = np.vstack((e_d, labels))

    scar = clab.T[clab.T[:, 0].argsort()]
    slab = scar.T[1]

    return g_csfy(k, slab)

In [260]:
def compute_acc(original, predicted):
        count = 0
        for i in range(len(original)):
            if original[i] == predicted[i]:
                count += 1
        return count / float(len(original)) * 100.0

## 1 datapoint

In [261]:
df = pd.read_csv('dataset.csv')
df_train = df.iloc[:20, :]
df_train = df_train.reset_index(drop = True)
df_test = df.iloc[20:, :-1]
df_test = df_test.reset_index(drop = True)
test_labels = df.iloc[20:,-1].values
k = 4

In [262]:
actual_ylabels = []

while True:
  probabilities = []
  predictions = []

  for sample in df_test.values:
      preds, probability = knn_main(sample, k, df_train)
      probabilities.append(probability)
      predictions.append(preds)

  sort_index = np.flip(np.argsort(probabilities))
  if len(sort_index)==0:
    break
  actual_ylabels.append(test_labels[sort_index[0]])
  pred = [predictions[i] for i in sort_index]

  pseudodata = df_test.iloc[sort_index[0],:].copy()
  pseudodata['g'] = pred[0]

  df_train.loc[len(df_train.index)] = pseudodata.to_list()

  # df_train = df_train.append(pseudodata, ignore_index=True)
  df_test = df_test.drop(sort_index[0], axis = 0).reset_index(drop = True)



In [263]:
y_pred = df_train.iloc[20:, -1].values

In [264]:
print(f"Accuracy for 1 datapoint in each iteration: {compute_acc(actual_ylabels, y_pred)}")

Accuracy for 1 datapoint in each iteration: 46.0


## 5 datapoints

In [265]:
df = pd.read_csv('dataset.csv')
df_train = df.iloc[:20, :]
df_train = df_train.reset_index(drop = True)
df_test = df.iloc[20:, :-1]
df_test = df_test.reset_index(drop = True)
test_labels = df.iloc[20:,-1].values
k = 4

In [266]:
actual_ylabels = []

while True:

  probabilities = []
  predictions = []

  for sample in df_test.values:
      preds, probability = knn_main(sample, k, df_train)
      probabilities.append(probability)
      predictions.append(preds)

  sort_index = np.flip(np.argsort(probabilities))
  if len(sort_index)==0:
    break
  for a in range (5):
    actual_ylabels.append(test_labels[sort_index[a]])
  pred = [predictions[i] for i in sort_index]

  pseudodata = df_test.iloc[sort_index[:5],:].copy()
  pseudodata['g'] = pred[:5]

  # df_train = df_train.append(pseudodata, ignore_index=True)
  frames = [df_train, pseudodata]
  df_train = pd.concat(frames, ignore_index=True)
  df_test = df_test.drop(sort_index[:5], axis = 0).reset_index(drop = True)

In [267]:
y_pred = df_train.iloc[20:, -1].values

In [268]:
print(f"Accuracy for 5 datapoints in each iteration: {compute_acc(actual_ylabels, y_pred)}")

Accuracy for 5 datapoints in each iteration: 49.0


## All points at a time

In [269]:
df = pd.read_csv('dataset.csv')
df_train = df.iloc[:20, :]
df_train = df_train.reset_index(drop = True)
df_test = df.iloc[20:, :-1]
df_test = df_test.reset_index(drop = True)
test_labels = df.iloc[20:,-1].values
k = 4

In [270]:
actual_ylabels = []

while True:

  probabilities = []
  predictions = []

  for sample in df_test.values:
      preds, probability = knn_main(sample, k, df_train)
      probabilities.append(probability)
      predictions.append(preds)

  sort_index = np.flip(np.argsort(probabilities))
  if len(sort_index)==0:
    break
  for a in range(100):
    actual_ylabels.append(test_labels[sort_index[a]])
  pred = [predictions[i] for i in sort_index]

  pseudodata = df_test.iloc[sort_index[:100],:].copy()
  pseudodata['g'] = pred[:100]

  frames = [df_train, pseudodata]
  df_train = pd.concat(frames, ignore_index=True)
  df_test = df_test.drop(sort_index[:100], axis = 0).reset_index(drop = True)

In [271]:
y_pred = df_train.iloc[20:, -1].values

In [272]:
print(f"Accuracy for all datapoints in first iteration: {compute_acc(actual_ylabels, y_pred)}")

Accuracy for all datapoints in first iteration: 53.0


## Own constraint setting

Here, the constraint is the votes probability of the labels of K data points nearest to the given data point. For instance, suppose a data point has 3 M and 2 W for k = 5, the probability of labels for that data point will be "0.6 for M" and "0.4 for W" and we use a threshold value to transfer the pseudo labels to the training data such that labels with probabilities more than threshold will be transferred to the training data

In [273]:
df = pd.read_csv('dataset.csv')
df_train = df.iloc[:20, :]
df_train = df_train.reset_index(drop = True)
df_test = df.iloc[20:, :-1]
df_test = df_test.reset_index(drop = True)
test_labels = df.iloc[20:,-1].values
k = 4
threshold = 0.7

In [274]:
actual_ylabels = []

while True:

    probabilities = []
    predictions = []
    indexes = []

    for sample in df_test.values:
        preds, probability = knn_main(sample, k, df_train)
        probabilities.append(probability)
        predictions.append(preds)

    indexes = np.where(np.array(probabilities) > threshold)
    if len(indexes[0]) == 0:
        break
    
    for i in indexes[0]:
      # print(test_labels[i])
      actual_ylabels.append(test_labels[i])
    

    pred = [predictions[i] for i in indexes[0]]

    pseudodata = df_test.iloc[indexes[0],:].copy()
    pseudodata['g'] = pred

    df_train = pd.concat([df_train, pseudodata]).reset_index(drop = True)
    df_test = df_test.drop(indexes[0], axis = 0).reset_index(drop = True)

In [275]:
y_pred = df_train.iloc[20:, -1].values

In [276]:
print(f"Accuracy for constrained no. of datapoints in each iteration: {compute_acc(actual_ylabels, y_pred)}")

Accuracy for constrained no. of datapoints in each iteration: 53.191489361702125


## Supervised KNN classifier

In [277]:
df = pd.read_csv('dataset.csv')
df_train = df.iloc[:20, :]
df_train = df_train.reset_index(drop = True)
df_test = df.iloc[20:, :-1]
df_test = df_test.reset_index(drop = True)
test_labels = df.iloc[20:,-1].values
k = 4

In [278]:
predictions = []
for sample in df_test.values:
    preds, probability = knn_main(sample, k, df_train)
    probabilities.append(probability)
    predictions.append(preds)

In [279]:
print(f"Accuracy for supervised KNN: {compute_acc(test_labels, predictions)}")

Accuracy for supervised KNN: 53.0


Accuracy:

For 1 datapoint in each iteration: 46.0

For 5 datapoints in each iteration: 49.0

For all datapoints in first iteration: 53.0

For constrained no. of datapoints in each iteration: 53.191489361702125

For supervised KNN: 53.0

Here, we can infer that constrained datapoints as pseudo training data give better results compared to other iteration criteria. This is because, with each iteration, only the predictions that are above the specified threshold are added for training before the next iteration of prediction and hence all the predicted data with high probability of prediction is going in training as opposed to truncated data in the 1 datapoint, 5 datapoint iterations.