**Part 1: Implementing nearest neighbour**

**First, for Iris dataset**

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [3]:
iris_data = load_iris()
print(iris_data.feature_names)
print(iris_data.target_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(iris_data.data,iris_data.target,random_state = 2810)

In [5]:
# a function to calculate euclidean distance
def E_len(x,y):
    sum = 0
    for i in range(len(x)):
        sum = sum +(x[i]-y[i])**2
    norm = sum**0.5
    return norm

In [8]:
y_pred = np.zeros(len(X_test))
a = np.zeros(len(X_train))
for i in range(len(X_test)):
    for j in range(len(X_train)):
        a[j] = E_len(X_train[j], X_test[i])
    pos = np.argmin(a)
    y_pred[i] = y_train[pos] 
test_error_rate = np.mean(y_pred != y_test)
no_of_errors = np.sum(y_pred!=y_test)

In [9]:
print('The Test Error rate is: ',test_error_rate)
print('Number of Errors: ',no_of_errors)

The Test Error rate is:  0.07894736842105263
Number of Errors:  3


**applying nearest neighbor to Ionosphere dataset**

In [11]:
data = np.genfromtxt('/home/smith/Downloads/ionosphere.txt', delimiter =',')
X_train, X_test, y_train, y_test = train_test_split(data[:,:34],data[:,-1], random_state = 2810)
y_pred = np.zeros(len(X_test))
a = np.zeros(len(X_train))
for i in range(len(X_test)):
    for j in range(len(X_train)):
        a[j] = E_len(X_train[j], X_test[i])
    pos = np.argmin(a)
    y_pred[i] = y_train[pos] 
test_error_rate = np.mean(y_pred != y_test)
no_of_errors = np.sum(y_pred!=y_test)

In [12]:
print('The Test Error rate is: ',test_error_rate)
print('Number of Errors: ',no_of_errors)

The Test Error rate is:  0.10227272727272728
Number of Errors:  9


**part 2: Implementation of conformal predictors based on nearest neighbour**

**first, for Iris dataset**

In [20]:
from math import inf

In [21]:
X_train, X_test,y_train,y_test = train_test_split(iris_data.data,iris_data.target,random_state = 2810)

In [22]:
def calc_pvalue(aug_set,aug_targets):
    for i in range(len(aug_set)):
        a = np.array([])
        c = np.array([])
        for j in range(len(aug_set)):
            if (i!=j) & (aug_targets[i] == aug_targets[j]):
                b = (np.sum((aug_set[i]-aug_set[j])**2))**0.5
                a = np.append(a,b)
            elif (i!=j) & (aug_targets[i] != aug_targets[j]):
                b = (np.sum((aug_set[i]-aug_set[j])**2))**0.5
                c = np.append(c,b)       
        if (np.min(c)==0)&(np.min(a)==0):
            conformity_scores[i] = 0
        elif np.min(a)==0:
            conformity_scores[i] = inf
        else:
            conformity_scores[i] = np.min(c)/np.min(a)

    test_conformity = conformity_scores[-1]
    rank = np.sum(test_conformity >= conformity_scores)
    p = rank/len(aug_set)
    return p

In [24]:
labels = np.unique(y_train)
p_values =np.zeros((len(X_test), len(labels)))
pred_targets = np.zeros(len(X_test))
conformity_scores = np.zeros(len(X_train)+1)
for i in range(len(X_test)):
    augmented_set = np.append(X_train, [X_test[i]],axis = 0)
    for j in range(len(labels)):
        augmented_targets = np.append(y_train, labels[j])
        p_values[i][j] = calc_pvalue(augmented_set,augmented_targets)
for i in range(len(p_values)):
    pred_targets[i] = labels[np.argmax(p_values[i])]
sum = 0    
for i in range(len(p_values)):
    k = int(pred_targets[i])
    for j in range(len(labels)):
        if p_values[i,k]!= p_values[i,j]:
            sum =sum+ p_values[i,j]
avg_false_p = sum/(len(p_values)*(len(labels)-1))
print('The average False_Pvalue is: ',avg_false_p)

The average False_Pvalue is:  0.009548206800186318


In [25]:
print('The Test error rate is :',np.mean(pred_targets != y_test))
print('Number of Misclassification :',np.sum(pred_targets!=y_test))
print('Success rate is :',np.mean(pred_targets == y_test))

The Test error rate is : 0.07894736842105263
Number of Misclassification : 3
Success rate is : 0.9210526315789473


Validity : Success rate on test set is 92.10%, which means our conformal prediction model is valid about 90% on unseen data

**applying conformal prediction to Ionosphere dataset**

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data[:,:34],data[:,-1], random_state = 2810)

In [28]:
labels = np.unique(y_train)
p_values =np.zeros((len(X_test), len(labels)))
pred_targets = np.zeros(len(X_test))
conformity_scores = np.zeros(len(X_train)+1)
for i in range(len(X_test)):
    augmented_set = np.append(X_train, [X_test[i]],axis = 0)
    for j in range(len(labels)):
        augmented_targets = np.append(y_train, labels[j])
        p_values[i][j] = calc_pvalue(augmented_set,augmented_targets)
for i in range(len(p_values)):
    pred_targets[i] = labels[np.argmax(p_values[i])]
sum = 0    
for i in range(len(p_values)):
    k = int(pred_targets[i])
    for j in range(len(labels)):
        if p_values[i,k]!= p_values[i,j]:
            sum =sum+ p_values[i,j]
avg_false_p = sum/(len(p_values)*(len(labels)-1))
print('The average False_Pvalue is: ',avg_false_p)

The average False_Pvalue is:  0.09999139118457302


In [29]:
print('The Test error rate is :',np.mean(pred_targets != y_test))
print('Number of Misclassification :',np.sum(pred_targets!=y_test))
print('Success rate is :',np.mean(pred_targets == y_test))

The Test error rate is : 0.10227272727272728
Number of Misclassification : 9
Success rate is : 0.8977272727272727


Validity : Success rate on test set is 89.77%, which means our conformal prediction model is valid more than 85% on unseen data

**Justifying convention for 0/0**


 
 We can only get 0/0 if nearest distance with in the same class as well as the nearest distance to a different class 0. If this happens that would mean two test sample with different labels, which is a very strange case. Assigning 0 to this convention would mean having rank 1 and hence the least P-value which singnifies the strangeness of the sample.

**using different conformity measures**

**implementing nearest distance within class conformity measure for iris dataset**

In [35]:
X_train, X_test,y_train,y_test = train_test_split(iris_data.data,iris_data.target,random_state = 2810)

In [37]:
def calc_pvalue_2(aug_set,aug_targets):
    for i in range(len(aug_set)):
        a = np.array([])
        for j in range(len(aug_set)):
            if (i!=j) & (aug_targets[i] == aug_targets[j]):
                b = (np.sum((aug_set[i]-aug_set[j])**2))**0.5
                a = np.append(a,b)
        if np.min(a)==0:
            conformity_scores[i] = inf
        else:
            conformity_scores[i] = 1/np.min(a)
    test_conformity = conformity_scores[-1]
    rank = np.sum(test_conformity >= conformity_scores)
    p = rank/len(aug_set)
    return p
         

In [38]:
labels = np.unique(y_train)
p_values =np.zeros((len(X_test), len(labels)))
pred_targets = np.zeros(len(X_test))
conformity_scores = np.zeros(len(X_train)+1)
for i in range(len(X_test)):
    augmented_set = np.append(X_train, [X_test[i]],axis = 0)
    for j in range(len(labels)):
        augmented_targets = np.append(y_train, labels[j])
        p_values[i][j] = calc_pvalue_2(augmented_set,augmented_targets)
for i in range(len(p_values)):
    pred_targets[i] = labels[np.argmax(p_values[i])]
sum = 0    
for i in range(len(p_values)):
    k = int(pred_targets[i])
    for j in range(len(labels)):
        if p_values[i,k]!= p_values[i,j]:
            sum =sum+ p_values[i,j]
avg_false_p = sum/(len(p_values)*(len(labels)-1))
print('The average False_Pvalue is: ',avg_false_p)

The average False_Pvalue is:  0.021541686073591068


In [39]:
print('The Test error rate is :',np.mean(pred_targets != y_test))
print('Number of Misclassification :',np.sum(pred_targets!=y_test))
print('Success rate is :',np.mean(pred_targets == y_test))

The Test error rate is : 0.07894736842105263
Number of Misclassification : 3
Success rate is : 0.9210526315789473


**implementing nearest distance to a different class conformity measure for iris dataset**

In [41]:
def calc_pvalue_3(aug_set,aug_targets):
    for i in range(len(aug_set)):
        c = np.array([])
        for j in range(len(aug_set)):
            if (i!=j) & (aug_targets[i] != aug_targets[j]):
                b = (np.sum((aug_set[i]-aug_set[j])**2))**0.5
                c = np.append(c,b)       
        conformity_scores[i] = np.min(c)
    test_conformity = conformity_scores[-1]
    rank = np.sum(test_conformity >= conformity_scores)
    p = rank/len(aug_set)
    return p
         

In [42]:
labels = np.unique(y_train)
p_values =np.zeros((len(X_test), len(labels)))
pred_targets = np.zeros(len(X_test))
conformity_scores = np.zeros(len(X_train)+1)
for i in range(len(X_test)):
    augmented_set = np.append(X_train, [X_test[i]],axis = 0)
    for j in range(len(labels)):
        augmented_targets = np.append(y_train, labels[j])
        p_values[i][j] = calc_pvalue_3(augmented_set,augmented_targets)
for i in range(len(p_values)):
    pred_targets[i] = labels[np.argmax(p_values[i])]
sum = 0    
for i in range(len(p_values)):
    k = int(pred_targets[i])
    for j in range(len(labels)):
        if p_values[i,k]!= p_values[i,j]:
            sum =sum+ p_values[i,j]
avg_false_p = sum/(len(p_values)*(len(labels)-1))
print('The average False_Pvalue is: ',avg_false_p)

The average False_Pvalue is:  0.03935724266418259


In [43]:
print('The Test error rate is :',np.mean(pred_targets != y_test))
print('Number of Misclassification :',np.sum(pred_targets!=y_test))
print('Success rate is :',np.mean(pred_targets == y_test))

The Test error rate is : 0.07894736842105263
Number of Misclassification : 3
Success rate is : 0.9210526315789473


***observation***

After Using Conformal Predictors on Iris dataset using different conformity measures,the Test error rate and number of misclassifications remained same.
Also, The results obtained by conformal predictors were similar to the results obtained via Nearest Neighbour algorithm

***KNN implementation for General K***

In [15]:
from collections import Counter
from random import choice

Choosing the value of K = 3 for example run on Ionosphere data set

In [16]:
X_train, X_test,y_train,y_test = train_test_split(iris_data.data,iris_data.target,random_state = 2810)

In [18]:
k =3
y_pred = np.zeros(len(X_test))
a = np.zeros(len(X_train))
for i in range(len(X_test)):
    for j in range(len(X_train)):
        a[j] = E_len(X_train[j], X_test[i])
    pos = np.argpartition(a,k)    
    values = [ y_train[pos[h]] for h in range(k)]
    temp = Counter(values)    
    if temp.most_common(1)[0][1] == 1:
        y_pred[i]= choice(np.unique(y_train))
    else:
        y_pred[i]=temp.most_common(1)[0][0]

In [19]:
# Test error and no_of_error values
test_error_rate = np.mean(y_pred != y_test)
no_of_errors = np.sum(y_pred!=y_test)
print('The Test Error rate is: ',test_error_rate)
print('Number of Errors: ',no_of_errors)

The Test Error rate is:  0.07894736842105263
Number of Errors:  3


Observation: K-Neighbours algorithm for k = 3 gave better results than Nearest Neighbour or Conformal Prediction algorithm

**SUMMARY**

 
  Nearest Neighbor- Iris Dataset-  The Test Error rate is:  0.07894736842105263
  
  Nearest Neighbor- Ionosphere Dataset - The Test Error rate is:  0.10227272727272728
  
  Conformal Predictors - Iris Dataset- The average False_Pvalue is:  0.009548206800186318
  
  COnformal Predictors - Inonosphere Dataset- The average False_Pvalue is:  0.09999139118457302

