In [141]:
import idx2numpy
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
import random
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support

In [29]:
from imblearn.under_sampling import CondensedNearestNeighbour

In [2]:
x_train = idx2numpy.convert_from_file('train-images.idx3-ubyte')
y_train = idx2numpy.convert_from_file('train-labels.idx1-ubyte')

In [3]:
x_test = idx2numpy.convert_from_file('t10k-images.idx3-ubyte')
y_test = idx2numpy.convert_from_file('t10k-labels.idx1-ubyte')

In [4]:
x_train.shape

(60000, 28, 28)

In [5]:
y_train.shape

(60000,)

In [6]:
x_test.shape

(10000, 28, 28)

In [7]:
y_test.shape

(10000,)

In [8]:
x_train=[i.flatten() for i in list(x_train)]
x_test=[i.flatten() for i in list(x_test)]

### Full dataset training

In [137]:
model = KNeighborsClassifier(n_neighbors=1)
model.fit(x_train, y_train)

In [138]:
model.score(x_test,y_test)

0.9691

In [139]:
y_pred = model.predict(x_test)

In [143]:
precision_recall_fscore_support(y_test,y_pred,average='weighted')

(0.9692024021594029, 0.9691, 0.9690691238431085, None)

In [131]:
Counter(y_train)

Counter({1: 6742,
         7: 6265,
         3: 6131,
         2: 5958,
         9: 5949,
         0: 5923,
         6: 5918,
         8: 5851,
         4: 5842,
         5: 5421})

### Random Samples as prototypes

In [151]:
ls = list(zip(x_train,y_train))
def knn_random_sample(n_samples):
    xy_rand = random.sample(ls,n_samples)
    xr = [i[0] for i in xy_rand]
    yr = [i[1] for i in xy_rand]
    #print(Counter(yr))
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(xr, yr)
    acc = model.score(x_test,y_test)
    #y_pred = model.predict(x_test)
    #p = precision_recall_fscore_support(y_test,y_pred,average='weighted')
    return acc#,p

In [304]:
a = 0
for i in range(15):
    a += knn_random_sample(10000)
print(a/15)

0.9476533333333331


In [116]:
for i in range(15):
    a = knn_random_sample(5000)
    print(a)

0.9363
0.9327
0.9368
0.9327
0.9331
0.9335
0.9361
0.9345
0.9353
0.9362
0.9325
0.9348
0.9359
0.9371
0.9352


In [117]:
for i in range(15):
    a = knn_random_sample(1000)
    print(a)

0.8929
0.8894
0.8856
0.8805
0.8862
0.8816
0.8835
0.8888
0.8813
0.8868
0.8943
0.8794
0.8886
0.8901
0.8842


In [16]:
Counter(y_train)

Counter({1: 6742,
         7: 6265,
         3: 6131,
         2: 5958,
         9: 5949,
         0: 5923,
         6: 5918,
         8: 5851,
         4: 5842,
         5: 5421})

In [126]:
def knn_random_sample2(n_samples):
    pro = []
    pro_label = []
    k = int(n_samples/10)

    for i in range(10):
        print("Getting "+ str(k) + " centers for label "+str(i))
        xy = [j for j in ls if j[1]==i]
        xl = [j[0] for j in xy]
        #print(xl)
        #yl = [i[1] for i in xy]
#         kmeans = KMeans(n_clusters=k,random_state=0, n_init="auto").fit(xl)
#         pro.extend(kmeans.cluster_centers_)
        pro.extend(random.sample(xl,k))
        pro_label.extend([i]*k)
    return pro, pro_label

In [165]:
pro, pro_label = knn_random_sample2(5000)
pro_zip = list(zip(pro,pro_label))
random.shuffle(pro_zip)
print("Training dataset size is "+str(len(pro_zip)))
xr = [i[0] for i in pro_zip]
yr = [i[1] for i in pro_zip]
model = KNeighborsClassifier(n_neighbors=1)
model.fit(xr, yr)
acc = model.score(x_test,y_test)
acc

Getting 500 centers for label 0
Getting 500 centers for label 1
Getting 500 centers for label 2
Getting 500 centers for label 3
Getting 500 centers for label 4
Getting 500 centers for label 5
Getting 500 centers for label 6
Getting 500 centers for label 7
Getting 500 centers for label 8
Getting 500 centers for label 9
Training dataset size is 5000


0.938

### Approach 1

In [167]:
def get_prototypes1(k):
    pro = []
    pro_label = []

    for i in range(10):
        print("Getting "+ str(k) + " centers for label "+str(i))
        xy = [j for j in ls if j[1]==i]
        xl = [j[0] for j in xy]
        #yl = [i[1] for i in xy]
        kmeans = KMeans(n_clusters=k,random_state=0, n_init="auto").fit(xl)
        pro.extend(kmeans.cluster_centers_)
        pro_label.extend([i]*k)
    return pro, pro_label

In [169]:
M = 10000
pro, pro_label = get_prototypes1(int(M/10))
pro_zip = list(zip(pro,pro_label))
random.shuffle(pro_zip)
print("Training dataset size is "+str(len(pro_zip)))
xr = [i[0] for i in pro_zip]
yr = [i[1] for i in pro_zip]
model = KNeighborsClassifier(n_neighbors=1)
model.fit(xr, yr)
acc = model.score(x_test,y_test)
acc

Getting 1000 centers for label 0
Getting 1000 centers for label 1
Getting 1000 centers for label 2
Getting 1000 centers for label 3
Getting 1000 centers for label 4
Getting 1000 centers for label 5
Getting 1000 centers for label 6
Getting 1000 centers for label 7
Getting 1000 centers for label 8
Getting 1000 centers for label 9
Training dataset size is 10000


0.9685

In [170]:
M = 5000
pro, pro_label = get_prototypes1(int(M/10))
pro_zip = list(zip(pro,pro_label))
random.shuffle(pro_zip)
print("Training dataset size is "+str(len(pro_zip)))
xr = [i[0] for i in pro_zip]
yr = [i[1] for i in pro_zip]
model = KNeighborsClassifier(n_neighbors=1)
model.fit(xr, yr)
acc = model.score(x_test,y_test)
acc

Getting 500 centers for label 0
Getting 500 centers for label 1
Getting 500 centers for label 2
Getting 500 centers for label 3
Getting 500 centers for label 4
Getting 500 centers for label 5
Getting 500 centers for label 6
Getting 500 centers for label 7
Getting 500 centers for label 8
Getting 500 centers for label 9
Training dataset size is 5000


0.9669

In [171]:
M = 1000
pro, pro_label = get_prototypes1(int(M/10))
pro_zip = list(zip(pro,pro_label))
random.shuffle(pro_zip)
print("Training dataset size is "+str(len(pro_zip)))
xr = [i[0] for i in pro_zip]
yr = [i[1] for i in pro_zip]
model = KNeighborsClassifier(n_neighbors=1)
model.fit(xr, yr)
acc = model.score(x_test,y_test)
acc

Getting 100 centers for label 0
Getting 100 centers for label 1
Getting 100 centers for label 2
Getting 100 centers for label 3
Getting 100 centers for label 4
Getting 100 centers for label 5
Getting 100 centers for label 6
Getting 100 centers for label 7
Getting 100 centers for label 8
Getting 100 centers for label 9
Training dataset size is 1000


0.959

### Approach 2 - LVQ

In [272]:
prototypes = []
for i in range(10):
    xy = [j for j in ls if j[1]==i]
    prototypes.extend(random.sample(xy,1000))
len(prototypes)

10000

In [273]:
#prototypes = pro_zip
xp = [i[0] for i in prototypes]
yp = [i[1] for i in prototypes]
lr = 0.15
lri = lr
max_iter = 2000

In [274]:
for i in range(max_iter):
    rs = random.randint(0, len(ls)-1)
    x = ls[rs][0]
    label = ls[rs][1]
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(xp, yp)
    nn = model.kneighbors([x],return_distance = False)[0][0]
    nn_label = yp[nn]
#     print(nn_label)
#     print(label)
    if(label != nn_label):
        ## Move protype away
#         print("moving away")
        xp[nn] = xp[nn] - lr*(x-xp[nn])
    else:
        ## Move prototype closer
#         print("moving closer")
        xp[nn] = xp[nn] + lr*(x-xp[nn])
        
#     print(x_train[i])
#     print(xp[nn])
    lr = lr - lri/max_iter

In [303]:
model = KNeighborsClassifier(n_neighbors=1)
model.fit(xp, yp)
acc = model.score(x_test,y_test)
acc

0.9578


### Approach 3 CNN

In [276]:
def cnn_protoype(M):
    random.shuffle(ls)
    pr = random.sample(ls,1000)#[ls[0]]
    for i in range(len(ls)):
        #print(i)
        if(len(pr)>=M): break
        #pred = predict(ls[i][0])
        
        xp = [i[0] for i in pr]
        yp = [i[1] for i in pr]
        model = KNeighborsClassifier(n_neighbors=1)
        model.fit(xp, yp)
        nn = model.kneighbors([ls[i][0]],return_distance = False)[0][0]
        nn_label = yp[nn]
        
        if(nn_label != ls[i][1]):
            pr.append(ls[i])
    return pr

In [302]:
ls_cnnp = cnn_protoype(10000)
len(ls_cnnp)

5003


In [279]:
def get_prototypes1(k,yp):
    pro = []
    pro_label = []

    for i in range(10):
        c = k-Counter(yp)[i]
        if(c<=0): continue
        print("Getting "+ str(c) + " centers for label "+str(i))
        xy = [j for j in ls if j[1]==i]
        xl = [j[0] for j in xy]
        #yl = [i[1] for i in xy]
        kmeans = KMeans(n_clusters=c,random_state=0, n_init="auto").fit(xl)
        pro.extend(kmeans.cluster_centers_)
        pro_label.extend([i]*c)
    return pro, pro_label

In [281]:
pro, pro_label = get_prototypes1(1000,yp)
xp.extend(pro)
yp.extend(pro_label)
model = KNeighborsClassifier(n_neighbors=1)
model.fit(xp, yp)
acc = model.score(x_test,y_test)
acc

Getting 734 centers for label 0
Getting 798 centers for label 1
Getting 431 centers for label 2
Getting 306 centers for label 3
Getting 416 centers for label 4
Getting 387 centers for label 5
Getting 665 centers for label 6
Getting 516 centers for label 7
Getting 152 centers for label 8
Getting 286 centers for label 9


0.9676