In [1]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_mldata
%matplotlib inline
from sklearn.utils import shuffle
from tga import TGA

# Use all digits; per-experiment subsets are selected later with filt_num.

# NOTE(review): SPLIT and END are not referenced by any visible cell.
SPLIT = 16000
END = 24000

# Fixed seed so that the shuffle below and the np.random-based noise helpers
# produce the same data on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed
# in 0.22; this cell needs an older scikit-learn to run.
mnist = fetch_mldata("MNIST original")
# Divide the raw pixel values by 255 (presumably 8-bit intensities -> [0, 1]).
x_dat, y_dat = mnist.data / 255., mnist.target
x_dat, y_dat = shuffle(x_dat, y_dat, random_state=SEED)


In [16]:
def showDigitImage(array):
    """Render a flattened 784-element sample as a 28x28 image with matplotlib."""
    plt.imshow(array.reshape(28, 28))
    plt.show()
    
def filt_num(xs,ys,ns):
    """Select the samples whose label is one of ``ns``.

    Parameters
    ----------
    xs : array-like, one row per sample.
    ys : array-like of labels, same length as ``xs``.
    ns : iterable of label values to keep.

    Returns
    -------
    (xs_kept, ys_kept) : tuple of numpy arrays holding the matching rows and
        labels in their original order. Both are new arrays; the inputs are
        not modified. An empty ``ns`` yields empty results.
    """
    xs = np.asarray(xs)
    ys = np.asarray(ys)
    # list(ns) so that sets/generators are treated as a collection of labels
    # rather than as a single object by np.isin.
    keep = np.isin(ys, list(ns))
    # Boolean-mask indexing already copies, so no explicit .copy() is needed.
    return xs[keep], ys[keep]
        
def add_noise_to_array(pure, sigma=0.3):
    """Return ``pure`` plus zero-mean Gaussian noise.

    Parameters
    ----------
    pure : array-like; it is not modified.
    sigma : standard deviation of the noise (default 0.3, the value that was
        previously hard-coded).

    Returns
    -------
    New numpy array with the same shape as ``pure``. Noise is drawn from the
    global ``np.random`` state, so seed it for reproducible results.
    """
    pure = np.asarray(pure)
    noise = np.random.normal(0, sigma, pure.shape)
    # The addition allocates a fresh array, so the input is never touched.
    return pure + noise

def add_noise_to_train_set(x_train):
    """Apply add_noise_to_array to every sample of ``x_train``.

    Returns a Python list (not an ndarray) with one noisy array per sample,
    in the same order as the input.
    """
    return [add_noise_to_array(sample) for sample in x_train]

def rotate180(features):
    """Return a reversed copy of ``features`` (reversal along the first axis).

    For a flattened row-major image, reversing the pixel order is the same as
    rotating the image by 180 degrees. The input is left untouched.
    """
    duplicate = features.copy()
    return duplicate[::-1]
  
def vertical_flip(features):
    """Mirror a flattened 28x28 image left-to-right.

    The sample is viewed as a 28x28 grid, the column order of every row is
    reversed (a flip along axis 1), and the result is flattened again.
    The input is left untouched.
    """
    grid = features.copy().reshape(28, 28)
    mirrored = grid[:, ::-1]
    return mirrored.ravel()
        
def add_true_noise_to_train_set(x_train,p):
    """Corrupt a random subset of samples with a geometric transform.

    Each sample is independently selected with probability ``p``. A selected
    sample is replaced by vertical_flip(sample) or rotate180(sample), chosen
    with equal probability; unselected samples are copied unchanged.

    Returns a Python list with one array per input sample, in input order.
    Randomness comes from the global ``np.random`` state.
    """
    # One draw per sample decides up front which samples get corrupted.
    corrupt_mask = np.random.rand(len(x_train)) < p
    augmented = []
    for idx, corrupt in enumerate(corrupt_mask):
        sample = x_train[idx]
        if not corrupt:
            augmented.append(sample.copy())
        elif np.random.rand() < 0.5:
            augmented.append(vertical_flip(sample.copy()))
        else:
            augmented.append(rotate180(sample.copy()))
    return augmented

# Keep only the samples labelled 1 or 5; the cells below use new_xs / new_ys
# as the working subset.
new_xs,new_ys = filt_num(x_dat,y_dat,[1,5])


In [5]:
# Fit a 5-component TGA model (project-local `tga` module) on the selected subset.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so
# the saved run may not have completed the fit — confirm before trusting
# downstream numbers.
tga = TGA(n_components=5)
tga.fit(new_xs)


0,a
0.0974030092132
0.0212057208929
0.00633617798664
0.0110274658452
0.0114991661628
0.00394380238027
0.0512755219473
0.0506544403746
0.00184739907649
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.00185261240701
0.00391780742202
0.0727848817367
0.0714353928321
0.001852

KeyboardInterrupt: 

In [10]:
# Project the subset with the fitted TGA model; tga_dat is the feature matrix
# used by the classification cells below.
tga_dat = tga.transform(new_xs)

# Map the projection back to the original feature space.
x_projected = tga.inverse_transform(tga_dat)

# Mean squared difference between the data and its reconstruction.
loss = ((new_xs - x_projected) ** 2).mean()
print(loss)

0.0297627101048


# {1,5} vs {1,5}

In [11]:
# Baseline: classify using the original (raw pixel) features in new_xs.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# sklearn.model_selection offers the same cross_val_predict.
# Score both classifiers on 10-fold cross-validated predictions. The loop
# variable is called `clf` so that `clf` and `predicted` still end up holding
# the SVM and its predictions once the cell has run.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, new_xs, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))





Logistic Regression 0.996124031008
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00      7877
        5.0       1.00      1.00      1.00      6313

avg / total       1.00      1.00      1.00     14190

SVM 0.994996476392
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00      7877
        5.0       0.99      0.99      0.99      6313

avg / total       0.99      0.99      0.99     14190



In [13]:
# TGA features: classify on the TGA projection held in tga_dat
# (this cell uses the TGA output, not a PCA decomposition).

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions, labelling
# each accuracy line the same way as the raw-feature cells so the outputs can
# be compared directly. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, tga_dat, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))


0.980056377731
             precision    recall  f1-score   support

        1.0       0.98      0.98      0.98      7877
        5.0       0.98      0.97      0.98      6313

avg / total       0.98      0.98      0.98     14190

SVM 0.980479210712
             precision    recall  f1-score   support

        1.0       0.98      0.98      0.98      7877
        5.0       0.98      0.98      0.98      6313

avg / total       0.98      0.98      0.98     14190



In [19]:
# Add Gaussian noise to every sample; add_noise_to_train_set returns a list
# of arrays.
white_xs = add_noise_to_train_set(new_xs)
# Fit a fresh 5-component TGA on the noisy data. This rebinds `tga` and
# `tga_dat`, which the following cells read.
tga = TGA(n_components=5)
tga.fit(white_xs)

tga_dat = tga.transform(white_xs)

x_projected = tga.inverse_transform(tga_dat)

# NOTE(review): the reconstruction loss is disabled here although the sibling
# cells print it — confirm whether that is intentional.
# loss = ((new_xs - x_projected) ** 2).mean()
# print(loss)


0,1,2,3,4,

In [20]:
# Original (raw pixel) features with Gaussian noise added: white_xs.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions. The loop
# variable is called `clf` so that `clf` and `predicted` still end up holding
# the SVM and its predictions once the cell has run.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, white_xs, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))



Logistic Regression 0.990133897111
             precision    recall  f1-score   support

        1.0       0.99      0.99      0.99      7877
        5.0       0.99      0.99      0.99      6313

avg / total       0.99      0.99      0.99     14190

SVM 0.986539816772
             precision    recall  f1-score   support

        1.0       0.99      0.99      0.99      7877
        5.0       0.99      0.98      0.98      6313

avg / total       0.99      0.99      0.99     14190



In [21]:
# TGA features: classify on the TGA projection held in tga_dat
# (this cell uses the TGA output, not a PCA decomposition).

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions, labelling
# each accuracy line the same way as the raw-feature cells so the outputs can
# be compared directly. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, tga_dat, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))


0.975546159267
             precision    recall  f1-score   support

        1.0       0.98      0.98      0.98      7877
        5.0       0.98      0.97      0.97      6313

avg / total       0.98      0.98      0.98     14190

SVM 0.976885130374
             precision    recall  f1-score   support

        1.0       0.98      0.98      0.98      7877
        5.0       0.98      0.97      0.97      6313

avg / total       0.98      0.98      0.98     14190



# {1,5} vs {1,5}: Flip


In [17]:
# Corrupt each sample with probability 0.2 by mirroring it or rotating it
# 180 degrees (see add_true_noise_to_train_set).
true_noise_xs = add_true_noise_to_train_set(new_xs,0.2)

# Fit a 20-component TGA on the corrupted set and project that same set.
tga = TGA(n_components=20)
tga.fit(true_noise_xs)
tga_dat = tga.transform(true_noise_xs)

x_projected = tga.inverse_transform(tga_dat)

# The error is measured against the uncorrupted new_xs, not true_noise_xs.
loss = ((new_xs - x_projected) ** 2).mean()
print(loss)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,0.0269535433814


In [18]:
# Original (raw pixel) features of the flip/rotation-corrupted set: true_noise_xs.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions. Every
# accuracy line carries its classifier name, matching the other raw-feature
# cells. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, true_noise_xs, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))

0.984989429175
             precision    recall  f1-score   support

        1.0       0.98      0.99      0.99      7877
        5.0       0.99      0.98      0.98      6313

avg / total       0.98      0.98      0.98     14190

SVM 0.98548273432
             precision    recall  f1-score   support

        1.0       0.98      0.99      0.99      7877
        5.0       0.99      0.98      0.98      6313

avg / total       0.99      0.99      0.99     14190



In [19]:
# TGA features: classify on the TGA projection held in tga_dat
# (this cell uses the TGA output, not a PCA decomposition).

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions, labelling
# each accuracy line the same way as the raw-feature cells so the outputs can
# be compared directly. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, tga_dat, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))

0.968921775899
             precision    recall  f1-score   support

        1.0       0.97      0.98      0.97      7877
        5.0       0.97      0.96      0.96      6313

avg / total       0.97      0.97      0.97     14190

SVM 0.970401691332
             precision    recall  f1-score   support

        1.0       0.97      0.98      0.97      7877
        5.0       0.97      0.96      0.97      6313

avg / total       0.97      0.97      0.97     14190



In [26]:
# Switch the working subset to digits 6 and 9. This rebinds new_xs / new_ys,
# so every later cell now operates on the {6, 9} data.
new_xs,new_ys = filt_num(x_dat,y_dat,[6,9])

# Fit a 20-component TGA on the clean subset and project it.
tga = TGA(n_components=20)

tga.fit(new_xs)
tga_dat = tga.transform(new_xs)

x_projected = tga.inverse_transform(tga_dat)

# Mean squared difference between the data and its reconstruction; printed
# like in the other reconstruction cells so the value is actually reported.
loss = ((new_xs - x_projected) ** 2).mean()
print(loss)


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,

In [27]:
# Baseline: classify using the original (raw pixel) features in new_xs.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions. The loop
# variable is called `clf` so that `clf` and `predicted` still end up holding
# the SVM and its predictions once the cell has run.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, new_xs, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))



Logistic Regression 0.998192858176
             precision    recall  f1-score   support

        6.0       1.00      1.00      1.00      6876
        9.0       1.00      1.00      1.00      6958

avg / total       1.00      1.00      1.00     13834

SVM 0.998265143848
             precision    recall  f1-score   support

        6.0       1.00      1.00      1.00      6876
        9.0       1.00      1.00      1.00      6958

avg / total       1.00      1.00      1.00     13834



In [28]:
# TGA features: classify on the TGA projection held in tga_dat
# (this cell uses the TGA output, not a PCA decomposition).

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions, labelling
# each accuracy line the same way as the raw-feature cells so the outputs can
# be compared directly. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, tga_dat, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))


0.994434003181
             precision    recall  f1-score   support

        6.0       0.99      0.99      0.99      6876
        9.0       0.99      0.99      0.99      6958

avg / total       0.99      0.99      0.99     13834

SVM 0.994361717508
             precision    recall  f1-score   support

        6.0       0.99      0.99      0.99      6876
        9.0       0.99      0.99      0.99      6958

avg / total       0.99      0.99      0.99     13834



In [31]:
# Corrupt each sample with probability 0.2 by mirroring it or rotating it
# 180 degrees (see add_true_noise_to_train_set).
true_noise_xs = add_true_noise_to_train_set(new_xs,0.2)

# Fit a 20-component TGA on the corrupted set and project that same set.
tga = TGA(n_components=20)
tga.fit(true_noise_xs)
tga_dat = tga.transform(true_noise_xs)

x_projected = tga.inverse_transform(tga_dat)

# The error is measured against the uncorrupted new_xs, not true_noise_xs.
loss = ((new_xs - x_projected) ** 2).mean()
print(loss)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,0.0335671362677


In [32]:
# Original (raw pixel) features of the flip/rotation-corrupted set: true_noise_xs.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions. Every
# accuracy line carries its classifier name, matching the other raw-feature
# cells. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, true_noise_xs, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))

0.951640884777
             precision    recall  f1-score   support

        6.0       0.95      0.95      0.95      6876
        9.0       0.95      0.95      0.95      6958

avg / total       0.95      0.95      0.95     13834

SVM 0.953520312274
             precision    recall  f1-score   support

        6.0       0.95      0.95      0.95      6876
        9.0       0.95      0.95      0.95      6958

avg / total       0.95      0.95      0.95     13834



In [33]:
# TGA features: classify on the TGA projection held in tga_dat
# (this cell uses the TGA output, not a PCA decomposition).

from sklearn.linear_model import LogisticRegression
from sklearn import metrics, cross_validation, svm
from sklearn import datasets
# Score both classifiers on 10-fold cross-validated predictions, labelling
# each accuracy line the same way as the raw-feature cells so the outputs can
# be compared directly. `clf` / `predicted` end up holding the SVM results.
for label, clf in (
    ("Logistic Regression", LogisticRegression()),
    ("SVM", svm.SVC(kernel='linear', C=1)),
):
    predicted = cross_validation.cross_val_predict(clf, tga_dat, new_ys, cv=10)
    print(label, metrics.accuracy_score(new_ys, predicted))
    print(metrics.classification_report(new_ys, predicted))

0.903354055226
             precision    recall  f1-score   support

        6.0       0.91      0.89      0.90      6876
        9.0       0.90      0.91      0.90      6958

avg / total       0.90      0.90      0.90     13834

SVM 0.909064623392
             precision    recall  f1-score   support

        6.0       0.92      0.89      0.91      6876
        9.0       0.90      0.93      0.91      6958

avg / total       0.91      0.91      0.91     13834

