In [1]:
from builtins import range, input

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

from collections import defaultdict

In [3]:

def get_data(limit=None):
    print("Reading in and transforming data...")
    df = pd.read_csv('../data/digit-recognizer/train.csv')
    data = df.values
    np.random.shuffle(data)
    X = data[:, 1:] / 255.0 # data is from 0..255
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

In [6]:
X, Y = get_data(10000)

Reading in and transforming data...


In [7]:
X.shape

(10000, 784)

In [8]:
Y.shape

(10000,)

In [10]:
X, Y = get_data(10000)
Ntrain = len(Y) // 2
Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
Xtest, Ytest = X[Ntrain:], Y[Ntrain:]


Reading in and transforming data...


In [11]:

print(Ntrain)

5000


In [105]:
class Bayes(object):
    def fit(self, X, Y, smoothing=1e-2):
        N, D = X.shape
        labels = set(Y)
        self.gaussians = np.array([defaultdict(float) for index in range(len(labels))])
        self.priors = dict()
        for c in labels:
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': np.cov(current_x.T) + np.eye(D)*smoothing ,
            }
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        N, D = X.shape
        K = len(self.gaussians)
        P = np.zeros((N, K))
        for gaus in gaussians:
            mean = gaus['mean']
            cov = gaus['cov']
            P[:,c] = mvn.logpdf(X, mean=mean, cov=cov, allow_singular=True) + np.log(self.priors[c])
        return np.argmax(P, axis=1)

In [106]:

model = Bayes()
t0 = datetime.now()
model.fit(Xtrain, Ytrain)
print("Training time:", (datetime.now() - t0))

t0 = datetime.now()
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy:", model.score(Xtest, Ytest))
print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

# plot the mean of each class
for gaus in gaussians:
    mean = gaus['mean']
    cov = gaus['cov']
    plt.imshow(mean.reshape(28, 28))
    plt.title(c)
    plt.show()

Training time: 0:00:00.134679


ValueError: shapes (5000,784) and (1,1) not aligned: 784 (dim 1) != 1 (dim 0)

In [60]:
gaussians = np.array([defaultdict(str) for index in range(4)])
#gaussians = defaultdict(str)

In [61]:
for c in range(4):
    gaussians[c] = {
        'mean': c,
        'cov': c*2,
    }

In [62]:
list(gaussians)

[{'mean': 0, 'cov': 0},
 {'mean': 1, 'cov': 2},
 {'mean': 2, 'cov': 4},
 {'mean': 3, 'cov': 6}]

In [79]:
for gaus in gaussians:
    print(gaus['mean'])
    print(gaus['cov'])
    

0
0
1
2
2
4
3
6
