In [None]:
import numpy as np
import torch
import torchvision
from torchvision import datasets
from torchvision import transforms
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# import pyprobml_utils as pml

# Per-sample preprocessing pipeline handed to the dataset: a single step that
# converts each loaded image into a tensor.
data_transform = transforms.Compose([transforms.ToTensor()])

def get_data(data, indices=None, binarize=True):
  """Collect images and labels from an indexable dataset into NumPy arrays.

  Args:
    data: indexable dataset where ``data[i]`` yields a pair whose first item
      is an image exposing ``.numpy()`` with a leading singleton axis
      (e.g. shape (1, H, W)) and whose second item is the label.
    indices: iterable of sample indices to extract; defaults to every sample.
      May be a one-shot iterator, since it is traversed only once.
    binarize: when True, threshold the images at 0.5 and return booleans.

  Returns:
    Tuple ``(X, y)``: the stacked images with the singleton axis removed
    (e.g. (N, H, W)) and the array of the N matching labels.

  Raises:
    ValueError: if no indices are selected (nothing to stack) or the images
      do not have a leading axis of size 1.
  """
  if indices is None:
    indices = range(len(data))

  # Fetch each sample exactly once: indexing a dataset may decode and
  # transform the image, and a single pass keeps images and labels aligned
  # even when `indices` is a generator.
  images = []
  labels = []
  for i in indices:
    sample = data[i]
    images.append(sample[0].numpy())
    labels.append(sample[1])

  # Stacking on axis 1 gives (1, N, H, W); dropping axis 0 leaves (N, H, W).
  X = np.stack(images, axis=1).squeeze(0)
  if binarize:
    X = X > 0.5
  y = np.array(labels)
  return X, y

# Load the EMNIST "balanced" split from ~/data, downloading it first if it is
# not already there. Every image goes through `data_transform` on access.
# No `train` argument is passed, so the torchvision default subset is used.
data = datasets.EMNIST(
    root="~/data",
    split="balanced",
    download=True,
    transform=data_transform
)

# Materialise the whole dataset as NumPy arrays; `binarize` is left at its
# default (True), so X holds booleans obtained by thresholding at 0.5.
X, y = get_data(data)

import sklearn.metrics



Downloading https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip to /root/data/EMNIST/raw/gzip.zip


  0%|          | 0/561753746 [00:00<?, ?it/s]

Extracting /root/data/EMNIST/raw/gzip.zip to /root/data/EMNIST/raw


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [None]:
# Hold out 90% of the samples for testing (test_size=0.90), which leaves only
# 10% for training. random_state=0 pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.90, random_state=0)

In [None]:
from sklearn.base import BaseEstimator
import numpy as np
import matplotlib.pyplot as plt
import math
from custom_scorer_module import custom_scoring_function 

In [None]:
# Bernoulli naive Bayes estimator fitted by smoothed maximum likelihood.
class MLEBayes(BaseEstimator):
  """Bernoulli naive Bayes classifier for binary (on/off) features.

  Every feature (pixel) is modelled as an independent Bernoulli variable
  given the class. ``fit`` estimates the class priors P(y) and the per-class
  probabilities P(x_j = 1 | y) using add-one (Laplace) smoothing, so no
  probability is ever exactly 0 or 1 and every logarithm is finite.

  Attributes set by ``fit``:
    classes_:  sorted list of the distinct training labels.
    y_prob:    dict mapping label -> prior probability P(y).
    x_ij:      dict mapping label -> 1-D array of P(x_j = 1 | y).
    labelDict: dict mapping label -> list of training samples of that class.

  Prediction uses log tables cached by ``fit``; editing ``x_ij`` or
  ``y_prob`` afterwards does not affect predictions until ``fit`` is rerun.
  """

  def fit(self, X, y):
    """Estimate class priors and per-class feature probabilities.

    Args:
      X: array-like of shape (n_samples, ...). Each sample is flattened, and
        a feature counts as "on" only where its value equals 1/True.
      y: array-like holding one class label per sample.

    Returns:
      self, so that calls can be chained.

    Raises:
      ValueError: if X and y differ in length or contain no samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n = len(y)
    if len(X) != n:
      raise ValueError(
          "X and y must have the same length, got %d and %d" % (len(X), n))
    if n == 0:
      raise ValueError("cannot fit on an empty training set")

    # One row per sample, one column per feature. Comparing with 1 treats
    # any other value (e.g. a non-binarized 0.7) as "off".
    on = X.reshape(n, -1) == 1

    labels, counts = np.unique(y, return_counts=True)
    self.classes_ = labels.tolist()
    # Position of each label inside every score vector.
    self._class_index = {
        label: position for position, label in enumerate(self.classes_)}

    self.y_prob = dict()     # P(y)
    self.labelDict = dict()  # training samples grouped by class
    self.x_ij = dict()       # P(x_j = 1 | y)
    for label, count in zip(self.classes_, counts.tolist()):
      members = y == label
      self.y_prob[label] = count / n
      self.labelDict[label] = list(X[members])
      # Add-one smoothing for a two-outcome variable: one pseudo-count for
      # "on" and one for "off", hence +1 above and +2 below. This keeps the
      # estimate strictly inside (0, 1).
      self.x_ij[label] = (on[members].sum(axis=0) + 1) / (count + 2)

    # Cache log tables of shape (n_classes, n_features) so prediction needs
    # no further logarithms.
    probabilities = np.stack([self.x_ij[label] for label in self.classes_])
    self._log_on = np.log(probabilities)
    self._log_off = np.log1p(-probabilities)
    self._log_prior = np.log([self.y_prob[label] for label in self.classes_])

    print(len(self.x_ij))

    print("Training complete")
    return self

  def max_index(self, number_list):
    """Return the position of the first maximum value in ``number_list``.

    Raises:
      ValueError: if ``number_list`` is empty.
    """
    values = list(number_list)
    return values.index(max(values))

  def bayes_pred(self, xv):
    """Return the most probable class label for a single sample.

    Args:
      xv: one sample, flattened or not, with the same number of features as
        the training samples.

    Returns:
      The label from ``classes_`` with the highest log joint score; ties go
      to the label that comes first in ``classes_``.
    """
    scores = self.bayes_pred_score(xv)
    return self.classes_[self.max_index(scores)]

  def bayes_pred_score(self, xv):
    """Return the unnormalized log posterior of every class for one sample.

    Args:
      xv: one sample, flattened or not, with the same number of features as
        the training samples.

    Returns:
      List with one float per class, ordered like ``classes_``. Entry k is
      log P(y_k) + sum_j log P(x_j | y_k). For labels 0..K-1 the list can be
      indexed directly by label.
    """
    on = np.reshape(xv, -1) == 1
    # Pick log p where the feature is on and log(1 - p) where it is off,
    # then sum over the features of each class row.
    log_likelihood = np.where(on, self._log_on, self._log_off).sum(axis=1)
    return (log_likelihood + self._log_prior).tolist()

  def score(self, X_test, y_test):
    """Return the mean log joint score assigned to the true labels.

    This is not accuracy: for every sample it takes the entry of
    ``bayes_pred_score`` that belongs to the true label, then averages over
    the samples. Values are at most 0, and higher is better.

    Args:
      X_test: iterable of samples.
      y_test: array-like with the true label of each sample.

    Returns:
      The average log joint score as a float.

    Raises:
      ValueError: if the inputs are empty, differ in length, or contain a
        label that was not present in the training data.
    """
    true_labels = np.asarray(y_test).tolist()
    if len(X_test) != len(true_labels):
      raise ValueError(
          "X_test and y_test must have the same length, got %d and %d"
          % (len(X_test), len(true_labels)))
    if not true_labels:
      raise ValueError("cannot score an empty test set")

    total = 0.0
    for sample, label in zip(X_test, true_labels):
      if label not in self._class_index:
        raise ValueError("label %r was not seen during fit" % (label,))
      total += self.bayes_pred_score(sample)[self._class_index[label]]
    return total / len(true_labels)

  def predict(self, X_test):
    """Return the list of predicted labels, one per sample in ``X_test``."""
    return [self.bayes_pred(sample) for sample in X_test]

  


In [None]:
# Build the classifier and fit it on the training split.
mle = MLEBayes()
mle.fit(X_train, y_train)