In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
# Predefined ‘magic function’ that display image inline
import matplotlib.image as mimg
# package to help us read in images
import numpy as np
from numpy.random import default_rng
rng=default_rng()  # this is a recommended way to use random number generation now
import os
import random
import math

In [2]:
classes_dict={0:'T-shirt/top',
              1:'Trouser',
              2:'Pullover',
              3:'Dress',
              4:'Coat',
              5:'Sandal',
              6:'Shirt',
              7:'Sneaker',
              8:'Bag',
              9:'Ankle boot'}

In [3]:
def makeDataBase(basename,partition='train',N=None, shuffle=True,normalize=True):
    '''
    This function will create one database that will contain images with their lables
    The data are supposed to be in the paths consisting of
     <basename> / <partition> / <category> / <category>-<index>.jpg
     e.g., FashionMNIST/val/4/4-37.jpg
    Inputs:
        basename (str)  : name of the folder containing all the data. It should be "FashionMNIST"
        partition (str) : "train" or "val" or "test"
        N (int)         : number of examples for each category; when N=None, all samples will be loaded
        shuffle (bool)  : boolean value; if False, samples from category 0 will be returned first and samples from category 9 last
                          if True, samples will be randomly shuffled
        normalize (bool): boolean value; if True, samples are normalized to [-1,1]
    Outputs:
        database (list) : list of tuples (x,y). x is image data. y is numeric label of x 
        
    '''
    database=[]
    for label in range(10): # note: we hardcode number of classes here, it could be another argument
        n = len(os.listdir(os.path.join(basename,partition,str(label)))) if N is None else N
        for i in range(n):          
            imageName=os.path.join(basename,partition,str(label),str(label)+'-'+str(i)+'.jpg')
            imageData=mimg.imread(imageName)
            imageData = np.float32(imageData) if not normalize else np.float32(imageData)/255*2-1
            database.append((imageData,label))
    if shuffle:
        random.shuffle(database)
    return database

In [4]:
def list2ndarray(dataset):
    '''
    This function will take the output from makeDataBase() and return two numpy arrays: X and y
    Inputs:
        dataset (list): list of (sample, label) pairs
    Outputs:
        X (array)     : a 2D numpy array with size (N,D). N is the length of dataset, D is 28*28. 
                        Each row of X is an image sample flattened
        y (array)     : a numpy array with size (N,). y contains numeric labels of corresponding samples
    '''
    num=len(dataset)
    X = np.empty((num,28*28),dtype=np.float32)
    y = np.empty((num,),dtype=int)
    for i in range(num):
        X[i] = dataset[i][0].flatten()
        y[i] = dataset[i][1]
        
    return X, y

In [5]:
def compute_accuracy(yhat,y):
    '''
    This function takes predicted labels and ground truth labels and return accuracy
    Inputs:
        y_hat (array): (N,)-shaped numpy array containing predicted labels OR
                       (N,C) one-hot representation for C classes
        y (array)    : (N,)-shaped numpy array containing ground truth labels OR
                       (N,C) one-hot representation for C classes
    Outputs:
        accu (float) : accuracy between [0.,1.]
    '''
    # if needed, convert from one-hot to class labels
    if len(y.shape) ==2:
        y = np.argmax(y,axis=1)
    if len(yhat.shape) ==2:
        yhat=np.argmax(yhat,axis=1)
    accu = np.count_nonzero(yhat==y)/len(y)
    return accu

You may need to modify the path below to point to wherever you stored the data in Pset 1:

In [6]:
trainset = makeDataBase('../ps1/FashionMNIST',partition='train',normalize=True) # this loads the entire training set, you
                                                                                # may want to use N=200 or other N for some steps
trainX, trainy = list2ndarray(trainset)

In [7]:
valset = makeDataBase('../ps1/FashionMNIST',partition='val',normalize=True)
testset = makeDataBase('../ps1/FashionMNIST',partition='test',normalize=True)
valX, valy = list2ndarray(valset)
testX, testy = list2ndarray(testset)

A couple of functions implementing different schedules for the learning rate

In [8]:
def expSchedule(lr,e,delta):
    '''
    This function computes the updates learning rate (lr) at epoch e, using the decay factor delta;
    it returns the new lr which is the old lr multiplied by delta (e is ignored)
    '''
    return lr*delta

In [9]:
def stepSchedule(lr,t,steps):
    '''
    A scheduled adjustment of the learning rate.
    The current learning rate at the end of epoch e is updated according to the argument 'steps':
    steps specifies the schedule; it's a list, with i-th element being a tuple
    of the form (m,d) which means that once t (number of epoch) reaches m, the lr is multiplie
    by d. So, if initial lr is 0.1, and steps is [(10,.5),(20,.1),(30,.1)] then the lr
    will be 0.1 for epochs 0,..,9; 0.05 for epochs 10,..,19; 0.005 for 20,..,29; and .0005 for 
    epochs 29 and on
    '''
    
    for n in range(len(steps)):
        if t==steps[n][0]:
            return lr*steps[n][1]
    return lr    

In [10]:
def scores(W,X):
    '''
    Compute the scores of the classes on data in X
    Inputs:
        W (array): (d,C) matrix of parameters for the C classes
        X (array): (N,d) data matrix of features; may or may not include the constant feature
    Outputs:
        scores (array): (N,C) matrix of scores for each of the N examples and C classes
    '''
   
    if W.shape[0] == X.shape[1]+1: # need to add the constant feature
        X=np.hstack((X,np.ones((X.shape[0],1))))
    return np.dot(X,W)

In [11]:
def softmax(scores):
    '''
    IMPLEMENT THIS
    Compute the class posterior probabilities from the scores
    Inputs:
        scores (array): output of scores()
    Outputs:
        yhat (array): (N,C) matrix of class posteriors for each of the N examples and C classes
    Note: you may want to include the dynamic range shift trick discussed in class
    '''
    N = len(scores)
    C = len(scores[0])
    yhat = np.zeros((N, C))
    
      #softmax shift hasnt done yet
    # for i in range(N):
    #     maxEle = max(scores[i])

    #     # prevent overfolow
    #     each_row = list(map(lambda x: math.exp(x - maxEle), scores[i]))
    #     denominator = sum(each_row)
    #     for j in range(C):
    #       yhat[i][j] = each_row[j]/denominator
  
    for i in range(N):
        a = max(scores[i])
        row = list(map(lambda x:math.exp(x - a), scores[i]))
        denominator = sum(row)
        for j in range(C):
            yhat[i][j] = math.exp(scores[i][j] - a) / denominator
    return yhat

In [12]:
def convert_to_one_hot(y, num_values=10):
    """
    Expects a numpy array of labels like [4, 3, 0, ...]
    Outputs: one-hot encoding, e.g. for the above, assuming 10 classes (possible label values)
        [
            [0,0,0,0,1,0,0,0,0,0],
            [0,0,0,1,0,0,0,0,0,0],
            [1,0,0,0,0,0,0,0,0,0],
            ....
        ]
    """
    one_hot_labels = np.zeros((y.size, num_values))
    one_hot_labels[np.arange(y.size), y] = 1
    return one_hot_labels


In [13]:
def logLikelihood(yhat,y):
    '''
    IMPLEMENT THIS
    Compute the log-likelihood of the model which produced posterior in yhat corresponding to true labels in y
    Inputs:
        yhat (array): (N,C) posterior matrix like that returned by softmax()
        y (array): (N,C) true labels in one-hot format, or
                   (N,) class labels -- your choice (or the function can handle both)
    Outputs:
        ll (float): the average log-likelihood 
    '''
    N = len(yhat)
    C = len(yhat[0])

    res = 0
    for i in range(N):
        res += math.log(yhat[i][y[i]])
    res = res / N

    return res


In [14]:
idx = list(np.random.randint(10, size = 5))
idx

[0, 4, 0, 3, 1]

In [15]:
def trainSoftmax(X,Y,params):
    '''
    IMPLEMENT THIS
    Inputs:
        X (array): a (N,d) data matrix of features. Does NOT include the constant feature
                   (you need to add it if you want to use it)
        Y (array): a (N,) vector of class labels
        params (dictionary): specification for hyper-parameters. Must include at least the following:
          'num_epochs' : max number of epochs for training;
          'lr' : the (initial) learning rate;
          'scheduler': the function that takes (lr,e,arg) and updates the lr at the end of epoch e;
          'lr_update': the arg to pass to scheduler;
          'batch_size': the size of the mini-batches to use in SGD
          
    Outputs:
        W (array): an (d,C) matrix of parameters
        NLL: the list of values of negative average log-likelihood of the softmax model on the training data, 
             one number for each training epoch covered.
    '''
    print("hahha")
    num_epochs = params['num_epochs']
    lr = params['lr']
    scheduler = params['scheduler']
    lr_update = params['lr_update']
    batch_size = params['batch_size']

    epoch = 0
    N = len(X)
    d = len(X[0])

    W = np.zeros((d, N))
    NLL = np.zeros(num_epochs)

    while (epoch < num_epochs):
        if (batch_size > N):
            print("ERROR: Batch Size larger than sample size")
            exit(-1)
        
        avg_grad = 0
        idx = random.choices(range(N), k = batch_size) 
        #idx = list(np.random.randint(N, size = batch_size))
        for i in idx: # x is 1 x d
            y = Y[i]
            row_score = np.matmul(X[i], W) # 10 scores of w * x
            m = max(row_score) # max element in x * w
            denominator = sum(list(map(lambda a:math.exp(a - m), row_score)))

            single_score = np.dot(X[i], W[:,y]) # top one score of w_c * x
            numerator = X[i] * math.exp(single_score - m)
            
            grad = - X[i] + numerator / denominator
            avg_grad += grad
            W[:, y] -= lr * grad # maybe wrong 

        # for i in idx:

        #     # perform gradient descent
        #     # calculate gradient at first, according to the formula derived in
        #     # Problem 5, we need X[i], and the softmax probability

        #     true_class = Y[i]
        #     predict = np.matmul(X[i], W)
        #     max_val = max(predict)

        #     each_row = list(map(lambda x: math.exp(x - max_val), predict))
        #     numerator = each_row[true_class]
        #     denominator = sum(each_row)

        #     grad = X[i] - X[i] * (numerator/denominator)
        #     avg_grad += (grad / batch_size)
        
        #     W[:, true_class] -= lr * avg_grad
            
        scores_mat = scores(W, X)
        yhat = softmax(scores_mat)

        #y = convert_to_one_hot(Y, num_values=10)
        NLL[epoch] = logLikelihood(yhat, Y)
        
        
        lr = stepSchedule(lr,epoch, lr_update)

        epoch += 1
        print("epoch number:{}".format(epoch))
        
    return W,NLL

In [16]:
# example of setting the params list to pass to the training code; add your own elements that
# specify stopping criteria and anything else
params={'lr':.1, 'num_epochs':40,'scheduler': stepSchedule,'batch_size':200,'lr_update':[(20,.5),(30,.1)]}

Once you complete the code, follow the problem set assignment, run the experiments, and report your findings.

In [17]:
W, NLL = trainSoftmax(trainX, trainy, params)
predict = softmax(scores(W, testX))
N = len(testX)
y_predict = np.zeros(N)
for i in range(N):
    y_predict[i] = np.argmax(predict[i])

acc = compute_accuracy(y_predict, testy)
print("acc", acc)
print(NLL)

hahha


In [None]:
W, NLL = trainSoftmax(trainX, trainy, params)
predict = softmax(scores[W, valX])
N = len(valX)
y_predict = np.zeros(N)
for i in range(y_predict):
    y_predict[i] = np.argmax(predict[i])
