# PSet1 Coding Problems

Most notebooks will start with a setup, including loading important packages. 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# ^^ Predefined ‘magic function’ telling Jupyter to display images inline (rather than pop up a separate window)
import matplotlib.image as mimg
# ^^ package to help us read in images
import numpy as np
import random
import os

## Provided functions

FashionMNIST contains 10 classes, labeled 0, 1, 2, ..., 9 in the dataset. Below is the mapping from numeric labels to class names:

In [None]:
# Class names listed in label order: the position in the list is the numeric label.
classes_dict = dict(enumerate([
    'T-shirt/top',  # 0
    'Trouser',      # 1
    'Pullover',     # 2
    'Dress',        # 3
    'Coat',         # 4
    'Sandal',       # 5
    'Shirt',        # 6
    'Sneaker',      # 7
    'Bag',          # 8
    'Ankle boot',   # 9
]))

In [None]:
def makeDataBase(basename,partition='train',N=None, shuffle=False,normalize=True):
    '''
    This function will create one database that will contain images with their labels
    The data are supposed to be in the paths consisting of
     <basename> / <partition> / <category> / <category>-<index>.jpg
     e.g., FashionMNIST/val/4/4-37.jpg
    For each category the files with index 0, 1, ..., n-1 are read, so the indices
    are expected to be contiguous and to start at 0.
    Inputs:
        basename (str)  : name of the folder containing all the data. It should be "FashionMNIST"
        partition (str) : "train" or "val" or "test"
        N (int)         : number of examples for each category; when N=None, all samples will be loaded
                          (only files named <category>-*.jpg are counted, other directory entries are ignored)
        shuffle (bool)  : boolean value; if False, samples from category 0 will be returned first and samples from category 9 last
                          if True, samples will be randomly shuffled
        normalize (bool): boolean value; if True, pixel values x are mapped to x/255*2-1
                          (i.e. [0,255] becomes [-1,1]); if False, they are only cast to float32
    Outputs:
        database (list) : list of tuples (x,y). x is image data (float32 array). y is numeric label of x

    '''
    database=[]
    for label in range(10):
        label_dir = os.path.join(basename, partition, str(label))
        prefix = str(label) + '-'
        if N is None:
            # Count only files that follow the <category>-<index>.jpg naming scheme.
            # Counting every directory entry would include stray files or folders
            # (e.g. .DS_Store, .ipynb_checkpoints) and make the loop below ask for
            # an image index that does not exist.
            n = sum(1 for entry in os.listdir(label_dir)
                    if entry.startswith(prefix) and entry.endswith('.jpg'))
        else:
            n = N
        for i in range(n):
            imageName = os.path.join(label_dir, prefix + str(i) + '.jpg')
            imageData = np.float32(mimg.imread(imageName))
            if normalize:
                imageData = imageData/255*2-1
            database.append((imageData,label))
    if shuffle:
        random.shuffle(database)
    return database



In [None]:
def list2ndarray(dataset):
    '''
    Convert the list produced by makeDataBase() into a feature matrix and a label vector.
    Inputs:
        dataset (list): list of (sample, label) pairs
    Outputs:
        X (array)     : float32 numpy array with size (N,D), where N is the length of dataset and D is 28*28.
                        Row i holds the i-th image sample, flattened
        y (array)     : integer numpy array with size (N,); y[i] is the numeric label of row i of X
    '''
    count = len(dataset)
    # Preallocate both outputs, then fill them one pair at a time.
    X = np.empty((count, 28*28), dtype=np.float32)
    y = np.empty((count,), dtype=int)
    for row, pair in enumerate(dataset):
        X[row] = pair[0].flatten()
        y[row] = pair[1]

    return X, y

In [None]:
def compute_accuracy(y_hat,y):
    '''
    This function takes predicted labels and ground truth labels and returns the accuracy
    Inputs:
        y_hat (array): (N,)-shaped numpy array (or sequence) containing predicted labels
        y (array)    : (N,)-shaped numpy array (or sequence) containing ground truth labels
    Outputs:
        accu (float) : accuracy between [0.,1.]
    Raises:
        ValueError   : if y_hat and y do not have the same shape
    '''
    # Convert first: comparing two plain Python lists with == gives a single bool,
    # not an element-wise result, which would silently produce a wrong accuracy.
    y_hat = np.asarray(y_hat)
    y = np.asarray(y)
    # Refuse mismatched shapes instead of letting numpy broadcast them,
    # e.g. (N,1) against (N,) would be compared as an N x N matrix.
    if y_hat.shape != y.shape:
        raise ValueError(f'y_hat has shape {y_hat.shape} but y has shape {y.shape}')
    accu = np.count_nonzero(y_hat==y)/len(y)
    return accu

## Problem 7

In [None]:
# First we prepare the training data.
# Try different N: 10/50/100/1000. You can also try omitting N, in which case all
# available training images are loaded. The numbers per class are a bit different,
# but there is a total of 50,000 images; everything will be slower with more data.
trainset = makeDataBase(
    'FashionMNIST',
    partition='train',
    N=100,
    normalize=True,
)

# Convert the list to matrix/vector format, which makes computation easier.
trainX, trainy = list2ndarray(trainset)

In [None]:
# Now also load val and test (these are of fixed size); each partition is
# converted to matrix/vector format right after it is read.
valset = makeDataBase('FashionMNIST', partition='val', normalize=True)
valX, valy = list2ndarray(valset)

testset = makeDataBase('FashionMNIST', partition='test', normalize=True)
testX, testy = list2ndarray(testset)

In [None]:
# Sanity check: display the first training image and show its class name in the title.
# Each entry of trainset is an (image, label) tuple; classes_dict turns the numeric label into a name.
# Play with this a bit (e.g. change the index) to familiarize yourself with the kind of data you are working with.
plt.imshow(trainset[0][0],cmap='gray')
plt.title(f'This is a {classes_dict[trainset[0][1]]}')
plt.show()

In [None]:
def KNN_predict(trainX, trainy, testX, k=1):
    '''
    IMPLEMENT THIS
    This function predicts labels using the KNN algorithm
    Inputs:
        trainX (array): training data. numpy array with size (N,D)
        trainy (array): label of training data. numpy array with size (N,)
        testX (array) : testing data. numpy array with size (M,D)
        k (int)       : number of nearest neighbors.
    Outputs:
        y_hat (array) : predicted label of testX, numpy array with size (M,)
    Raises:
        NotImplementedError: always, until the placeholder below is replaced by an implementation


    Hint:
    Step1: construct L2 distance matrix D with size (N,M) where len(trainX)=N and len(testX)=M
    Step2: For each column in D, select lowest k values and get their indices (row number)
    Step3: For each sample in testX, find the labels of its k-nearest neighbors
    Step4: Assign a label to each testX sample by a plurality vote of its neighbors
    Step5: return predicted labels of testX. It should be a numpy array with size (M,)
    '''
    # Placeholder: replace this line with your implementation, ending in `return y_hat`.
    # Raising here gives a clear message instead of a NameError on an undefined y_hat.
    raise NotImplementedError('KNN_predict is not implemented yet')

In [None]:
# Predict labels for the validation set with KNN. Try different k: 1, 3, 7, 15, 25.
y_hat = KNN_predict(trainX, trainy, valX, k=1)
accuracy = compute_accuracy(y_hat, valy)
# 0 means all wrong; 1.0 means all correct. Random guessing should give you ~0.1, as there are 10 classes.
print(f'Accuracy for your KNN predictor is {accuracy}')

After trying out all combinations of k and N, select the k and N that give the best validation accuracy, then compute the accuracy on the test partition.

In [None]:
#Compute test partition accuracy here

## Problem 10

In [None]:
def train_perceptron(X,y, max_epoch=20, lr=1., lr_decay=0.95, stop_threshold=0.02):
    '''
    IMPLEMENT THIS
    This function trains a 10-way perceptron and returns learned parameters of the classifier
    Inputs:  (feel free to add more optional arguments, if you want)
        X (array)             : training data. numpy array with size (N,D)
        y (array)             : label of training data. numpy array with size (N,)
        max_epoch (int)       : max number of epochs to run
        lr (float)            : learning rate
        lr_decay (float)      : multiplier for lr at the end of each epoch; range (0,1]
        stop_threshold (float): early stopping if the fraction of errors made by the model is lower than this number; range [0,1]
    Outputs:
        W (array)             : numpy array with size (num_classes, D)
        b (array)             : numpy array with size (num_classes,)
    Raises:
        NotImplementedError   : always, until the placeholder below is replaced by an implementation
    '''
    # Placeholder: replace this line with your implementation, ending in `return W, b`.
    # Raising here gives a clear message instead of a NameError on an undefined W.
    raise NotImplementedError('train_perceptron is not implemented yet')

In [None]:
def test_perceptron(X, W, b):
    '''
    IMPLEMENT THIS
    This function predicts labels of data X using learned W and b
    Inputs:
        X (array)    : testing data. numpy array with size (N, D)
        W (array)    : numpy array with size (num_classes, D)
        b (array)    : numpy array with size (num_classes,) 
    Outputs:
        y_hat (array): numpy array with size (N,)
    '''
    return y_hat

In [None]:
# You can load all data with N=None, but training will then take a long time.
# Set shuffle=True so that the samples are returned in random order.
trainset = makeDataBase(
    'FashionMNIST',
    partition='train',
    N=1000,
    normalize=True,
    shuffle=True,
)
trainX, trainy = list2ndarray(trainset)

In [None]:
# Train the perceptron, then score it on the validation partition.
# Tune lr on the validation partition.
W, b = train_perceptron(trainX, trainy, lr=1.0)
y_hat = test_perceptron(valX, W, b)
accu = compute_accuracy(y_hat, valy)
# Report the result; without this the cell computes accu but shows nothing.
print(f'Validation accuracy for your perceptron is {accu}')

In [None]:
#Compute test partition accuracy here, with your best lr

## Problem 11

In [None]:
# Reload the training data with shuffle=False, so samples come ordered by category
# (category 0 first, category 9 last), then train and validate the perceptron on it.
trainset = makeDataBase('FashionMNIST',partition='train',N=1000, normalize=True, shuffle=False)
trainX, trainy = list2ndarray(trainset)
W, b = train_perceptron(trainX, trainy, lr=1.0)
y_hat = test_perceptron(valX, W, b)
accu = compute_accuracy(y_hat, valy)
# Report the result; without this the cell computes accu but shows nothing.
print(f'Validation accuracy for your perceptron without shuffling is {accu}')

In [None]:
#What do you observe? What could be the reason?