In [83]:
from fashion_mnist_dataset.utils import mnist_reader
import numpy as np

def normalize(data):
    """
    Description:
                Returns the min-max normalized data.
    Parameters:  
                data: array_like
                    Data to be normalized.
    Returns:
                normData: ndarray
                    Normalized data.
    """
    minVal = np.amin(data)
    maxVal = np.amax(data)
    
    normData = data - minVal
    normData = normData/(maxVal - minVal)
    
    return normData

def one_hot_encode(labels):
    """
    Description:
                Returns the one-hot encoding for a set of labels.
    Parameters:  
                labels: array_like
                    Labels to be one-hot encoded.
    Returns:
                oneHot: ndarray
                    One-hot encoded labels that is numData X numLabels.
    """
    numLabels = np.amax(labels) - np.amin(labels) + 1
    oneHot = np.zeros((labels.shape[0], numLabels))
    
    # See https://numpy.org/devdocs/user/basics.indexing.html#indexing-multi-dimensional-arrays
    oneHot[np.array(range(labels.shape[0])), labels] = 1
    
    return oneHot

def shuffle(data, labels):
    """
    Description:
                Shuffle the data while maintaining proper labeling.
    Parameters: 
                data: array_like
                    Data to be shuffled.
                labels: array_like
                    Labels for each data point.
    Returns:
                dataShuffle: ndarray
                    The shuffled data.
                labelShuffle: ndarray
                    The proper labels for the shuffled data.
    """
    randIdxs = np.random.rand(data.shape[0]).argsort()
    dataShuffle  = np.take(data, randIdxs, axis=0)
    labelShuffle = np.take(labels, randIdxs, axis=0)
    
    return dataShuffle, labelShuffle

def pca(data, comps):
    """
    Description:
                Performs dimensionality reduction on data using Principle Component Analysis and returns the top comps PC's
    Parameters:  
                data: array_like
                    Data to perform PCA on.
                comps: int
                    Number of components to return
    Returns:
                k_pcs: ndarray
                    The top k principle components.
    """
    # using procedure from slide 18: https://piazza.com/class_profile/get_resource/kfrlkk85pei2ma/kg1qafsrf721vt
    mean = np.mean(data, axis=0)
    A = data - mean
    C = np.cov(A.T) # np.cov assumes columns are observations by default
    
    # returns e_vals in ascending order, e_vecs as column vectors
    # we use eigh becuase C is symmetric (faster runtime)
    e_vals, e_vecs = np.linalg.eigh(C)
    k_pcs = e_vecs[:,-comps:]

    return k_pcs

def get_folds(data, k):
    """
    Description:
                Returns the indices from the data set of training and test data
                for k-folds cross validation.
    Parameters:  
                data: array_like
                   Data set to split.
                k: int
                    Number of folds.
    Returns:
                folds: list
                    Contains a list of array pairs <train, test> that are the 
                    indicies from the data set for training and testing.
    """
    folds = []
    
    size = int(data.shape[0]/k) # size of each fold
    for i in range(0, data.shape[0], size):
        train = list(range(i)) + list(range(i+size, data.shape[0]))
        test  = list(range(i, i+size))
        folds.append((train, test))
    
    return folds

def getData(X, y, classes):
    """
    Description:
                Returns data for specific classes indicated from provided data.
    Parameters:  
                X: array_like
                    The original dataset.
                y: array_like
                    The labels for each data point in the dataset.
                classes: array_like
                    Labels of the classes desired.
    Returns:
                new_data:
                    Data for the classes specified by classes.
                new_labels:
                    Labels for the classes specified by classes.
    """
    new_data   = []
    new_labels = []

    for label in classes:
        # see the following link for more information: https://numpy.org/doc/stable/reference/generated/numpy.argwhere.html
        # .T[0] is because argwhere() returns a column vector and we need normal array to index data
        idxs = np.argwhere(y==label).T[0]
        new_data.append(X[idxs])
        new_labels.append(y[idxs])
    
    new_data   = np.vstack(new_data)
    new_labels = np.hstack(new_labels)

    return new_data, new_labels

def prepareData(X, k_pcs, fold):
    """
    Description:
                Prepares the data for training by performing k mutex splits, 
                PCA, and adding a bias term to the data. 
    Parameters:  
                X: array_like
                    The original dataset.
                k_pcs: array_like
                    The top k principal components of the dataset.
    Returns:
                train_set: array_like
                    Data ready for training.
                val_set: array_like
                    Data ready for validation testing.
    """
    train_set = X[fold[0]]@k_pcs # get training, reduce using PCA
    train_set = np.hstack((np.ones((train_set.shape[0], 1)), train_set)) # add bias
    
    val_set   = X[fold[1]]@k_pcs # get validation, reduce using PCA
    val_set   = np.hstack((np.ones((val_set.shape[0], 1)), val_set)) # add bias
    
    
    return train_set, val_set

def logisticRegression(data, labels):
    """
    Description:
                TODO Trains a prediction model using x_train, y_train
    Parameters:  
                data: array_like
                    Training examples.
                labels: array_like
                    Labels of the training examples.
    Returns:
                model: array_like
                    The best model from training used for prediction.
    """
    LR = .01 # Learning rate
    BS = 512 # Batch Size
    p  = 50  # top p PC's
    k  = 4   # number of folds
    M  = 100 # number of epochs
    classes = [0, 9] # classes we want for binary classification
    model   = None # best model from training
    
    X_train, y_train = getData(data, labels, classes)
    X_train, y_train = shuffle(X_train, y_train)

    folds = get_folds(X_train, k=k)
    k_pcs = pca(data, comps=p)
    W = np.random.rand(X_train.shape[0], p+1) # initialize weights
    
    for fold in folds:
        train_set, val_set = prepareData(X_train, k_pcs, fold)
        
        for epoch in range(M):
            continue
    

In [84]:
if __name__ == "__main__":
    X_train, y_train = mnist_reader.load_mnist('fashion_mnist_dataset/data/fashion', kind='train')
    X_test, y_test   = mnist_reader.load_mnist('fashion_mnist_dataset/data/fashion', kind='t10k')
    
    # min-max normalize
    X_train = normalize(X_train)
    X_test  = normalize(X_test)
    
    model = logisticRegression(X_train, y_train)
    
    

(3000, 50)
[1. 1. 1. ... 1. 1. 1.]


TypeError: cannot unpack non-iterable NoneType object