In [1]:
import numpy as np
import os
from PIL import Image
from pathlib import Path 
from sklearn.preprocessing import StandardScaler

# Please write the optimal hyperparameter values you obtain in the global variable 'optimal_hyperparam' below.
# This variable should contain the values when I look at your submission. I should not have to run your code to
# populate this variable.
optimal_hyperparam: dict = {}

class COC131:
    """
    Coursework container: 'q1' loads the image dataset, 'q2' standardizes data and 'q3'-'q6' are placeholders
    that currently return fixed dummy values.
    """

    # Root folder of the dataset; its immediate subfolders are used as the class names.
    DATASET_DIR = "../dataset"
    # Target (width, height) every image is down-sampled to before flattening.
    IMAGE_SIZE = (32, 32)

    @staticmethod
    def _load_image(image_path):
        """Return the image at 'image_path' resized to IMAGE_SIZE as a flat (row-major) float array."""
        # The context manager releases the underlying file handle as soon as the pixels are copied.
        with Image.open(image_path) as img:
            return np.array(img.resize(COC131.IMAGE_SIZE), dtype=float).flatten()

    def q1(self, filename=None):
        """
        This function should be used to load the data. To speed-up processing in later steps, lower resolution of the
        image to 32*32. The folder names in the root directory of the dataset are the class names. After loading the
        dataset, you should save it into an instance variable self.x (for samples) and self.y (for labels). Both self.x
        and self.y should be numpy arrays of dtype float.

        Implementation notes: every '.jpg' file in every subfolder of DATASET_DIR is loaded. Folders and files are
        visited in sorted order so that the row order of self.x / self.y is reproducible between runs and machines.
        If 'filename' is falsy, or no image with that name exists, the whole dataset (self.x, self.y) is returned
        instead of (res1, res2).

        :param filename: this is the name of an actual random image in the dataset. You don't need this to load the
        dataset. This is used for testing the implementation.
        :return res1: a one-dimensional numpy array containing the flattened low-resolution image in file 'filename'.
        Flatten the image in the row major order. The dtype for the array should be float.
        :return res2: a string containing the class name for the image in file 'filename'. This string should be the
        same as one of the folder names in the originally shared dataset.
        """

        # Get all subfolders within the dataset directory (sorted: os.scandir yields entries in arbitrary order)
        subfolders = sorted(entry.path for entry in os.scandir(self.DATASET_DIR) if entry.is_dir())

        # Creates a list of all paths for the images stored within the dataset directory
        image_paths = [
            os.path.join(subfolder, file)
            for subfolder in subfolders
            for file in sorted(os.listdir(subfolder))
            if file.endswith('.jpg')  # If the file is an image
        ]

        # Processes each image within the dataset directory and stores it as one row of self.x
        self.x = np.array([self._load_image(image_path) for image_path in image_paths])

        # Gets the name of the folder that the image is stored in (the classification of the image)
        # NOTE(review): the labels are stored as strings, while the task text above asks for dtype float - confirm
        # whether a numeric encoding is expected.
        self.y = np.array([os.path.basename(os.path.dirname(image_path)) for image_path in image_paths])

        # If a filename is provided, find and return it
        if filename:
            # Compare whole file names; a substring test could also match a different file or a folder name.
            wanted = os.path.basename(filename)
            for index, image_path in enumerate(image_paths):
                if os.path.basename(image_path) == wanted:
                    # Reuse the row that was just loaded instead of decoding the image a second time. A copy is
                    # returned so the caller cannot modify self.x through it.
                    return self.x[index].copy(), str(self.y[index])

        return self.x, self.y

    def q2(self, inp):
        """
        This function should compute the standardized data from a given 'inp' data. The function should work for a
        dataset with any number of features.

        Implementation notes: the scaling to a standard deviation of 2.5 is applied after the StandardScaler, so the
        returned scaler object on its own produces the unscaled standardized data.

        :param inp: an array from which the standardized data is to be computed.
        :return res2: a numpy array containing the standardized data with standard deviation of 2.5. The array should
        have the same dimensions as the original data
        :return res1: sklearn object used for standardization.
        """

        standard_scaler = StandardScaler()
        standardised_data = standard_scaler.fit_transform(inp)

        res1 = standard_scaler
        # Stretch the standardized data so that its standard deviation becomes 2.5
        res2 = standardised_data * 2.5

        return res2, res1

    def q3(self, test_size=None, pre_split_data=None, hyperparam=None):
        """
        This function should build a MLP Classifier using the dataset loaded in function 'q1' and evaluate model
        performance. You can assume that the function 'q1' has been called prior to calling this function. This function
        should support hyperparameter optimizations.

        Not implemented yet: placeholder values are returned and all parameters are ignored.

        :param test_size: the proportion of the dataset that should be reserved for testing. This should be a fraction
        between 0 and 1.
        :param pre_split_data: Can be used to provide data already split into training and testing.
        :param hyperparam: hyperparameter values to be tested during hyperparameter optimization.
        :return: The function should return 1 model object and 3 numpy arrays which contain the loss, training accuracy
        and testing accuracy after each training iteration for the best model you found.
        """

        # normalize data

        res1 = object()
        res2 = np.zeros(1)
        res3 = np.zeros(1)
        res4 = np.zeros(1)

        return res1, res2, res3, res4

    def q4(self):
        """
        This function should study the impact of alpha on the performance and parameters of the model. For each value of
        alpha in the list below, train a separate MLPClassifier from scratch. Other hyperparameters for the model can
        be set to the best values you found in 'q3'. You can assume that the function 'q1' has been called
        prior to calling this function.

        Not implemented yet: a placeholder array is returned.

        :return: res should be the data you visualized.
        """

        # Values of alpha to be studied; not used until the question is implemented
        alpha_values = [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 50, 100]

        res = np.zeros(1)

        return res

    def q5(self):
        """
        This function should perform hypothesis testing to study the impact of using CV with and without Stratification
        on the performance of MLPClassifier. Set other model hyperparameters to the best values obtained in the previous
        questions. Use 5-fold cross validation for this question. You can assume that the function 'q1' has been called
        prior to calling this function.

        Not implemented yet: placeholder values are returned.

        :return: The function should return 4 items - the final testing accuracy for both methods of CV, p-value of the
        test and a string representing the result of hypothesis testing. The string can have only two possible values -
        'Splitting method impacted performance' and 'Splitting method had no effect'.
        """

        res1 = 0
        res2 = 0
        res3 = 0
        res4 = ''

        return res1, res2, res3, res4

    def q6(self):
        """
        This function should perform unsupervised learning using LocallyLinearEmbedding in Sklearn. You can assume that
        the function 'q1' has been called prior to calling this function.

        Not implemented yet: a placeholder array is returned.

        :return: The function should return the data you visualize.
        """

        res = np.zeros(1)

        return res

In [2]:
# Manual check of q1: first request a single image by file name, then request the whole dataset.
# Note that every call to q1 re-reads the complete dataset directory before it returns.
dataset = COC131()
image_array, class_label = dataset.q1("Forest_1.jpg")  # Change to an actual filename in your dataset
print(class_label)
# Without a filename q1 returns the full sample and label arrays (self.x, self.y)
images, labels = dataset.q1()
print(images)
print(labels)

Forest
[[148. 121. 120. ... 106.  94. 104.]
 [ 56.  98.  83. ... 173. 127. 112.]
 [ 69.  93.  96. ... 204. 160. 150.]
 ...
 [ 32.  55.  80. ...  32.  53.  81.]
 [ 54.  71.  80. ...  53.  73.  82.]
 [102.  99. 108. ...  96.  96. 106.]]
['AnnualCrop' 'AnnualCrop' 'AnnualCrop' ... 'SeaLake' 'SeaLake' 'SeaLake']
