Merge pull request #86 from collinleiber/ddc

add DDC + video datasets
collinleiber · Feb 12, 2024 · 8f67d15 · 8f67d15
2 parents 4afb304 + ccf4091
commit 8f67d15
Show file tree

Hide file tree

Showing 55 changed files with 4,594 additions and 2,995 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -34,7 +34,7 @@ jobs:
           command: |
             python -m pip install --upgrade pip
             pip install pytest
-            pip install -e .
+            pip install -e .[full]
       - run:
           name: Run tests
           no_output_timeout: 30m

diff --git a/.github/workflows/test-main.yml b/.github/workflows/test-main.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install pytest pytest-cov
-          pip install -e .
+          pip install -e .[full]
 
       - name: Test with pytest
         run: |

diff --git a/CITATION.cff b/CITATION.cff
@@ -26,10 +26,10 @@ preferred-citation:
     - family-names: "Böhm"
       given-names: "Christian"
       orcid: "https://orcid.org/0000-0002-2237-9969"
-  doi: "10.1145/3583780.3615290"
-  booktitle: "Proceedings of the 32nd ACM International Conference on Information and Knowledge Management"
-  start: 5208 # First page number
-  end: 5211 # Last page number
-  title: "Application of Deep Clustering Algorithms"
+  doi: "10.1109/ICDMW60847.2023.00087"
+  booktitle: "2023 IEEE International Conference on Data Mining Workshops (ICDMW)"
+  start: 625 # First page number
+  end: 632 # Last page number
+  title: "Benchmarking Deep Clustering Algorithms With ClustPy"
   year: 2023
-  publisher: "Association for Computing Machinery"
+  publisher: "IEEE"
diff --git a/README.md b/README.md
diff --git a/clustpy/data/__init__.py b/clustpy/data/__init__.py
@@ -1,19 +1,23 @@
 from .synthetic_data_creator import create_subspace_data, create_nr_data
-from .real_world_data import load_har, load_letterrecognition, \
-    load_optdigits, load_pendigits, load_newsgroups, load_iris, load_wine, load_breast_cancer, load_reuters, \
-    load_banknotes, load_htru2, load_mice_protein, load_ecoli, load_spambase, load_seeds, \
-    load_statlog_shuttle, load_forest_types, load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, \
-    load_user_knowledge, load_dermatology, load_multiple_features, load_statlog_australian_credit_approval, \
-    load_breast_cancer_wisconsin_original, load_semeion, load_imagenet_dog, load_imagenet10, load_coil20, load_coil100
+from .real_world_data import load_newsgroups, load_iris, load_wine, load_breast_cancer, load_reuters, load_imagenet_dog, \
+    load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb
+from .real_uci_data import load_har, load_letterrecognition, load_optdigits, load_pendigits, load_banknotes, load_htru2, \
+    load_mice_protein, load_ecoli, load_spambase, load_seeds, load_statlog_shuttle, load_forest_types, \
+    load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, load_user_knowledge, load_dermatology, \
+    load_multiple_features, load_statlog_australian_credit_approval, load_breast_cancer_wisconsin_original, \
+    load_semeion, load_cmu_faces
 from .real_timeseries_data import load_motestrain, load_olive_oil, load_symbols, load_diatom_size_reduction, \
     load_proximal_phalanx_outline, load_plane, load_sony_aibo_robot_surface, load_two_patterns, load_lsst
-from .real_nr_data import load_aloi_small, load_fruit, load_nrletters, load_stickfigures, load_webkb, load_cmu_faces
+from .real_clustpy_data import load_aloi_small, load_fruit, load_nrletters, load_stickfigures
 from .real_torchvision_data import load_usps, load_mnist, load_fmnist, load_kmnist, load_svhn, load_cifar10, load_stl10, \
-    load_gtsrb
+    load_gtsrb, load_cifar100
 from .real_medical_mnist_data import load_path_mnist, load_chest_mnist, load_derma_mnist, load_oct_mnist, \
     load_pneumonia_mnist, load_retina_mnist, load_breast_mnist, load_blood_mnist, load_tissue_mnist, load_organ_a_mnist, \
     load_organ_c_mnist, load_organ_s_mnist, load_organ_mnist_3d, load_nodule_mnist_3d, load_adrenal_mnist_3d, \
     load_fracture_mnist_3d, load_vessel_mnist_3d, load_synapse_mnist_3d
+from clustpy.data.real_video_data import load_video_weizmann, load_video_keck_gesture
+from clustpy.data.preprocessing import ZNormalizer, z_normalization
+from clustpy.data._utils import flatten_images, unflatten_images
 
 __all__ = ['create_subspace_data',
            'create_nr_data',
@@ -29,6 +33,7 @@
            'load_iris',
            'load_wine',
            'load_breast_cancer',
+           'load_olivetti_faces',
            'load_reuters',
            'load_banknotes',
            'load_htru2',
@@ -88,4 +93,11 @@
            'load_imagenet10',
            'load_gtsrb',
            'load_coil20',
-           'load_coil100']
+           'load_coil100',
+           'load_video_weizmann',
+           'load_video_keck_gesture',
+           'ZNormalizer',
+           'z_normalization',
+           'load_cifar100',
+           'flatten_images',
+           'unflatten_images']
diff --git a/clustpy/data/_utils.py b/clustpy/data/_utils.py
@@ -75,13 +75,22 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si
         chink size when downloading the file (default: 32768)
     """
     print("Downloading data set {0} from Google Drive to {1}".format(file_id, filename_local))
-    URL = "https://docs.google.com/uc?export=download&confirm=1"
+    URL = "https://drive.google.com/uc"
     session = requests.Session()
-    response = session.get(URL, params={"id": file_id, "confirm": 1}, stream=True)
+    response = session.get(URL, params={"id": file_id, "confirm": "t"}, stream=True)
+    if response.text.startswith("<!DOCTYPE"):
+        # Large files cannot be downloaded directly: the first response is an HTML page whose download form provides the URL and UUID for a second request
+        try:
+            URL_extracted = response.text.split("download-form\" action=\"")[1].split("\" method=\"get\"")[0]
+            uuid = response.text.split("name=\"uuid\" value=\"")[1].split("\">")[0]
+        except:
+            raise Exception("[ERROR] New URL and UUID could not be extracted from first request in _download_file_from_google_drive")
+        response = session.get(URL_extracted, params={"id": file_id, "confirm": "t", "uuid": uuid}, stream=True)
     with open(filename_local, "wb") as f:
         for chunk in response.iter_content(chunk_size):
             if chunk:  # filter out keep-alive new chunks
                 f.write(chunk)
+    session.close()
 
 
 def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> (
@@ -147,14 +156,14 @@ def _decompress_z_file(filename: str, directory: str) -> bool:
     return successful
 
 
-def _load_image_data(path: str, image_size: tuple, color_image: bool) -> np.ndarray:
+def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.ndarray:
     """
     Load image and convert it into a coherent size. Returns a numpy array containing the image data.
 
     Parameters
     ----------
-    path : str
-        Path to the image
+    image : str or np.ndarray
+        Path to the image. Can also be a numpy array containing the pixel values of the image
     image_size : tuple
         images of various sizes can be converted into a coherent size.
         The tuple equals (width, height) of the images.
@@ -167,14 +176,85 @@ def _load_image_data(path: str, image_size: tuple, color_image: bool) -> np.ndar
     image_data : np.ndarray
         The numpy array containing the image data
     """
-    image = Image.open(path)
+    if type(image) is str:
+        pil_image = Image.open(image)
+    else:
+        pil_image = Image.fromarray(np.uint8(image))
     if color_image:
-        image = image.convert("RGB")
+        pil_image = pil_image.convert("RGB")
     # Convert to coherent size
     if image_size is not None:
-        image = image.resize(image_size)
-    image_data = np.asarray(image)
+        pil_image = pil_image.resize(image_size)
+    image_data = np.asarray(pil_image)
     assert image_size is None or image_data.shape == (
         image_size[0], image_size[1], 3), "Size of image is not correct. Should be {0} but is {1}".format(image_size,
                                                                                                           image_data.shape)
     return image_data
+
+
+def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
+    """
+    Convert data array from image to numerical vector.
+    Before flattening, color images will be converted to the HWC/HWDC (height, width, [depth,] color channels) format.
+
+    Parameters
+    ----------
+    data : np.ndarray
+        The given data set
+    format : str
+        Format of the data array. Can be: "HW", "HWD", "CHW", "CHWD", "HWC", "HWDC".
+        Abbreviations stand for: H: Height, W: Width, D: Depth, C: Color-channels
+
+    Returns
+    -------
+    data : np.ndarray
+        The flattened data array
+    """
+    format_possibilities = ["HW", "HWD", "CHW", "CHWD", "HWC", "HWDC"]
+    assert format in format_possibilities, "Format must be within {0}".format(format_possibilities)
+    if format == "HW":
+        assert data.ndim == 3
+    elif format in ["HWD", "CHW", "HWC"]:
+        assert data.ndim == 4
+    elif format in ["CHWD", "HWDC"]:
+        assert data.ndim == 5
+    # Flatten shape
+    if format != "HW" and format != "HWD":
+        if format == "CHW":
+            # Change representation to HWC
+            data = np.transpose(data, [0, 2, 3, 1])
+        elif format == "CHWD":
+            # Change representation to HWDC
+            data = np.transpose(data, [0, 2, 3, 4, 1])
+        assert data.shape[
+                   -1] == 3, "Color-channels must be in the last position and contain three channels not {0} ({1})".format(
+            data.shape[-1], data.shape)
+    data = data.reshape(data.shape[0], -1)
+    return data
+
+
+def unflatten_images(data_flatten: np.ndarray, image_size: tuple) -> np.ndarray:
+    """
+    Convert data array from numerical vector to image.
+    After unflattening, color images will be converted to the CHW/CHWD (color channels, height, width, [depth]) format.
+
+    Parameters
+    ----------
+    data_flatten : np.ndarray
+        The given flattened data set
+    image_size : tuple
+        The size of a single image, e.g., (28,28,3) for a colored image of size 28 x 28
+
+    Returns
+    -------
+    data_image : np.ndarray
+        The unflattened data array in image format
+    """
+    new_shape = tuple([-1] + [i for i in image_size])
+    data_image = data_flatten.reshape(new_shape)
+    # Change image from HWC/HWDC to CHW/CHWD
+    if data_image.ndim == 4 and image_size[-1] == 3:
+        data_image = np.transpose(data_image, (0, 3, 1, 2))
+    elif data_image.ndim == 5 and image_size[-1] == 3:
+        data_image = np.transpose(data_image, (0, 4, 1, 2, 3))
+    return data_image
diff --git a/clustpy/data/preprocessing.py b/clustpy/data/preprocessing.py
@@ -0,0 +1,147 @@
+from sklearn.base import TransformerMixin, BaseEstimator
+import numpy as np
+
+
+class ZNormalizer(TransformerMixin, BaseEstimator):
+    """
+    Normalize a data set by calculating (data - mean) / std.
+    In general, two strategies are sensible to normalize a data set.
+    Either use all features simultaneously for the normalization or normalize each feature separately.
+    In the case of image data, a feature-wise transformation usually corresponds to a channel-wise transformation.
+    If this normalizer should be applied to RGB image data, the color channels must be in the first dimension after the sample axis, known as CHW representation.
+
+    Parameters
+    ----------
+    feature_or_channel_wise : bool
+        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)
+
+    Attributes
+    ----------
+    shape : list
+        Shape of the data set with which this normalizer has been fitted (the first entry is replaced by -1)
+    mean : np.ndarray or float
+        Mean value(s) of the data set
+    std : np.ndarray or float
+        Standard deviation value(s) of the data set
+    """
+
+    def __init__(self, feature_or_channel_wise: bool = False):
+        self.feature_or_channel_wise = feature_or_channel_wise
+
+    def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'ZNormalizer':
+        """
+        Compute the mean and std values regarding the input data set.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            the given data set
+        y : np.ndarray
+            the labels (can be ignored)
+
+        Returns
+        -------
+        self : ZNormalizer
+            this instance of the ZNormalizer
+        """
+        self.shape = list(X.shape)
+        self.shape[0] = -1
+        if not self.feature_or_channel_wise or (X.ndim > 2 and 3 not in self.shape):
+            # Global normalization: feature_or_channel_wise is disabled, or the images are grayscale (2D or 3D), i.e., no dimension has size 3
+            self.std = np.std(X)
+            self.mean = np.mean(X)
+        elif self.feature_or_channel_wise and (X.ndim == 2 or (X.ndim in [4, 5] and X.shape[1] == 3)):
+            # In case of tabular data or RGB 2D or 3D images
+            self.std = np.array([np.std(X[:, j]) for j in range(self.shape[1])])
+            self.mean = np.array([np.mean(X[:, j]) for j in range(self.shape[1])])
+        else:
+            raise Exception(
+                "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
+                    self.feature_or_channel_wise, X.ndim))
+        return self
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """
+        Transform the given data set using the fitted mean and std values.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            the given data set
+
+        Returns
+        -------
+        X_out : np.ndarray
+            The transformed data set
+        """
+        assert list(X.shape)[1:] == self.shape[
+                                    1:], "The shape of the input data does not match the fitted transformation. Shape must be {0}".format(
+            self.shape)
+        X_out = X.astype(float)
+        if not self.feature_or_channel_wise or (X.ndim > 2 and 3 not in self.shape):
+            # Global normalization: feature_or_channel_wise is disabled, or it is enabled but the images are grayscale (no dimension has size 3)
+            X_out = (X_out - self.mean) / self.std
+        elif self.feature_or_channel_wise and X.ndim in [2, 4, 5]:
+            # In case of tabular data or RGB 2D or 3D images
+            for j in range(self.shape[1]):
+                X_out[:, j] = (X_out[:, j] - self.mean[j]) / self.std[j]
+        else:
+            raise Exception(
+                "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
+                    self.feature_or_channel_wise,
+                    X.ndim))
+        return X_out
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """
+        Invert the transformation by applying (data * std) + mean.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            the given data set
+
+        Returns
+        -------
+        X_out : np.ndarray
+            The data set after inverting the normalization
+        """
+        assert list(X.shape)[1:] == self.shape[
+                                    1:], "The shape of the input data does not match the fitted transformation. Shape must be {0}".format(
+            self.shape)
+        X_out = X.astype(float)
+        if not self.feature_or_channel_wise or (X.ndim > 2 and 3 not in self.shape):
+            # Global normalization: feature_or_channel_wise is disabled, or it is enabled but the images are grayscale (no dimension has size 3)
+            X_out = X_out * self.std + self.mean
+        elif self.feature_or_channel_wise and X.ndim in [2, 4, 5]:
+            # In case of tabular data or RGB 2D or 3D images
+            for j in range(self.shape[1]):
+                X_out[:, j] = X_out[:, j] * self.std[j] + self.mean[j]
+        else:
+            raise Exception(
+                "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
+                    self.feature_or_channel_wise,
+                    X.ndim))
+        return X_out
+
+
+def z_normalization(X: np.ndarray, feature_or_channel_wise: bool = False) -> np.ndarray:
+    """
+    Wrapper for the ZNormalizer.
+    It automatically executes: X_transform = ZNormalizer(feature_or_channel_wise).fit_transform(X)
+
+    Parameters
+    ----------
+    X : np.ndarray
+        the given data set
+    feature_or_channel_wise : bool
+        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)
+
+    Returns
+    -------
+    X_transform : np.ndarray
+        The transformed data set
+    """
+    znorm = ZNormalizer(feature_or_channel_wise)
+    X_transform = znorm.fit_transform(X)
+    return X_transform