Skip to content

Commit

Permalink
Merge pull request #86 from collinleiber/ddc
Browse files Browse the repository at this point in the history
add DDC + video datasets
  • Loading branch information
collinleiber committed Feb 12, 2024
2 parents 4afb304 + ccf4091 commit 8f67d15
Show file tree
Hide file tree
Showing 55 changed files with 4,594 additions and 2,995 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
command: |
python -m pip install --upgrade pip
pip install pytest
pip install -e .
pip install -e .[full]
- run:
name: Run tests
no_output_timeout: 30m
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install pytest pytest-cov
pip install -e .
pip install -e .[full]
- name: Test with pytest
run: |
Expand Down
12 changes: 6 additions & 6 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ preferred-citation:
- family-names: "Böhm"
given-names: "Christian"
orcid: "https://orcid.org/0000-0002-2237-9969"
doi: "10.1145/3583780.3615290"
booktitle: "Proceedings of the 32nd ACM International Conference on Information and Knowledge Management"
start: 5208 # First page number
end: 5211 # Last page number
title: "Application of Deep Clustering Algorithms"
doi: "10.1109/ICDMW60847.2023.00087"
booktitle: "2023 IEEE International Conference on Data Mining Workshops (ICDMW)"
start: 625 # First page number
end: 632 # Last page number
title: "Benchmarking Deep Clustering Algorithms With ClustPy"
year: 2023
publisher: "Association for Computing Machinery"
publisher: "IEEE"
163 changes: 99 additions & 64 deletions README.md

Large diffs are not rendered by default.

30 changes: 21 additions & 9 deletions clustpy/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
from .synthetic_data_creator import create_subspace_data, create_nr_data
from .real_world_data import load_har, load_letterrecognition, \
load_optdigits, load_pendigits, load_newsgroups, load_iris, load_wine, load_breast_cancer, load_reuters, \
load_banknotes, load_htru2, load_mice_protein, load_ecoli, load_spambase, load_seeds, \
load_statlog_shuttle, load_forest_types, load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, \
load_user_knowledge, load_dermatology, load_multiple_features, load_statlog_australian_credit_approval, \
load_breast_cancer_wisconsin_original, load_semeion, load_imagenet_dog, load_imagenet10, load_coil20, load_coil100
from .real_world_data import load_newsgroups, load_iris, load_wine, load_breast_cancer, load_reuters, load_imagenet_dog, \
load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb
from .real_uci_data import load_har, load_letterrecognition, load_optdigits, load_pendigits, load_banknotes, load_htru2, \
load_mice_protein, load_ecoli, load_spambase, load_seeds, load_statlog_shuttle, load_forest_types, \
load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, load_user_knowledge, load_dermatology, \
load_multiple_features, load_statlog_australian_credit_approval, load_breast_cancer_wisconsin_original, \
load_semeion, load_cmu_faces
from .real_timeseries_data import load_motestrain, load_olive_oil, load_symbols, load_diatom_size_reduction, \
load_proximal_phalanx_outline, load_plane, load_sony_aibo_robot_surface, load_two_patterns, load_lsst
from .real_nr_data import load_aloi_small, load_fruit, load_nrletters, load_stickfigures, load_webkb, load_cmu_faces
from .real_clustpy_data import load_aloi_small, load_fruit, load_nrletters, load_stickfigures
from .real_torchvision_data import load_usps, load_mnist, load_fmnist, load_kmnist, load_svhn, load_cifar10, load_stl10, \
load_gtsrb
load_gtsrb, load_cifar100
from .real_medical_mnist_data import load_path_mnist, load_chest_mnist, load_derma_mnist, load_oct_mnist, \
load_pneumonia_mnist, load_retina_mnist, load_breast_mnist, load_blood_mnist, load_tissue_mnist, load_organ_a_mnist, \
load_organ_c_mnist, load_organ_s_mnist, load_organ_mnist_3d, load_nodule_mnist_3d, load_adrenal_mnist_3d, \
load_fracture_mnist_3d, load_vessel_mnist_3d, load_synapse_mnist_3d
from clustpy.data.real_video_data import load_video_weizmann, load_video_keck_gesture
from clustpy.data.preprocessing import ZNormalizer, z_normalization
from clustpy.data._utils import flatten_images, unflatten_images

__all__ = ['create_subspace_data',
'create_nr_data',
Expand All @@ -29,6 +33,7 @@
'load_iris',
'load_wine',
'load_breast_cancer',
'load_olivetti_faces',
'load_reuters',
'load_banknotes',
'load_htru2',
Expand Down Expand Up @@ -88,4 +93,11 @@
'load_imagenet10',
'load_gtsrb',
'load_coil20',
'load_coil100']
'load_coil100',
'load_video_weizmann',
'load_video_keck_gesture',
'ZNormalizer',
'z_normalization',
'load_cifar100',
'flatten_images',
'unflatten_images']
98 changes: 89 additions & 9 deletions clustpy/data/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,22 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si
chunk size when downloading the file (default: 32768)
"""
print("Downloading data set {0} from Google Drive to {1}".format(file_id, filename_local))
URL = "https://docs.google.com/uc?export=download&confirm=1"
URL = "https://drive.google.com/uc"
session = requests.Session()
response = session.get(URL, params={"id": file_id, "confirm": 1}, stream=True)
response = session.get(URL, params={"id": file_id, "confirm": "t"}, stream=True)
if response.text.startswith("<!DOCTYPE"):
# Large files can not be obtained automatically but need a second request
try:
URL_extracted = response.text.split("download-form\" action=\"")[1].split("\" method=\"get\"")[0]
uuid = response.text.split("name=\"uuid\" value=\"")[1].split("\">")[0]
except:
raise Exception("[ERROR] New URL and UUID could not be extracted from first request in _download_file_from_google_drive")
response = session.get(URL_extracted, params={"id": file_id, "confirm": "t", "uuid": uuid}, stream=True)
with open(filename_local, "wb") as f:
for chunk in response.iter_content(chunk_size):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
session.close()


def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> (
Expand Down Expand Up @@ -147,14 +156,14 @@ def _decompress_z_file(filename: str, directory: str) -> bool:
return successful


def _load_image_data(path: str, image_size: tuple, color_image: bool) -> np.ndarray:
def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.ndarray:
"""
Load image and convert it into a coherent size. Returns a numpy array containing the image data.
Parameters
----------
path : str
Path to the image
image : str
Path to the image. Can also be a numpy array containing the specific pixels
image_size : tuple
images of various sizes can be converted into a coherent size.
The tuple equals (width, height) of the images.
Expand All @@ -167,14 +176,85 @@ def _load_image_data(path: str, image_size: tuple, color_image: bool) -> np.ndar
image_data : np.ndarray
The numpy array containing the image data
"""
image = Image.open(path)
if type(image) is str:
pil_image = Image.open(image)
else:
pil_image = Image.fromarray(np.uint8(image))
if color_image:
image = image.convert("RGB")
pil_image = pil_image.convert("RGB")
# Convert to coherent size
if image_size is not None:
image = image.resize(image_size)
image_data = np.asarray(image)
pil_image = pil_image.resize(image_size)
image_data = np.asarray(pil_image)
assert image_size is None or image_data.shape == (
image_size[0], image_size[1], 3), "Size of image is not correct. Should be {0} but is {1}".format(image_size,
image_data.shape)
return image_data


def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
    """
    Flatten a batch of images so that every sample becomes a single numerical vector.
    Color images given in a channels-first layout (CHW/CHWD) are first rearranged into the channels-last layout
    (HWC/HWDC); afterwards everything except the leading sample axis is collapsed into one axis.

    Parameters
    ----------
    data : np.ndarray
        The given data set. The first axis enumerates the samples, the remaining axes follow `format`
    format : str
        Layout of a single sample. Can be: "HW", "HWD", "CHW", "CHWD", "HWC", "HWDC".
        Abbreviations stand for: H: Height, W: Width, D: Depth, C: Color-channels

    Returns
    -------
    data : np.ndarray
        The flattened data array of shape (number of samples, product of the remaining axes)

    Raises
    ------
    AssertionError
        If `format` is unknown, if the number of dimensions of `data` does not fit `format` or if a color format
        does not end up with exactly three channels in the last axis
    """
    format_possibilities = ["HW", "HWD", "CHW", "CHWD", "HWC", "HWDC"]
    assert format in format_possibilities, "Format must be within {0}".format(format_possibilities)
    # Required number of array dimensions (including the leading sample axis) for each format
    expected_ndim = {"HW": 3, "HWD": 4, "CHW": 4, "HWC": 4, "CHWD": 5, "HWDC": 5}
    assert data.ndim == expected_ndim[format]
    # Axis permutations that move a leading channel axis to the last position
    channels_last_axes = {"CHW": (0, 2, 3, 1), "CHWD": (0, 2, 3, 4, 1)}
    is_color_format = format not in ("HW", "HWD")
    if is_color_format:
        if format in channels_last_axes:
            data = np.transpose(data, channels_last_axes[format])
        n_channels = data.shape[-1]
        assert n_channels == 3, "Color-channels must be in the last position and contain three channels not {0} ({1})".format(
            n_channels, data.shape)
    # Keep the sample axis and merge all remaining axes
    return data.reshape(data.shape[0], -1)


def unflatten_images(data_flatten: np.ndarray, image_size: tuple) -> np.ndarray:
    """
    Reshape flattened samples back into images.
    If the reshaped array has four or five dimensions and the last entry of `image_size` equals 3, the data is
    treated as a color image in HWC/HWDC layout and rearranged into the channels-first CHW/CHWD layout.

    Parameters
    ----------
    data_flatten : np.ndarray
        The given flattened data set
    image_size : tuple
        The size of a single image, e.g., (28,28,3) for a colored image of size 28 x 28

    Returns
    -------
    data_image : np.ndarray
        The unflattened data array; the first axis enumerates the samples
    """
    data_image = data_flatten.reshape((-1, *image_size))
    # Axis permutations that move a trailing channel axis directly behind the sample axis
    channels_first_axes = {4: (0, 3, 1, 2), 5: (0, 4, 1, 2, 3)}
    axes = channels_first_axes.get(data_image.ndim)
    if axes is not None and image_size[-1] == 3:
        data_image = np.transpose(data_image, axes)
    return data_image
147 changes: 147 additions & 0 deletions clustpy/data/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
from sklearn.base import TransformerMixin, BaseEstimator
import numpy as np


class ZNormalizer(TransformerMixin, BaseEstimator):
    """
    Normalize a data set by calculating (data - mean) / std.
    In general, two strategies are sensible to normalize a data set.
    Either use all features simultaneously for the normalization or normalize each feature separately.
    In the case of image data, a feature-wise transformation usually corresponds to a channel-wise transformation.
    If this normalizer should be applied to RGB image data, the color channels should be in the first dimension, known as CHW representation.
    Constant features/channels (standard deviation of zero) are only centered: they are divided by 1 instead of 0,
    so the transformation does not produce NaN or inf values for them.

    Parameters
    ----------
    feature_or_channel_wise : bool
        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)

    Attributes
    ----------
    shape : list
        Shape of the data set with which this normalizer has been fitted. The first entry (number of samples) is set to -1
    mean : np.ndarray or float
        Mean value(s) of the data set
    std : np.ndarray or float
        Standard deviation value(s) of the data set
    """

    def __init__(self, feature_or_channel_wise: bool = False):
        self.feature_or_channel_wise = feature_or_channel_wise

    def _use_global_statistics(self, X: np.ndarray) -> bool:
        """Return True if a single mean/std pair (computed over all values) applies to X."""
        # Data with more than two dimensions and no axis of size 3 is treated as grayscale image data
        return not self.feature_or_channel_wise or (X.ndim > 2 and 3 not in self.shape)

    def _check_input_shape(self, X: np.ndarray) -> None:
        """Check that all axes of X except the sample axis match the fitted shape."""
        assert list(X.shape)[1:] == self.shape[
                                    1:], "The shape of the input data does not match the fitted transformation. Shape must be {0}".format(
            self.shape)

    def _raise_unsupported_input(self, X: np.ndarray) -> None:
        """Raise the error for inputs that can not be normalized feature-/channel-wise."""
        raise ValueError(
            "Your combination of feature_or_channel_wise={0} and X.ndim={1} is not working for the transformation".format(
                self.feature_or_channel_wise, X.ndim))

    def _nonzero_std(self) -> np.ndarray:
        """Return the fitted std where zeros are replaced by one to avoid a division by zero."""
        std = np.asarray(self.std, dtype=float)
        return np.where(std == 0, 1.0, std)

    @staticmethod
    def _along_feature_axis(values: np.ndarray, ndim: int) -> np.ndarray:
        """Reshape per-feature values so that they broadcast along axis 1 of an array with ndim dimensions."""
        return np.asarray(values).reshape([1, -1] + [1] * (ndim - 2))

    def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'ZNormalizer':
        """
        Compute the mean and std values regarding the input data set.

        Parameters
        ----------
        X : np.ndarray
            the given data set
        y : np.ndarray
            the labels (can be ignored)

        Returns
        -------
        self : ZNormalizer
            this instance of the ZNormalizer

        Raises
        ------
        ValueError
            If a feature-/channel-wise normalization is requested for data that is neither two-dimensional
            nor a 4D/5D array with three channels in axis 1
        """
        self.shape = list(X.shape)
        # The number of samples is allowed to differ between fit and transform
        self.shape[0] = -1
        if self._use_global_statistics(X):
            # In case of not feature_or_channel_wise or grayscale images (2d or 3d)
            self.std = np.std(X)
            self.mean = np.mean(X)
        elif X.ndim == 2 or (X.ndim in [4, 5] and X.shape[1] == 3):
            # In case of tabular data or RGB 2D or 3D images
            self.std = np.array([np.std(X[:, j]) for j in range(self.shape[1])])
            self.mean = np.array([np.mean(X[:, j]) for j in range(self.shape[1])])
        else:
            self._raise_unsupported_input(X)
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the given data set using the fitted mean and std values.
        Entries whose fitted std is zero are centered but not rescaled.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Returns
        -------
        X_out : np.ndarray
            The transformed data set (float copy of X)

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the number of samples) does not match the fitted shape
        ValueError
            If the number of dimensions of X is not supported by the feature-/channel-wise normalization
        """
        self._check_input_shape(X)
        X_out = X.astype(float)
        if self._use_global_statistics(X):
            # In case of not feature_or_channel_wise or grayscale images if feature_or_channel_wise
            X_out = (X_out - self.mean) / self._nonzero_std()
        elif X.ndim in [2, 4, 5]:
            # In case of tabular data or RGB 2D or 3D images
            mean = self._along_feature_axis(self.mean, X.ndim)
            std = self._along_feature_axis(self._nonzero_std(), X.ndim)
            X_out = (X_out - mean) / std
        else:
            self._raise_unsupported_input(X)
        return X_out

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Invert the transformation by applying (data * std) + mean.
        The fitted std is used unchanged, so constant features/channels are mapped back to their mean.

        Parameters
        ----------
        X : np.ndarray
            the given data set

        Returns
        -------
        X_out : np.ndarray
            The transformed data set (float copy of X)

        Raises
        ------
        AssertionError
            If the shape of X (ignoring the number of samples) does not match the fitted shape
        ValueError
            If the number of dimensions of X is not supported by the feature-/channel-wise normalization
        """
        self._check_input_shape(X)
        X_out = X.astype(float)
        if self._use_global_statistics(X):
            # In case of not feature_or_channel_wise or grayscale images if feature_or_channel_wise
            X_out = X_out * self.std + self.mean
        elif X.ndim in [2, 4, 5]:
            # In case of tabular data or RGB 2D or 3D images
            mean = self._along_feature_axis(self.mean, X.ndim)
            std = self._along_feature_axis(self.std, X.ndim)
            X_out = X_out * std + mean
        else:
            self._raise_unsupported_input(X)
        return X_out


def z_normalization(X: np.ndarray, feature_or_channel_wise: bool = False) -> np.ndarray:
    """
    Convenience function that fits a ZNormalizer on X and directly returns the normalized data.
    Equivalent to: ZNormalizer(feature_or_channel_wise).fit_transform(X)

    Parameters
    ----------
    X : np.ndarray
        the given data set
    feature_or_channel_wise : bool
        Specifies if all data should be used for the normalization or if a feature-/channel-wise normalization should be applied (default: False)

    Returns
    -------
    X_transform : np.ndarray
        The transformed data set
    """
    return ZNormalizer(feature_or_channel_wise=feature_or_channel_wise).fit_transform(X)

0 comments on commit 8f67d15

Please sign in to comment.