In [None]:
!pip install fastdup



## Connect to Google Drive

In [None]:
# Mount Google Drive at /gdrive and make the challenge folder the working
# directory, so that relative paths used later (e.g. the dataset archive)
# resolve against it.
from google.colab import drive
drive.mount('/gdrive')
%cd "/gdrive/My Drive/Challenge_1 (1)/"

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/Challenge_1 (1)


## Python imports

In [None]:
# Fix randomness and hide warnings
seed = 42

import os
# These environment variables are set before tensorflow is imported further down.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment cannot change hashing in the already-running kernel; it is only
# inherited by child processes.
os.environ['PYTHONHASHSEED'] = str(seed)
# Resolved against the current working directory at the time this cell runs.
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# `Warning` is the base class of every warning category, so the second filter
# already covers FutureWarning and silences all warnings.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)  # seed NumPy's legacy global RNG
import cv2


import logging

import random
random.seed(seed)  # seed Python's built-in RNG

# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow's own logging to errors only.
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow through both the current and the compat.v1 API.
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.12.0


## Prepare images from dataset

### Import data
* Importing the original dataset

In [None]:
# Open the dataset archive from the current working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only use it on a file you trust.
data = np.load('original_dataset.npz', allow_pickle=True)

In [None]:
# Pull the image array out of the archive once and report how many samples it holds.
images = data["data"]
print(len(images))

4532


### Convert images to jpg

In [None]:
import os
from PIL import Image
import numpy as np
from tqdm.auto import tqdm

# Class names indexed by integer label. Only consulted when a label is not
# already a string (see the loop below).
label_names = np.array(["healthy", "unhealthy"])

# Create folder to store the images in Colab's local disk
root_dir = '/content/images3'  # Use the '/content' directory in Colab
os.makedirs(root_dir, exist_ok=True)

# Total number of samples in the dataset; used as the progress-bar length
n_samples = len(images)

# Write every sample to <root_dir>/<class name>/image_<index>.jpg
for i, (image, label) in enumerate(
    tqdm(zip(images, data["labels"]), total=n_samples, desc="Saving images")
):
    # String labels are used as the directory name directly; integer labels
    # are translated through label_names.
    label_name = label if isinstance(label, str) else label_names[int(label)]

    # Create class directory if it doesn't exist
    class_dir = os.path.join(root_dir, label_name)
    os.makedirs(class_dir, exist_ok=True)

    # Define image path and save image
    image_path = os.path.join(class_dir, f'image_{i}.jpg')
    # Drop singleton dimensions *before* building the PIL image. Previously the
    # squeezed array was computed and then ignored, so an array with a
    # length-1 axis was handed to Image.fromarray unchanged.
    # NOTE(review): the uint8 cast assumes pixel values are already in 0-255 — confirm.
    img = Image.fromarray(np.squeeze(image).astype('uint8'))
    img.save(image_path)


### Load images to dataframe

In [None]:
import pandas as pd

# Walk the image tree and collect, for every JPEG found, its full path together
# with the name of the directory that contains it (used as the class label).
_jpeg_suffixes = ('.jpg', '.jpeg')
_found = [
    (os.path.join(folder, name), os.path.basename(folder))
    for folder, _subdirs, names in os.walk('/content/images3')
    for name in names
    if name.lower().endswith(_jpeg_suffixes)
]
file_paths = [path for path, _ in _found]
labels = [class_name for _, class_name in _found]

# One row per image; every row is assigned to the 'train' split.
df = pd.DataFrame({'filename': file_paths, 'label': labels, 'split': 'train'})

# Show the resulting dataframe
df

Unnamed: 0,filename,label,split
0,/content/images3/healthy/image_374.jpg,healthy,train
1,/content/images3/healthy/image_1855.jpg,healthy,train
2,/content/images3/healthy/image_1192.jpg,healthy,train
3,/content/images3/healthy/image_1937.jpg,healthy,train
4,/content/images3/healthy/image_832.jpg,healthy,train
...,...,...,...
4527,/content/images3/unhealthy/image_3315.jpg,unhealthy,train
4528,/content/images3/unhealthy/image_4141.jpg,unhealthy,train
4529,/content/images3/unhealthy/image_2925.jpg,unhealthy,train
4530,/content/images3/unhealthy/image_3036.jpg,unhealthy,train


## FastDup

### Install FastDup

In [None]:
# NOTE(review): fastdup is already installed by the first cell of this
# notebook; this repeat only matters if this section is run on its own.
!pip install fastdup



### Define I/O

In [None]:
import fastdup
# Point fastdup at the exported JPEG tree and give it a separate working
# directory for its outputs.
fd = fastdup.create(input_dir='/content/images3', work_dir='/content/cc')
# df supplies the filename / label / split columns built above as annotations.
fd.run(annotations=df)

### Image groupings

In [None]:
# Render the gallery of image components (groups of images fastdup placed
# together), reported with their size and mean distance.
fd.vis.component_gallery()

healthy


Generating gallery:   0%|          | 0/1 [00:00<?, ?it/s]

Finished OK. Components are stored as image files /content/cc/galleries/components_[index].jpg
Stored components visual view in  /content/cc/galleries/components.html
Execution time in seconds 1.0
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
component,349.0
num_images,2.0
mean_distance,0.96

Label,Unnamed: 1
healthy,2


0

### Duplicates

In [None]:
# Render the gallery of the closest image pairs, with each side's path and
# label — pairs whose two labels differ are worth a manual look.
fd.vis.duplicates_gallery()

Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Stored similarity visual view in  /content/cc/galleries/duplicates.html
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
Distance,0.960014
From,/healthy/image_1373.jpg
To,/healthy/image_1352.jpg
From_Label,healthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.959707
From,/healthy/image_2547.jpg
To,/healthy/image_464.jpg
From_Label,healthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.957212
From,/healthy/image_207.jpg
To,/healthy/image_1364.jpg
From_Label,healthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.954641
From,/healthy/image_825.jpg
To,/unhealthy/image_3277.jpg
From_Label,healthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,0.95368
From,/healthy/image_2052.jpg
To,/healthy/image_1729.jpg
From_Label,healthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.952724
From,/healthy/image_2483.jpg
To,/healthy/image_2479.jpg
From_Label,healthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.952577
From,/unhealthy/image_4438.jpg
To,/healthy/image_676.jpg
From_Label,unhealthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.951743
From,/healthy/image_2887.jpg
To,/healthy/image_1827.jpg
From_Label,healthy
To_Label,healthy

Info,Unnamed: 1
Distance,0.951555
From,/healthy/image_2118.jpg
To,/unhealthy/image_4489.jpg
From_Label,healthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,0.951451
From,/unhealthy/image_3024.jpg
To,/unhealthy/image_4462.jpg
From_Label,unhealthy
To_Label,unhealthy


0

### Outliers

In [None]:
# Render the gallery of outlier images, each listed with its distance, path and label.
fd.vis.outliers_gallery()


Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Stored outliers visual view in  /content/cc/galleries/outliers.html
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
Distance,0.568004
Path,/healthy/image_2671.jpg
label,healthy

Info,Unnamed: 1
Distance,0.655636
Path,/unhealthy/image_3611.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.849003
Path,/unhealthy/image_3307.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.867732
Path,/unhealthy/image_3627.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.868214
Path,/unhealthy/image_4487.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.870573
Path,/unhealthy/image_4184.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.875749
Path,/unhealthy/image_3567.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.875852
Path,/unhealthy/image_4365.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.876198
Path,/unhealthy/image_4095.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.876616
Path,/healthy/image_121.jpg
label,healthy

Info,Unnamed: 1
Distance,0.876956
Path,/healthy/image_837.jpg
label,healthy

Info,Unnamed: 1
Distance,0.877013
Path,/healthy/image_2455.jpg
label,healthy

Info,Unnamed: 1
Distance,0.877059
Path,/healthy/image_1118.jpg
label,healthy

Info,Unnamed: 1
Distance,0.877086
Path,/unhealthy/image_3467.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.877108
Path,/healthy/image_1543.jpg
label,healthy

Info,Unnamed: 1
Distance,0.877117
Path,/healthy/image_1714.jpg
label,healthy

Info,Unnamed: 1
Distance,0.877132
Path,/unhealthy/image_4342.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.877164
Path,/unhealthy/image_3665.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.877285
Path,/healthy/image_2334.jpg
label,healthy

Info,Unnamed: 1
Distance,0.877448
Path,/healthy/image_394.jpg
label,healthy


0

### Too dark

In [None]:
# Render the statistics gallery for the 'dark' metric; images are listed with
# their `mean` value, lowest first.
fd.vis.stats_gallery(metric='dark')

Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Stored mean visual view in  /content/cc/galleries/mean.html
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
mean,31.2974
filename,/content/images3/healthy/image_1812.jpg
label,healthy

Info,Unnamed: 1
mean,31.576
filename,/content/images3/healthy/image_2888.jpg
label,healthy

Info,Unnamed: 1
mean,32.6228
filename,/content/images3/healthy/image_1606.jpg
label,healthy

Info,Unnamed: 1
mean,32.7943
filename,/content/images3/healthy/image_2684.jpg
label,healthy

Info,Unnamed: 1
mean,33.2402
filename,/content/images3/healthy/image_1961.jpg
label,healthy

Info,Unnamed: 1
mean,35.0318
filename,/content/images3/healthy/image_1889.jpg
label,healthy

Info,Unnamed: 1
mean,37.3874
filename,/content/images3/healthy/image_1294.jpg
label,healthy

Info,Unnamed: 1
mean,39.0249
filename,/content/images3/healthy/image_1348.jpg
label,healthy

Info,Unnamed: 1
mean,39.1358
filename,/content/images3/healthy/image_103.jpg
label,healthy

Info,Unnamed: 1
mean,40.1972
filename,/content/images3/healthy/image_2316.jpg
label,healthy

Info,Unnamed: 1
mean,40.8701
filename,/content/images3/healthy/image_1986.jpg
label,healthy

Info,Unnamed: 1
mean,41.9663
filename,/content/images3/unhealthy/image_4451.jpg
label,unhealthy

Info,Unnamed: 1
mean,42.2096
filename,/content/images3/healthy/image_486.jpg
label,healthy

Info,Unnamed: 1
mean,43.2916
filename,/content/images3/unhealthy/image_4170.jpg
label,unhealthy

Info,Unnamed: 1
mean,43.4957
filename,/content/images3/healthy/image_469.jpg
label,healthy

Info,Unnamed: 1
mean,43.6233
filename,/content/images3/healthy/image_2027.jpg
label,healthy

Info,Unnamed: 1
mean,44.2124
filename,/content/images3/unhealthy/image_3650.jpg
label,unhealthy

Info,Unnamed: 1
mean,44.2577
filename,/content/images3/healthy/image_2810.jpg
label,healthy

Info,Unnamed: 1
mean,44.9199
filename,/content/images3/healthy/image_1588.jpg
label,healthy

Info,Unnamed: 1
mean,45.0561
filename,/content/images3/healthy/image_1626.jpg
label,healthy


0