In [1]:
!pip install fastdup

Collecting fastdup
  Downloading fastdup-1.66-cp310-cp310-manylinux_2_31_x86_64.whl (75.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.2/75.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting pillow-heif (from fastdup)
  Downloading pillow_heif-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests==2.28.1 (from fastdup)
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentry-sdk (from fastdup)
  Downloading sentry_sdk-1.34.0-py2.py3-none-any.whl (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.9/243.9 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Collecting charset-normalizer<3,>=2 (from requests==2.28.1->fastdup)
  Downloading

## Connect to Google Drive

In [2]:
# Mount Google Drive at /gdrive and make the challenge folder the working directory
from google.colab import drive
drive.mount('/gdrive')
%cd "/gdrive/My Drive/Challenge_1/"

Mounted at /gdrive
/gdrive/My Drive/Challenge_1


## Imports

In [3]:
# Fix randomness and hide warnings
seed = 42

import os
# These environment variables are set before TensorFlow is imported further down.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# Warning is the base class of FutureWarning, so the second filter alone
# already covers the first one.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)

import logging

import random
random.seed(seed)

# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow's own logging to errors only
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow with the same seed used for numpy and random above
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.12.0


## Import data

In [4]:
import numpy as np
import cv2

# Load the NPZ file; the path is relative to the current working directory
# (the Drive folder selected with %cd in the mount cell).
data = np.load('new_dataset.npz')

In [28]:
# Array stored under the "data" key of the loaded archive; iterated image-by-image
# when the samples are exported to JPEG files.
images = data["data"]

## Convert images to jpg

In [29]:
import os
from PIL import Image
import numpy as np
from tqdm.auto import tqdm

# Class names, kept for reference. The labels stored in the dataset are used
# directly as directory names below, so nothing is looked up through this array.
label_names = np.array(["healthy", "unhealthy"])

# Create folder to store the images in Colab's local disk
root_dir = '/content/images'  # Use the '/content' directory in Colab
os.makedirs(root_dir, exist_ok=True)

# Total number of samples (`images` is data["data"]); used as the progress-bar length
n_samples = len(images)

# Write every sample to <root_dir>/<label>/image_<i>.jpg
samples = zip(images, data["labels"])
for i, (image, label) in enumerate(tqdm(samples, total=n_samples, desc="Saving images")):
    # The stored label is used as the class directory name as-is
    label_name = label

    # Create class directory if it doesn't exist
    class_dir = os.path.join(root_dir, label_name)
    os.makedirs(class_dir, exist_ok=True)

    # Define image path and save image
    image_path = os.path.join(class_dir, f'image_{i}.jpg')
    # Remove any singleton dimensions and pass the *squeezed* array on
    # (previously the squeezed result was computed and then ignored).
    img = np.squeeze(image)
    # NOTE(review): astype('uint8') assumes pixel values are already in 0-255 - confirm
    img = Image.fromarray(img.astype('uint8'))
    img.save(image_path)


## Load images to dataframe

In [17]:
import pandas as pd

# Collect one (path, class) pair for every JPEG below the image root; the class
# is the name of the directory that directly contains the file.
jpeg_records = [
    (os.path.join(folder, name), os.path.basename(folder))
    for folder, _subdirs, names in os.walk('/content/images')
    for name in names
    if name.lower().endswith(('.jpg', '.jpeg'))
]
file_paths = [path for path, _ in jpeg_records]
labels = [class_name for _, class_name in jpeg_records]

# One row per image; every row is assigned to the 'train' split
df = pd.DataFrame({'filename': file_paths, 'label': labels, 'split': 'train'})

# Display the resulting dataframe
df

Unnamed: 0,filename,label,split
0,/content/images/healthy/image_4978.jpg,healthy,train
1,/content/images/healthy/image_3215.jpg,healthy,train
2,/content/images/healthy/image_197.jpg,healthy,train
3,/content/images/healthy/image_4346.jpg,healthy,train
4,/content/images/healthy/image_5010.jpg,healthy,train
...,...,...,...
5195,/content/images/unhealthy/image_2247.jpg,unhealthy,train
5196,/content/images/unhealthy/image_3428.jpg,unhealthy,train
5197,/content/images/unhealthy/image_1479.jpg,unhealthy,train
5198,/content/images/unhealthy/image_4521.jpg,unhealthy,train


## FastDup

In [22]:
import fastdup
# Point fastdup at the exported JPEGs; its artifacts (index file, galleries) are
# written under work_dir, as the paths in the cell outputs show.
fd = fastdup.create(input_dir='/content/images', work_dir='/content/aa')
# Run the analysis, passing the filename/label/split dataframe as annotations
fd.run(annotations=df)

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
fastdup C++ info received:  2023-11-06 00:35:59 [INFO] Going to loop over dir /tmp/tmpi_dq13x2.csv
2023-11-06 00:35:59 [INFO] Found total 5200 images to run on, 5200 train, 0 test, name list 5200, counter 5200 
2023-11-06 00:37:30 [INFO] Found total 5200 images to run on
2023-11-06 00:37:33 [INFO] 2896) Finished write_index() NN model
2023-11-06 00:37:33 [INFO] Stored nn model index file /content/aa/nnf.index
2023-11-06 00:37:34 [INFO] Total time took 94936 ms
2023-11-06 00:37:34 [INFO] Found a total of 923 fully identical images (d>0.990), which are 8.88 % of total graph edges
2023-11-06 00:37:34 [INFO] Found a total of 0 nearly identical images(d>0.980), which are 0.00 % of total graph edges
2023-11-06 00:37:34 [INFO] Found a total of 7512 above threshold images (d>0.900), which are 72.23 % of total graph edges
2023-11-06 00:37:34 [INFO] Found a total of 537 outlier images         (d<0.050), which are 5.16 % o

0

### Image groupings

In [23]:
# Show the gallery of image groups (components) found by the fastdup run
fd.vis.component_gallery()

healthy


Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Finished OK. Components are stored as image files /content/aa/galleries/components_[index].jpg
Stored components visual view in  /content/aa/galleries/components.html
Execution time in seconds 2.4
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
component,63.0
num_images,98.0
mean_distance,1.0

Label,Unnamed: 1
healthy,54

Info,Unnamed: 1
component,3089.0
num_images,98.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,54

Info,Unnamed: 1
component,3320.0
num_images,4.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,4

Info,Unnamed: 1
component,3218.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,4068.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3239.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,4086.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3246.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3121.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3602.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3642.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3313.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3432.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,4306.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3087.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3354.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3515.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3079.0
num_images,3.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,3

Info,Unnamed: 1
component,3644.0
num_images,2.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,2

Info,Unnamed: 1
component,3659.0
num_images,2.0
mean_distance,1.0

Label,Unnamed: 1
unhealthy,2


0

### Duplicates

In [26]:
# Show the gallery of image pairs flagged as duplicates (From / To / Distance)
fd.vis.duplicates_gallery()

Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Stored similarity visual view in  /content/aa/galleries/duplicates.html
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_4901.jpg
To,/unhealthy/image_4399.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_171.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_974.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_622.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_2081.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_2453.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_3286.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_3105.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_4857.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_779.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_4475.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_1554.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_1528.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_3666.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_138.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_3688.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_2727.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_2864.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_3802.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy

Info,Unnamed: 1
Distance,1.0
From,/unhealthy/image_2830.jpg
To,/unhealthy/image_2515.jpg
From_Label,unhealthy
To_Label,unhealthy


0

### Outliers

In [24]:
# Show the gallery of images flagged as outliers
fd.vis.outliers_gallery()


Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Stored outliers visual view in  /content/aa/galleries/outliers.html
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
Distance,0.763519
Path,/healthy/image_1912.jpg
label,healthy

Info,Unnamed: 1
Distance,0.766314
Path,/unhealthy/image_4461.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.79719
Path,/unhealthy/image_4936.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.800138
Path,/unhealthy/image_3813.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.812491
Path,/unhealthy/image_985.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.814292
Path,/unhealthy/image_1638.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.814985
Path,/unhealthy/image_4044.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.81564
Path,/unhealthy/image_1259.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.819016
Path,/unhealthy/image_4139.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.82021
Path,/unhealthy/image_5095.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.820545
Path,/unhealthy/image_4227.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.820934
Path,/healthy/image_3022.jpg
label,healthy

Info,Unnamed: 1
Distance,0.822343
Path,/unhealthy/image_3156.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.822751
Path,/healthy/image_2610.jpg
label,healthy

Info,Unnamed: 1
Distance,0.823515
Path,/unhealthy/image_3141.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.825127
Path,/unhealthy/image_1551.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.826456
Path,/unhealthy/image_3567.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.827605
Path,/unhealthy/image_1761.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.827605
Path,/unhealthy/image_948.jpg
label,unhealthy

Info,Unnamed: 1
Distance,0.829447
Path,/unhealthy/image_2857.jpg
label,unhealthy


0

### Too dark

In [25]:
# Show the darkest images; judging by the output, they are listed by their
# 'mean' statistic in ascending order.
fd.vis.stats_gallery(metric='dark')

Generating gallery:   0%|          | 0/20 [00:00<?, ?it/s]

Stored mean visual view in  /content/aa/galleries/mean.html
########################################################################################
Would you like to see awesome visualizations for some of the most popular academic datasets?
Click here to see and learn more: https://app.visual-layer.com/vl-datasets?utm_source=fastdup
########################################################################################


Info,Unnamed: 1
mean,30.9228
filename,/content/images/healthy/image_967.jpg
label,healthy

Info,Unnamed: 1
mean,31.0799
filename,/content/images/healthy/image_3951.jpg
label,healthy

Info,Unnamed: 1
mean,31.2776
filename,/content/images/healthy/image_2527.jpg
label,healthy

Info,Unnamed: 1
mean,31.9336
filename,/content/images/unhealthy/image_80.jpg
label,unhealthy

Info,Unnamed: 1
mean,32.1783
filename,/content/images/healthy/image_4619.jpg
label,healthy

Info,Unnamed: 1
mean,32.1783
filename,/content/images/healthy/image_1193.jpg
label,healthy

Info,Unnamed: 1
mean,32.3865
filename,/content/images/healthy/image_5123.jpg
label,healthy

Info,Unnamed: 1
mean,32.8104
filename,/content/images/healthy/image_1773.jpg
label,healthy

Info,Unnamed: 1
mean,34.4902
filename,/content/images/unhealthy/image_4038.jpg
label,unhealthy

Info,Unnamed: 1
mean,34.4902
filename,/content/images/unhealthy/image_276.jpg
label,unhealthy

Info,Unnamed: 1
mean,34.5304
filename,/content/images/healthy/image_796.jpg
label,healthy

Info,Unnamed: 1
mean,34.7103
filename,/content/images/healthy/image_1869.jpg
label,healthy

Info,Unnamed: 1
mean,35.4469
filename,/content/images/unhealthy/image_3972.jpg
label,unhealthy

Info,Unnamed: 1
mean,36.8724
filename,/content/images/healthy/image_1414.jpg
label,healthy

Info,Unnamed: 1
mean,38.5716
filename,/content/images/healthy/image_4550.jpg
label,healthy

Info,Unnamed: 1
mean,38.5716
filename,/content/images/healthy/image_2592.jpg
label,healthy

Info,Unnamed: 1
mean,38.6609
filename,/content/images/healthy/image_664.jpg
label,healthy

Info,Unnamed: 1
mean,39.7659
filename,/content/images/healthy/image_3432.jpg
label,healthy

Info,Unnamed: 1
mean,40.1968
filename,/content/images/healthy/image_1508.jpg
label,healthy

Info,Unnamed: 1
mean,40.2076
filename,/content/images/unhealthy/image_5040.jpg
label,unhealthy


0