# Solanum Dataset Metrics

We wish to obtain the following information for each file:

1. Section to which it belongs
1. Species
1. Source of file
1. Size (MB)
1. Resolution
1. Type of file
1. hash (fingerprint)

## Image fingeprinting
We'll also use the `ImageHash` library to calculate the fingerprint of the images in order to identify near-duplicates.

## References:
1. https://realpython.com/fingerprinting-images-for-near-duplicate-detection/
1. https://github.com/JohannesBuchner/imagehash
1. https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html


# Imports

In [None]:
!pip install imagehash pillow pandas numpy matplotlib

In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import imagehash

pd.set_option('max_rows', 200)
pd.set_option('max_colwidth', 400)

In [7]:
#!echo %cd%
!echo "$PWD"

/mnt/pepper/CODE/ITESM/tesis-dataset-downloader


In [8]:
#root_folder = "P:/CODE/ITESM/tesis-dataset-downloader/solanum_output"
root_folder = "/mnt/pepper/CODE/ITESM/tesis-dataset-downloader/solanum_output"

# Tools

In [9]:
def plot_histogram(df: pd.DataFrame, nrows: int, ncols: int):
    """
    Plots a series of histograms for a dataframe in a nrows x ncols distribution
    """
    fig, ax = plt.subplots(nrows=nrows,
                           ncols=ncols,
                           figsize=(14,14))
    
    for n, col in enumerate(df.columns):
        plt.subplot(nrows, ncols, n+1)
        df[col].plot.hist(ax = plt.gca(),
                          bins=20,
                          legend=True)

# 1 Calculate image attributes and fingerprints

dHash is the chosen fingerprint algorithm to use. Like aHash and pHash, dHash is pretty simple to implement and is far more accurate than it has any right to be. As an implementation, dHash is nearly identical to aHash but it performs much better. While aHash focuses on average values and pHash evaluates frequency patterns, dHash tracks gradients.

In [None]:
records = []

for root, dirs, files in os.walk(root_folder):
    
    if root == root_folder:
        continue
    #print(f"Root: {root}")
    #print(f"Dirs: {dirs}")
    #print(f"Files: {files}")
    #print("-" * 50)

    print(f"Reading files in {root}...")

    for f in files:
        full_path = os.path.join(root, f)
        
        # Section to which it belongs
        # Species
        # Source of file
        section, species, _, source, _ = f.split("_")

        # Type of file
        file_extension = os.path.splitext(full_path)[1].replace(".", "").lower()

        # Normalize JPG extensions
        file_extension = "jpg" if file_extension == "jpeg" else file_extension

        # Size (MB)
        filesize_mb = round(os.path.getsize(full_path) / 1024 / 1024, 4)

        # Resolution
        image = Image.open(full_path)
        width, height = image.size

        # fingerprint
        hash = str(imagehash.dhash(image))
        
        row = {
            "section": section,
            "species": species,
            "filesize_mb": filesize_mb,
            "width": width,
            "height": height,
            "image_type": file_extension,
            "source": source,
            "hash": hash,
            "full_path": full_path,
            "keep": 1
        }

        records.append(row)

Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\anarrhichomenum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\basarthrum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\brevantherum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\crinitum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\dulcamara...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\erythrotrichum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\gonatotrichum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\herposolanum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\herpystrichum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla...




Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\lasiocarpa...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\lycopersicon...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\melongena...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\micracantha...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\pachyphylla...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\periscariae...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\pteroidea...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\solanum...
Reading files in P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva...


In [None]:
report = pd.DataFrame(records)

display(report.head(25))


Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path,keep
0,acanthophora,acerifolium,8.9974,3750,5500,jpg,gbif,8c86b6326a6e0cca,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1928496814_gbif_2700.jpg,1
1,acanthophora,acerifolium,7.3703,3750,5500,jpg,gbif,bea48d0a2aac92b2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1928638909_gbif_2609.jpg,1
2,acanthophora,acerifolium,7.0359,3750,5500,jpg,gbif,b893b2a893b6a2e6,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1928803178_gbif_2510.jpg,1
3,acanthophora,acerifolium,7.873,3750,5500,jpg,gbif,98a6963cb69a8aca,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1929578053_gbif_2035.jpg,1
4,acanthophora,acerifolium,5.9595,3750,5500,jpg,gbif,96963696ca8adaf2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1929578350_gbif_2034.jpg,1
5,acanthophora,acerifolium,6.3095,3750,5500,jpg,gbif,b4938b8682beb2f2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1930342960_gbif_1533.jpg,1
6,acanthophora,acerifolium,6.4096,3750,5500,jpg,gbif,96a893832cb2a2c2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1930787117_gbif_1220.jpg,1
7,acanthophora,acerifolium,9.5827,6738,8934,jpg,idigbio,e6e62e3cc6879551,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_2ba4f510-8fb1-430b-a53c-833c6f8f7c45_idigbio_4044.jpeg,1
8,acanthophora,acerifolium,4.3827,3231,5008,jpg,gbif,8c0e26574accce88,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_437153638_gbif_6647.jpeg,1
9,acanthophora,acerifolium,5.7148,3260,5011,jpg,gbif,36379b933e8eccac,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_437784428_gbif_6461.jpeg,1


In [None]:
report.to_csv(os.path.join(root_folder, "Downloaded_images_report.csv"))

# 2 Detecting and removing duplicates

In [None]:
#report = pd.read_csv(os.path.join(root_folder, "Downloaded_images_report.csv"))
print(report.shape)

(8937, 11)


In [None]:
duplicates = report[report["hash"].duplicated(keep=False)].sort_values("hash")

# All duplicates will be kept until we mark them for deletion
duplicates["keep"] = 1

display(duplicates)
print(duplicates.shape)

Unnamed: 0.1,Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path,keep
2844,2844,holophylla,pseudocapsicum,0.0387,497,768,jpg,idigbio,0939260d383c2478,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_pseudocapsicum_ac8fd431-4a91-41ec-b284-079846b1583d_idigbio_1764.jpeg,1
2746,2746,holophylla,pseudocapsicum,0.0387,497,768,jpg,gbif,0939260d383c2478,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_pseudocapsicum_1056715543_gbif_4545.jpeg,1
5749,5749,petota,edinense,0.0697,496,768,jpg,idigbio,2325954e06898916,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_edinense_fa560f0c-502f-409d-952f-0c9aeac6cba8_idigbio_112.jpeg,1
5720,5720,petota,edinense,0.0697,496,768,jpg,gbif,2325954e06898916,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_edinense_1056561316_gbif_4579.jpeg,1
5745,5745,petota,edinense,0.061,493,768,jpg,idigbio,272f2d4dc3f32d0d,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_edinense_ae67415d-9025-451d-ab3f-eccc1390e733_idigbio_5154.jpeg,1
5717,5717,petota,edinense,0.061,493,768,jpg,gbif,272f2d4dc3f32d0d,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_edinense_1056122023_gbif_4794.jpeg,1
8196,8196,torva,lanceolatum,1.0476,2048,3362,jpg,gbif,288c363698909090,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_lanceolatum_144897344_gbif_6723.jpg,1
8531,8531,torva,lanceolatum,1.0249,2592,4255,jpg,idigbio,288c363698909090,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_lanceolatum_73f080f3-2b85-4f01-80fd-cd3854a45dbd_idigbio_4596.jpg,1
3110,3110,holophylla,pubigerum,5.7115,3228,5026,jpg,gbif,2c36672b3ae4b882,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_pubigerum_439119970_gbif_6071.jpeg,1
3060,3060,holophylla,pubigerum,5.7115,3228,5026,jpg,gbif,2c36672b3ae4b882,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_pubigerum_437455328_gbif_6557.jpeg,1


(190, 11)


In [None]:
dups_count_by_hash = duplicates.groupby(["hash"]).size().reset_index(name='count').sort_values(["count", "hash"], ascending=False)
display(dups_count_by_hash)
print(dups_count_by_hash.shape)

Unnamed: 0,hash,count
33,7476646c59572d39,6
17,451b169491cddd73,4
91,f89ab23c53935bd8,2
90,e9299696cae28c9b,2
89,e6c74f3e341e1b71,2
88,e36e0c0c183121b1,2
87,e22969c24c642663,2
86,ddf526363623e5e5,2
85,dabaca143636ece8,2
84,d699093e18ca0312,2


(92, 2)


We can see that we have 190 duplicated records out of which only 92 are unique images, many of them of the same size and resolution, some have different size but represent the same image and some of them are classified as being a member of a section and have the same picture classified as another, we need to remove these and make sure that the pictures are actually in the same place.

To break a tie and see what records we are keeping we will:
1. Say that absolute duplicates are ones such that they have the same:
    * hash
    * section
    * species
    * filesize_mb

   For these, it doesn't matter which ones we keep, let's keep one record and remove the rest.
1. For the records that are still duplicated on their `hash` but have different sizes, we'll set the `keep` column of the rows with the smallest size to 0 so that we can remove them.
1. Remove those records that have `keep = 0`. Out of those we will again get a report of the records that still have the same `hash`. These are the records that may have different section or species classification, for these we will determine which ones to keep by asking a biologist to make the final determination of which records are correct and which aren't, we'll manually set the `keep` value to 0 for those we want removed.
1. Remove the records that have a `keep = 0`.
1. Remove the `keep` columns.

This processing should leave us only with the best resolutions for the specimens and the correct images to be preprocessed for feature extraction.

## 2.1 Removal of absolute duplicates

In [None]:
absolute_columns = ["hash", "filesize_mb", "section", "species"]
duplicates.loc[duplicates.duplicated(subset=absolute_columns, keep="first"), "keep"] = 0

In [None]:
duplicates.sort_values(["hash", "keep"], ascending=False)

Unnamed: 0.1,Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path,keep
8416,8416,torva,lanceolatum,0.0458,468,768,jpg,idigbio,f89ab23c53935bd8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_lanceolatum_2cb4d6b2-f021-4717-8cf9-de2ee3854f28_idigbio_680.jpeg,1
8134,8134,torva,lanceolatum,0.0458,468,768,jpg,gbif,f89ab23c53935bd8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_lanceolatum_1055775695_gbif_4872.jpeg,0
2213,2213,holophylla,malacothrix,0.0362,489,768,jpg,idigbio,e9299696cae28c9b,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_malacothrix_b62b3b5c-8929-4431-98ab-0aa4350dd26e_idigbio_2682.jpeg,1
2197,2197,holophylla,malacothrix,0.0362,489,768,jpg,gbif,e9299696cae28c9b,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_malacothrix_1056408212_gbif_4720.jpeg,0
6008,6008,petota,oxycarpum,1.3979,2048,3235,jpg,gbif,e6c74f3e341e1b71,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_oxycarpum_144897416_gbif_6720.jpg,1
6036,6036,petota,oxycarpum,1.5676,2746,4338,jpg,idigbio,e6c74f3e341e1b71,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_oxycarpum_d30cf3ff-4f5e-4416-8f4d-0eaf901d85b2_idigbio_5726.jpg,1
6192,6192,petota,stenophyllidium,0.0415,488,768,jpg,idigbio,e36e0c0c183121b1,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stenophyllidium_0ad7d5a3-157d-4eea-91ee-4a1fd8214d41_idigbio_1622.jpeg,1
6195,6195,petota,stenophyllidium,0.0415,488,768,jpg,gbif,e36e0c0c183121b1,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stenophyllidium_1056990622_gbif_4474.jpeg,0
6724,6724,petota,stoloniferum,1.7507,1248,2000,jpg,gbif,e22969c24c642663,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stoloniferum_912440777_gbif_5018.jpg,1
6718,6718,petota,stoloniferum,1.7504,1248,2000,jpg,gbif,e22969c24c642663,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stoloniferum_912440740_gbif_5043.jpg,1


## 2.2 Pruning of smaller pictures that are have the same hash but different sizes

In [None]:
ordered_dups = duplicates[duplicates["keep"] == 1].sort_values(["hash", "filesize_mb"], ascending=False)
ordered_dups.loc[ordered_dups.duplicated(subset=["hash", "section", "species"], keep="first"), "keep"] = 0

In [None]:
# Update the original duplicates dataframe
duplicates.loc[duplicates.index.isin(ordered_dups[ordered_dups["keep"] == 0].index), "keep"] = 0

In [None]:
print(ordered_dups[ordered_dups["keep"] == 0].shape)
print(duplicates[duplicates["keep"] == 0].shape)

(27, 11)
(93, 11)


## 2.3 Pruning of duplicated records that are classified in multiple sections/species

In [None]:
reclassification_dups = ordered_dups[ordered_dups["keep"] == 1].drop("Unnamed: 0", axis=1)
reclassification_dups[reclassification_dups.duplicated("hash", keep=False)]

Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path,keep
4773,melongena,houstonii,0.0545,467,768,jpg,idigbio,c8229a4729b2e6e8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\melongena\melongena_houstonii_a25edb3c-eb4f-40eb-a9a9-2aa00578056f_idigbio_5006.jpeg,1
5033,micracantha,tampicense,0.0545,467,768,jpg,gbif,c8229a4729b2e6e8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\micracantha\micracantha_tampicense_1055692588_gbif_4898.jpeg,1
5786,petota,hintonii,2.8609,7079,10079,jpg,gbif,b1914eaccc9a9e92,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_hintonii_2430227118_gbif_420.jpeg,1
6105,petota,polyadenium,2.8609,7079,10079,jpg,gbif,b1914eaccc9a9e92,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_polyadenium_2430448876_gbif_407.jpeg,1
5886,petota,iopetalum,1.6025,2732,4359,jpg,idigbio,b02e4fcd47177658,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_iopetalum_4bff8a9b-552c-482a-9548-cc175d4d5802_idigbio_948.jpg,1
6006,petota,oxycarpum,1.4398,2048,3268,jpg,gbif,b02e4fcd47177658,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_oxycarpum_144897192_gbif_6725.jpg,1
4774,melongena,houstonii,0.0425,487,768,jpg,idigbio,6b651ac76d29272c,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\melongena\melongena_houstonii_a25edb3c-eb4f-40eb-a9a9-2aa00578056f_idigbio_5007.jpeg,1
5032,micracantha,tampicense,0.0425,487,768,jpg,gbif,6b651ac76d29272c,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\micracantha\micracantha_tampicense_1055692588_gbif_4897.jpeg,1
3537,melongena,angustifolium,0.5252,951,1440,jpg,gbif,43e7d1d3c20381d2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\melongena\melongena_angustifolium_1135482425_gbif_4313.jpeg,1
4923,melongena,rostratum,0.5252,951,1440,jpg,idigbio,43e7d1d3c20381d2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\melongena\melongena_rostratum_eac14d4c-209b-496a-a432-2efef51fb50b_idigbio_3480.jpeg,1


We need to consult with a taxonomist to get the records that must be kept and which need to be removed from the dataset.

This error of having the same picture in two different species or even sections most likely happened because of capturing errors or determination errors made by the taxonomist that uploaded the specimen to the online database.

It is noticed that the picture identified with hash `c8229a4729b2e6e8` and `6b651ac76d29272c` represent the same specimen but substantial changes were made to the specimen sheet.

In [None]:
indices_to_delete = [5033,
                     6105,
                     5886,
                     4774,5032,
                     3537
                     ]
reclassification_dups.loc[reclassification_dups.index.isin(indices_to_delete), "keep"] = 0



In [None]:
# Update the original duplicates dataframe
duplicates.loc[duplicates.index.isin(reclassification_dups[reclassification_dups["keep"] == 0].index), "keep"] = 0

In [None]:
# Update the original report to remove all duplicates
report.loc[report.index.isin(duplicates[duplicates["keep"] == 0].index), "keep"] = 0

In [None]:
dedup_images = report[report["keep"] == 1].drop(["keep", "Unnamed: 0"], axis=1)
dedup_images.to_csv(os.path.join(root_folder, "Downloaded_dedup_images_report.csv"))

# 3 Metrics on deduplicated images

In [None]:
dedup_images = pd.read_csv(os.path.join(root_folder, "Downloaded_dedup_images_report.csv"))
dedup_images.describe(include="all")

Unnamed: 0.1,Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path
count,8838.0,8838,8838,8838.0,8838.0,8838.0,8838,8838,8838,8838
unique,,21,127,,,,1,2,8838,8838
top,,petota,lanceolatum,,,,jpg,gbif,8aadb6828a12aa8c,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_ferrugineum_b3b3c31f-accb-4821-9838-76841cbe14e6_idigbio_2464.jpeg
freq,,1734,533,,,,8838,6314,1,1
mean,4460.286264,,,6.191186,3640.765897,5148.79905,,,,
std,2582.206533,,,8.051593,2122.25254,2743.151575,,,,
min,0.0,,,0.0064,120.0,160.0,,,,
25%,2224.25,,,1.8615,1216.0,2000.0,,,,
50%,4451.5,,,5.87305,3750.0,5500.0,,,,
75%,6700.75,,,8.962625,6023.25,8679.25,,,,


## Count per section

In [None]:
count_per_section = dedup_images.groupby(["section"]).size().reset_index(name='count').sort_values("count", ascending=False)
display(count_per_section)
#plot_histogram(count_per_section, 1, 1)

Unnamed: 0,section,count
17,petota,1734
10,holophylla,1449
13,melongena,1442
20,torva,1332
3,brevantherum,811
19,solanum,577
5,dulcamara,389
8,herposolanum,224
14,micracantha,180
11,lasiocarpa,178


## Averages and std for sizes and resolution per section

In [None]:
dedup_images[["section", "filesize_mb", "width","height"]].describe()

Unnamed: 0,filesize_mb,width,height
count,8838.0,8838.0,8838.0
mean,6.191186,3640.765897,5148.79905
std,8.051593,2122.25254,2743.151575
min,0.0064,120.0,160.0
25%,1.8615,1216.0,2000.0
50%,5.87305,3750.0,5500.0
75%,8.962625,6023.25,8679.25
max,96.6221,7319.0,10319.0


In [None]:
dedup_images[dedup_images["filesize_mb"] == 96.622100]

Unnamed: 0.1,Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path
6029,6087,petota,polyadenium,96.6221,7319,10319,jpg,gbif,dcd8cd4d7b1b9d97,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_polyadenium_1322105893_gbif_3652.jpeg


In [None]:
avg_per_section = dedup_images[["section", "filesize_mb", "width","height"]].groupby(
    ["section"], as_index=False)[["filesize_mb", "width", "height"]].mean()
display(avg_per_section)

Unnamed: 0,section,filesize_mb,width,height
0,acanthophora,4.982104,3384.14,4882.31
1,anarrhichomenum,3.327075,3623.23,5183.41
2,basarthrum,4.919305,3570.309524,5302.952381
3,brevantherum,6.603759,3879.697904,5459.734895
4,crinitum,5.918309,3724.075758,5085.090909
5,dulcamara,5.46892,3634.74036,5208.511568
6,erythrotrichum,9.045133,4754.666667,6655.0
7,gonatotrichum,10.414095,4834.189655,6540.517241
8,herposolanum,6.437967,3533.379464,4877.651786
9,herpystrichum,9.29355,5271.0,7256.5


In [None]:
std_per_section = dedup_images[["section", "filesize_mb", "width","height"]].groupby(
    ["section"], as_index=False)[["filesize_mb", "width", "height"]].std()
display(std_per_section)

Unnamed: 0,section,filesize_mb,width,height
0,acanthophora,3.782009,1946.907701,2537.533451
1,anarrhichomenum,4.120618,2518.173347,3421.364879
2,basarthrum,3.752585,1848.003214,2593.533844
3,brevantherum,7.31667,2072.632264,2670.786588
4,crinitum,4.663881,1979.131964,2781.564594
5,dulcamara,3.927902,1950.253127,2493.599843
6,erythrotrichum,1.889407,1740.133711,2000.518683
7,gonatotrichum,17.7748,2460.044939,3254.423161
8,herposolanum,7.09233,2050.892957,2657.993259
9,herpystrichum,1.83572,2151.018828,2484.066122


## How many images are smaller than 512x512 and to which section they belong?

Remove them if feasible.

### Width

In [None]:
#dedup_images.drop("Unnamed: 0", axis=1, inplace=True)
dedup_images[dedup_images["width"] < 512].sort_values("width", ascending=True).head(10)

Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path
6244,petota,stoloniferum,0.0064,120,160,jpg,gbif,d0b89a9ab2b0f1d8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stoloniferum_1057377186_gbif_4383.jpeg
2223,holophylla,nigricans,0.0577,455,768,jpg,gbif,46563625339eb6e8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_nigricans_1056273794_gbif_4749.jpeg
836,brevantherum,umbellatum,0.044,458,768,jpg,gbif,98a7b39a9e9ac2e8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\brevantherum\brevantherum_umbellatum_1056789629_gbif_4532.jpeg
6119,petota,schenckii,0.0479,461,768,jpg,idigbio,9e9a323b3a9579e8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_schenckii_8145f46f-685f-4c43-b174-f0c766a70e5e_idigbio_3747.jpeg
2722,holophylla,pseudocapsicum,0.0467,461,768,jpg,gbif,ac2c3e96369698e0,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\holophylla\holophylla_pseudocapsicum_1056213474_gbif_4770.jpeg
8828,torva,torvum,0.0445,461,768,jpg,idigbio,ea3ea636938a3ae8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_torvum_b94c0065-67d9-43d8-9b96-e9b68596ff33_idigbio_4724.jpeg
8429,torva,lanceolatum,0.0531,462,768,jpg,idigbio,8c4e4c5a4e7cb0c8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_lanceolatum_67a09a8c-3bef-4916-b9e0-6148c40f9f63_idigbio_4198.jpeg
8070,torva,lanceolatum,0.0538,462,768,jpg,gbif,8e8e16176313cad8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_lanceolatum_1057376101_gbif_4386.jpeg
8835,torva,torvum,0.0538,462,768,jpg,idigbio,383917333c8eb2c8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\torva\torva_torvum_f782d965-c5c9-4d01-a3ae-440141a62212_idigbio_3164.jpeg
3651,melongena,angustifolium,0.0508,462,768,jpg,idigbio,d8989a9676b2bad8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\melongena\melongena_angustifolium_2e9167b4-6d90-43f1-bbf9-b8b945dd5040_idigbio_1900.jpeg


### Height

In [None]:
dedup_images[dedup_images["width"] < 512].sort_values("height", ascending=True).head(10)

Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path
6244,petota,stoloniferum,0.0064,120,160,jpg,gbif,d0b89a9ab2b0f1d8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stoloniferum_1057377186_gbif_4383.jpeg
69,acanthophora,myriacanthum,0.0434,504,768,jpg,idigbio,d699093e18ca0312,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_myriacanthum_41c09901-0b87-4be2-8e98-673a7f93ff1d_idigbio_3163.jpeg
5774,petota,hjertingii,0.0647,500,768,jpg,idigbio,b18fbda12f6b3b63,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_hjertingii_ddc08aec-8bf6-4a16-9cfd-866f935f5cdf_idigbio_3167.jpeg
5808,petota,iopetalum,0.0425,500,768,jpg,gbif,d31b2bdcb0e9f170,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_iopetalum_1057308116_gbif_4403.jpeg
5899,petota,michoacanum,0.0442,496,768,jpg,gbif,c40cf28361659549,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_michoacanum_1055440673_gbif_4943.jpeg
6056,petota,polyadenium,0.0551,493,768,jpg,idigbio,c94533db8cc6d9d9,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_polyadenium_35d4f104-3e7c-4d62-ab61-a44d7d1f537f_idigbio_111.jpeg
6119,petota,schenckii,0.0479,461,768,jpg,idigbio,9e9a323b3a9579e8,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_schenckii_8145f46f-685f-4c43-b174-f0c766a70e5e_idigbio_3747.jpeg
6120,petota,schenckii,0.0417,488,768,jpg,idigbio,9199333838946569,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_schenckii_8145f46f-685f-4c43-b174-f0c766a70e5e_idigbio_3748.jpeg
6131,petota,stenophyllidium,0.0415,488,768,jpg,idigbio,e36e0c0c183121b1,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stenophyllidium_0ad7d5a3-157d-4eea-91ee-4a1fd8214d41_idigbio_1622.jpeg
6227,petota,stoloniferum,0.0458,500,768,jpg,gbif,c6c3c3c1c927372f,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\petota\petota_stoloniferum_1055662979_gbif_4902.jpeg


### Removing very small pictures

Given the results of the count, there's only one specimen that is too small to be upscaled to 512x512. The rest of the pictures have values less than 512 but they are close enough that distortion may not be too bad.

In [None]:
display(dedup_images.shape)
big_pictures = dedup_images.drop(dedup_images[dedup_images["height"] < 200].index)
display(big_pictures.shape)

(8838, 9)

(8837, 9)

In [None]:
big_pictures.to_csv(os.path.join(root_folder, "Downloaded_dedup_images_nosmall_report.csv"),
                    index=False)
#big_pictures = pd.read_csv(os.path.join(root_folder, "Downloaded_dedup_images_nosmall_report.csv"))


## Remove sections that have less than 100 pictures TBD

In [None]:
count_per_section = big_pictures.groupby(["section"]).size().reset_index(name='count').sort_values("count", ascending=False)
display(count_per_section)

Unnamed: 0,section,count
17,petota,1733
10,holophylla,1449
13,melongena,1442
20,torva,1332
3,brevantherum,811
19,solanum,577
5,dulcamara,389
8,herposolanum,224
14,micracantha,180
11,lasiocarpa,178


In [None]:
sections_of_interest = count_per_section[count_per_section["count"] >= 100]["section"]
display(sections_of_interest)

17             petota
10         holophylla
13          melongena
20              torva
3        brevantherum
19            solanum
5           dulcamara
8        herposolanum
14        micracantha
11         lasiocarpa
0        acanthophora
1     anarrhichomenum
Name: section, dtype: object

In [None]:
sections_with_100 = big_pictures[big_pictures["section"].isin(sections_of_interest)]
display(sections_with_100.shape)

previous_records = big_pictures.shape[0]
new_records = sections_with_100.shape[0]

print(f"We went from {previous_records} to {new_records}, {previous_records - new_records} were records were removed")

(8515, 9)

We went from 8837 to 8515, 322 were records were removed


In [None]:
#sections_with_100.to_csv(os.path.join(root_folder, "Downloaded_dedup_images_nosmall_100picspersection.csv"),
#                    index=False)
sections_with_100 = pd.read_csv(os.path.join(root_folder, "Downloaded_dedup_images_nosmall_100picspersection.csv"))

In [15]:
sections_with_100.head(5)

Unnamed: 0,section,species,filesize_mb,width,height,image_type,source,hash,full_path
0,acanthophora,acerifolium,8.9974,3750,5500,jpg,gbif,8c86b6326a6e0cca,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1928496814_gbif_2700.jpg
1,acanthophora,acerifolium,7.3703,3750,5500,jpg,gbif,bea48d0a2aac92b2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1928638909_gbif_2609.jpg
2,acanthophora,acerifolium,7.0359,3750,5500,jpg,gbif,b893b2a893b6a2e6,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1928803178_gbif_2510.jpg
3,acanthophora,acerifolium,7.873,3750,5500,jpg,gbif,98a6963cb69a8aca,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1929578053_gbif_2035.jpg
4,acanthophora,acerifolium,5.9595,3750,5500,jpg,gbif,96963696ca8adaf2,P:/CODE/ITESM/tesis-dataset-downloader/solanum_output\acanthophora\acanthophora_acerifolium_1929578350_gbif_2034.jpg


# 4 Resizing of relevant images to build final pre-processed dataset TBD

In [40]:
def win2linux_path(path: str) -> str:
    path = path.replace("\\", "/")
    return path.replace("P:", "/mnt/pepper")

def linux2win_path(path: str) -> str:
    path = path.replace("\\", "/")
    return path.replace("/mnt/pepper", "P:")

def get_filename(path: str) -> str:
    return path.split("/")[-1]

def create_folders(path: str):
    if not os.path.exists(path):
        os.makedirs(path)


In [44]:
%%time
# Output to a folder called resized that respects each of the folders for each section
resized_folder = os.path.join(root_folder, "z_clean_resized")
create_folders(resized_folder)

# Build a new dataframe with the new images, size in MB and save it to csv
# Print the reduction in size for the dataset
new_images = []

# For each of the records, read the input image and resize it
for index, row in sections_with_100.iterrows():
    
    # Change size
    image_path = win2linux_path(row["full_path"])
    image = Image.open(image_path)
    image = image.resize((512, 512))

    # Output folder
    output_folder = os.path.join(resized_folder, row["section"])
    create_folders(output_folder)

    output_file = os.path.join(output_folder, get_filename(image_path))
    image.save(output_file)

    # Size (MB)
    filesize_mb = round(os.path.getsize(output_file) / 1024 / 1024, 4)
    
    row = {
        "section": row["section"],
        "species": row["species"],
        "filesize_mb": filesize_mb,
        "image_type": row["image_type"],
        "source": row["source"],
        "full_path": linux2win_path(output_file),
    }

    new_images.append(row)

resized_images = pd.DataFrame(new_images)



CPU times: user 51min 36s, sys: 2min 48s, total: 54min 24s
Wall time: 59min 32s


In [45]:
%%time
#resized_images.to_csv(os.path.join(resized_folder, "images_dedup_512x512_100picspersection.csv"), index=False)
resized_images = pd.read_csv(os.path.join(resized_folder, "images_dedup_512x512_100picspersection.csv")

CPU times: user 126 ms, sys: 16.6 ms, total: 143 ms
Wall time: 533 ms


In [51]:
size_before = sections_with_100["filesize_mb"].sum()
size_after = resized_images["filesize_mb"].sum()

print(f"Dataset was resized from {size_before} MB to {size_after} MB, a reduction of {1 - (size_after / size_before)}%")
print(f"Before shape: {sections_with_100.shape}\nAfter shape: {resized_images.shape}")


Dataset was resized from 52669.9167 MB to 251.24760000000003 MB, a reduction of 0.9952297703178254%
Before shape: (8515, 9)
After shape: (8515, 6)


## Zip it up in a file so that the dataset can be uploaded to the DGX-1

In [None]:
!ls '/mnt/pepper/CODE/ITESM/tesis-dataset-downloader/solanum_output/z_clean_resized'
!zip -r '/mnt/pepper/CODE/ITESM/tesis-dataset-downloader/solanum_output/clean_resized_dataset.zip' "/mnt/pepper/CODE/ITESM/tesis-dataset-downloader/solanum_output/z_clean_resized"

In [None]:
!ls -lh '/mnt/pepper/CODE/ITESM/tesis-dataset-downloader/solanum_output/'