In [1]:
import os
import shutil
import pandas as pd

# Import dataset

In [2]:
!wget https://github.com/fsnobre/datasets/raw/main/2020-02-14_InfraredSolarModules.zip

--2024-01-07 11:57:58--  https://github.com/fsnobre/datasets/raw/main/2020-02-14_InfraredSolarModules.zip
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/fsnobre/OrganizingDatasets/raw/main/2020-02-14_InfraredSolarModules.zip [following]
--2024-01-07 11:57:59--  https://github.com/fsnobre/OrganizingDatasets/raw/main/2020-02-14_InfraredSolarModules.zip
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/fsnobre/OrganizingDatasets/main/2020-02-14_InfraredSolarModules.zip [following]
--2024-01-07 11:57:59--  https://raw.githubusercontent.com/fsnobre/OrganizingDatasets/main/2020-02-14_InfraredSolarModules.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting 

In [3]:
import zipfile

# Unzip the downloaded archive into the current working directory.
# The context manager closes the archive handle even when extraction fails
# (for example on a truncated download), which the manual close() did not.
with zipfile.ZipFile("/content/2020-02-14_InfraredSolarModules.zip") as zip_ref:
    zip_ref.extractall()

In [4]:
# Folder created by the extraction step and the JSON metadata file inside it
path_dataset = "/content/2020-02-14_InfraredSolarModules/InfraredSolarModules/"
path_dataset_json = path_dataset + "module_metadata.json"

# Load the metadata table (the JSON is read with orient="index")
df = pd.read_json(path_dataset_json, orient="index")

# Prefix the stored image paths with the dataset folder
df["image_filepath"] = path_dataset + df["image_filepath"]

# Number of rows in the metadata table, used later to compute percentages
tam_dataset = len(df)

In [5]:
# Collect the distinct values of the 'anomaly_class' column (the dataset classes)
class_names = df.loc[:, 'anomaly_class'].unique()
class_names

array(['No-Anomaly', 'Cell', 'Hot-Spot', 'Offline-Module', 'Vegetation',
       'Diode', 'Shadowing', 'Cracking', 'Diode-Multi', 'Hot-Spot-Multi',
       'Cell-Multi', 'Soiling'], dtype=object)

In [6]:
# Count how many images belong to each class and their share of the dataset
for class_name in class_names:
    # Count the matching rows directly. The previous `.count()[0]` used
    # positional `[]` access on a label-indexed Series (deprecated in pandas)
    # and counted non-null values of the first column rather than rows.
    count = len(df.loc[df['anomaly_class'] == class_name])
    print(f"Class: {class_name} with {count} images, representing {(count/tam_dataset)*100:.1f}% of the Dataset")

Class: No-Anomaly with 10000 images, representing 50.0% of the Dataset
Class: Cell with 1877 images, representing 9.4% of the Dataset
Class: Hot-Spot with 249 images, representing 1.2% of the Dataset
Class: Offline-Module with 827 images, representing 4.1% of the Dataset
Class: Vegetation with 1639 images, representing 8.2% of the Dataset
Class: Diode with 1499 images, representing 7.5% of the Dataset
Class: Shadowing with 1056 images, representing 5.3% of the Dataset
Class: Cracking with 940 images, representing 4.7% of the Dataset
Class: Diode-Multi with 175 images, representing 0.9% of the Dataset
Class: Hot-Spot-Multi with 246 images, representing 1.2% of the Dataset
Class: Cell-Multi with 1288 images, representing 6.4% of the Dataset
Class: Soiling with 204 images, representing 1.0% of the Dataset


Checking with original information from dataset:

|Class Name | Images | Description|
|-|-|-|
|Cell	|1,877	|Hot spot occurring with square geometry in single cell.|
|Cell-Multi	|1,288	|Hot spots occurring with square geometry in multiple cells.|
|Cracking	|941*|	Module anomaly caused by cracking on module surface.|
|Hot-Spot	|251*|	Hot spot on a thin film module.|
|Hot-Spot-Multi	|247*|	Multiple hot spots on a thin film module.|
|Shadowing	|1056|	Sunlight obstructed by vegetation, man-made structures, or adjacent rows.|
|Diode	|1,499|	Activated bypass diode, typically 1/3 of module.|
|Diode-Multi	|175|	Multiple activated bypass diodes, typically affecting 2/3 of module.|
|Vegetation	|1,639|	Panels blocked by vegetation.|
|Soiling	|205*|	Dirt, dust, or other debris on surface of module.|
|Offline-Module	|828*|	Entire module is heated.|
|No-Anomaly	|10,000|	Nominal solar module.|

\* Differs from the number of images counted above. Note that the counts in this table add up to 20,006, whereas the images counted above total 20,000.

In [7]:
# Create a new folder for the Dataset with internal folders, each internal folder referring to a class
path = "/content/InfraredSolarModules-Modificado"
os.mkdir(path)

for class_name in class_names:
    os.mkdir(os.path.join(path, class_name))

In [8]:
# Copy each file from the original folder to specific folders
for index, row in df.iterrows():
    filename = str(row.name) + ".jpg"
    file_path = os.path.join(row['anomaly_class'], filename)
    shutil.copyfile(row["image_filepath"], os.path.join(path, file_path))

In [9]:
# Count the number of files in each folder of each class (it must match the count above)
for class_name in class_names:
    cont = 0

    for p in os.scandir(os.path.join(path, class_name)):
        if p.is_file():
            cont += 1

    print(f"Class: {class_name} with {cont} images")

Class: No-Anomaly with 10000 images
Class: Cell with 1877 images
Class: Hot-Spot with 249 images
Class: Offline-Module with 827 images
Class: Vegetation with 1639 images
Class: Diode with 1499 images
Class: Shadowing with 1056 images
Class: Cracking with 940 images
Class: Diode-Multi with 175 images
Class: Hot-Spot-Multi with 246 images
Class: Cell-Multi with 1288 images
Class: Soiling with 204 images
