## Lib

In [81]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

from pprint import pprint


import sys
# Add the parent directory to the import path; presumably this is what lets the
# project-local `utils` package imported below resolve — confirm
sys.path.append("../")

from utils.file_processing import move_files_to_folder

## Vars

In [82]:
# Location of the cleaned CSV (features + image dimensions), relative to this notebook
csv_url = "../../storage/datas/csv/clean/cleaned_dataset_with_features_and_dimensions.csv"

# Read the whole file into a DataFrame
df = pd.read_csv(filepath_or_buffer=csv_url, low_memory=False)

## 1 - Choose labels to keep

In [83]:
# Report (rows, columns) of the frame as loaded, before any species filtering
print(f"Initial df shape: {df.shape}")

Initial df shape: (365252, 15)


In [84]:
# Preview the first rows to check which columns are available
df.head()

Unnamed: 0,image_lien,label,family,phylum,species,canonicalName,class,genus,order,width,height,red_color_mean,green_color_mean,blue_color_mean,all_color_mean
0,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria polymorpha,Xylariaceae,Ascomycota,Xylaria polymorpha,Xylaria polymorpha,Sordariomycetes,Xylaria,Xylariales,320,240,77.321185,72.687839,70.582227,73.530417
1,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria magnoliae,Xylariaceae,Ascomycota,Xylaria magnoliae,Xylaria magnoliae,Sordariomycetes,Xylaria,Xylariales,320,240,71.123307,64.609049,62.255977,65.996111
2,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria hypoxylon,Xylariaceae,Ascomycota,Xylaria hypoxylon,Xylaria hypoxylon,Sordariomycetes,Xylaria,Xylariales,320,240,84.582305,70.401758,54.611263,69.865109
3,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria hypoxylon,Xylariaceae,Ascomycota,Xylaria hypoxylon,Xylaria hypoxylon,Sordariomycetes,Xylaria,Xylariales,240,320,28.234687,27.118841,23.88349,26.412339
4,/home/guillaume/Téléchargements/mushroom-datas...,Xeromphalina campanella,Mycenaceae,Basidiomycota,Xeromphalina campanella,Xeromphalina campanella,Agaricomycetes,Xeromphalina,Agaricales,240,320,41.552617,35.021029,26.419349,34.330998


In [85]:
# Rank species by number of rows and keep the names of the ten most frequent ones
top_10_species = df['species'].value_counts().head(10).index.tolist()

# Print the label once and pretty-print the list itself: handing the whole f-string
# to pprint() displays the quoted repr of the string, wrapped over several lines
print("Top 10 species:")
pprint(top_10_species)

("Top 10 species: ['Pleurotus ostreatus', 'Trametes versicolor', 'Pluteus "
 "cervinus', 'Lepista nuda', 'Boletus edulis', 'Psilocybe zapotecorum', "
 "'Hypholoma fasciculare', 'Psilocybe cyanescens', 'Ganoderma applanatum', "
 "'Galerina marginata']")


In [86]:
# Restrict the frame to the rows whose species is one of the ten most frequent
df_top_10_species = df.loc[df['species'].isin(top_10_species)]
print(f"Top 10 species shape: {df_top_10_species.shape}")

Top 10 species shape: (13674, 15)


In [87]:
# Number of rows per retained species (used to size the "other" sample below)
df_top_10_species["species"].value_counts()

species
Pleurotus ostreatus      1605
Trametes versicolor      1574
Pluteus cervinus         1400
Lepista nuda             1385
Boletus edulis           1380
Psilocybe zapotecorum    1344
Hypholoma fasciculare    1272
Psilocybe cyanescens     1251
Ganoderma applanatum     1247
Galerina marginata       1216
Name: count, dtype: int64

In [88]:
# Add 1400 random rows from the remaining species, relabelled as "other".
# - random_state pins the draw so that re-running the notebook selects the same rows
# - .assign() returns a new frame instead of writing into a slice of df
#   (the in-place column assignment raised SettingWithCopyWarning)
# Only the "species" column is relabelled; the other columns keep their original values.
df_other_species = (
    df[~df['species'].isin(top_10_species)]
    .sample(1400, random_state=42)
    .assign(species="other")
)

# Concatenate top 10 species with other species
df = pd.concat([df_top_10_species, df_other_species])

In [89]:
# Shape after combining the top-10 rows with the sampled "other" rows
print(f"New df shape: {df.shape}")

New df shape: (15074, 15)


In [90]:
# Save the reduced frame (top 10 species + "other") in the same folder as the source CSV;
# index=False keeps the row index out of the file
df.to_csv("../../storage/datas/csv/clean/cleaned_dataset_with_features_and_dimensions_top_10_species.csv", index=False)

## 2 - Keep only imgs from top 10 species

In [77]:
# Read back the top-10-species CSV
csv_url = "../../storage/datas/csv/clean/cleaned_dataset_with_features_and_dimensions_top_10_species.csv"
df = pd.read_csv(filepath_or_buffer=csv_url, low_memory=False)

# Remember the (rows, columns) pair so it can be compared after the image clean-up
old_shape = df.shape

In [79]:
# Look at the image path stored under index label 0 to see where the image files are expected
df["image_lien"][0]

'/home/guillaume/Téléchargements/mushroom-dataset/imgs_in_cleaned_dataset/66.jpg'

In [80]:
# Remove images that are not in the dataset
# NOTE(review): csv_url already holds this same path from the reload cell above; this reassignment is redundant
csv_url = "../../storage/datas/csv/clean/cleaned_dataset_with_features_and_dimensions_top_10_species.csv"

# Project helper from utils.file_processing; its body is not visible in this notebook.
# Going by the comments in this cell it presumably moves image files and rewrites the CSV at csv_url — confirm
move_files_to_folder(csv_url = csv_url)

# Refer to ../logs/move_files_to_folder.log for more information

# Reload df
df = pd.read_csv(csv_url, low_memory=False)
new_shape = df.shape
# NOTE(review): the recorded output of this cell shows new shape (0, 15), i.e. every row was dropped —
# check the image paths / helper log before relying on the rewritten CSV
print(f"old shape: {old_shape} \n new shape: {new_shape} \n {old_shape[0] - new_shape[0]} images removed from the dataset")

# The cleaned dataset is saved by the function

old shape: (15074, 15) 
 new shape: (0, 15) 
 15074 images removed from the dataset


In [None]:
# Folder paths noted for reference. They are kept as comments because bare paths are not
# valid Python and make this code cell fail with a SyntaxError when it is executed.
# /home/guillaume/Téléchargements/mushroom-dataset/imgs_in_cleaned_dataset
# /home/guillaume/Téléchargements/mushroom-dataset/destination

In [None]:
""" REVERSE """

In [None]:
# If needed, reverse the operation (Do not reverse cleaned dataset new shape ; it will be different ; use section 1 to recreate the cleaned dataset)
import os
import shutil

# NOTE(review): both paths contain "mushroom_images_dataset/", unlike the image path displayed
# earlier in this notebook (.../mushroom-dataset/imgs_in_cleaned_dataset/66.jpg) — confirm which is right
source = "/home/guillaume/Téléchargements/mushroom-dataset/mushroom_images_dataset/imgs_in_cleaned_dataset"
destination = "/home/guillaume/Téléchargements/mushroom-dataset/mushroom_images_dataset/observations_mushroom"

# List the source first so that a missing source folder fails before anything is created
entries = os.listdir(source)

# Make sure the target is an existing directory before moving anything: when `destination`
# does not exist, shutil.move() renames each file to that exact path instead of moving it
# into a folder, so every file would overwrite the previous one.
os.makedirs(destination, exist_ok=True)

# Move every entry of `source` into `destination`
for file in entries:
    shutil.move(os.path.join(source, file), destination)

In [None]:
""" END REVERSE """

In [None]:
# NOTE(review): `cleaned_dataset` is not assigned in any earlier cell shown in this notebook (it is
# first read in the "Remove bad imgs" section below) and the target path is an empty string —
# this cell looks like an unfinished leftover; confirm whether it can be dropped
cleaned_dataset.to_csv("", index=False)

## 3 - Remove bad imgs from the dataset

In [None]:
# Same import-path setup as in the first cell of the notebook, repeated here
import sys
sys.path.append("../")

# Project helper called in the following cell to produce the image-list CSV
from utils.file_processing import make_file_list_as_csv

In [None]:
# Get good imgs dataset
# make_file_list_as_csv is a project helper whose body is not shown here; presumably it writes
# good_imgs_dataset.csv, which is read back on the next line — confirm
make_file_list_as_csv(file_name="good_imgs_dataset")
# NOTE(review): good_imgs_df is not used by any later cell shown in this notebook
good_imgs_df = pd.read_csv("../../storage/datas/csv/clean/good_imgs_dataset.csv")

In [None]:
# Get bad imgs dataset
# Use the same project helper as for the good images: no `main` is imported or defined in
# the cells of this notebook, so the previous `main(...)` call raised a NameError.
# Presumably the helper writes bad_imgs_dataset.csv, which is read back on the next line — confirm
make_file_list_as_csv(file_name="bad_imgs_dataset")
bad_imgs_df = pd.read_csv("../../storage/datas/csv/clean/bad_imgs_dataset.csv")

In [None]:
# Load the top-10-species dataset (with features) that the bad images will be removed from
# NOTE(review): this file name has no "_and_dimensions" part, unlike the CSV saved in section 1 —
# confirm this is the intended input
cleaned_dataset = pd.read_csv(
    filepath_or_buffer="../../storage/datas/csv/clean/cleaned_dataset_with_features_top_10_species.csv"
)

In [None]:
# Keep only the rows whose "id" does not appear among the bad images
cleaned_dataset = cleaned_dataset.loc[~cleaned_dataset["id"].isin(bad_imgs_df["id"])]

# Write the filtered frame to its own CSV, leaving the row index out of the file
cleaned_dataset.to_csv(
    "../../storage/datas/csv/clean/cleaned_dataset_with_features_top_10_species_good_imgs_only.csv",
    index=False,
)