## 1 - Data Cleaning

### Lib

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

### Vars

In [2]:
# Path of the raw observations CSV, relative to this notebook
raw_csv_url = "../../storage/datas/csv/raw/observations_mushroom.csv"

# Load the CSV into the raw DataFrame
observation_mushroom_raw = pd.read_csv(
    filepath_or_buffer=raw_csv_url,
    low_memory=False,
)

### Describe

In [3]:
# Row count of the untouched raw frame, kept for later reporting
initial_observation_numbers = len(observation_mushroom_raw)
observation_mushroom_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647623 entries, 0 to 647622
Data columns (total 33 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   image_lien                647623 non-null  object 
 1   image_id                  647623 non-null  int64  
 2   observation               647623 non-null  object 
 3   label                     647623 non-null  object 
 4   image_url                 647623 non-null  object 
 5   user                      647623 non-null  object 
 6   date                      647623 non-null  object 
 7   gbif_info/kingdom         613711 non-null  object 
 8   gbif_info/family          585083 non-null  object 
 9   gbif_info/speciesKey      411076 non-null  float64
 10  gbif_info/rank            613749 non-null  object 
 11  gbif_info/phylum          610016 non-null  object 
 12  gbif_info/orderKey        606009 non-null  float64
 13  gbif_info/species         411076 non-null  o

### Duplicated values

In [4]:
# Drop rows that repeat an earlier row in full, then record how many rows remain
observation_mushroom_raw_no_duplicates = observation_mushroom_raw.drop_duplicates(keep="first")
observation_numbers_no_duplicated = len(observation_mushroom_raw_no_duplicates)

### Remove unnecessary columns

In [5]:
# Column count before pruning, kept for later reporting
initial_columns_nb = len(observation_mushroom_raw_no_duplicates.columns)

# Columns removed from the working frame
columns_to_delete = [
    "observation", "image_url", "user", "date",
    "gbif_info/speciesKey", "gbif_info/orderKey", "gbif_info/confidence",
    "gbif_info/classKey", "gbif_info/matchType", "gbif_info/familyKey",
    "gbif_info/status", "gbif_info/usageKey", "gbif_info/kingdomKey",
    "gbif_info/genusKey", "gbif_info/phylumKey", "gbif_info/synonym",
    "thumbnail", "location",
    "gbif_info/note", "gbif_info", "gbif_info/scientificName",
    "image_id",
]

# `columns=` already targets the column axis, so no separate axis argument is needed
observation_mushroom_raw_no_duplicates_columns_cleaned = (
    observation_mushroom_raw_no_duplicates.drop(columns=columns_to_delete)
)
new_columns_nb = len(observation_mushroom_raw_no_duplicates_columns_cleaned.columns)

### Remove 'gbif' name in columns

In [6]:
# Maps every "gbif_info/<field>" column label to the bare "<field>" name
columns_rename = {
    f"gbif_info/{field}": field
    for field in (
        "kingdom", "phylum", "class", "order", "family",
        "genus", "species", "canonicalName", "rank",
    )
}

In [7]:
# Apply the column-label mapping; labels absent from the mapping are left as they are
observation_mushroom_raw_no_duplicates_columns_cleaned_renamed = (
    observation_mushroom_raw_no_duplicates_columns_cleaned
    .rename(mapper=columns_rename, axis="columns")
)

In [8]:
# Preview the first five rows after renaming
observation_mushroom_raw_no_duplicates_columns_cleaned_renamed.head(n=5)

Unnamed: 0,image_lien,label,kingdom,family,rank,phylum,species,canonicalName,class,genus,order
0,1.jpg,Xylaria polymorpha,Fungi,Xylariaceae,SPECIES,Ascomycota,Xylaria polymorpha,Xylaria polymorpha,Sordariomycetes,Xylaria,Xylariales
1,2.jpg,Xylaria magnoliae,Fungi,Xylariaceae,SPECIES,Ascomycota,Xylaria magnoliae,Xylaria magnoliae,Sordariomycetes,Xylaria,Xylariales
2,3.jpg,Xylaria hypoxylon,Fungi,Xylariaceae,SPECIES,Ascomycota,Xylaria hypoxylon,Xylaria hypoxylon,Sordariomycetes,Xylaria,Xylariales
3,4.jpg,Xylaria hypoxylon,Fungi,Xylariaceae,SPECIES,Ascomycota,Xylaria hypoxylon,Xylaria hypoxylon,Sordariomycetes,Xylaria,Xylariales
4,5.jpg,Xeromphalina,Fungi,Mycenaceae,GENUS,Basidiomycota,,Xeromphalina,Agaricomycetes,Xeromphalina,Agaricales


### Remove non Fungi images and observations

In [9]:
# Keep only the rows whose 'kingdom' equals "Fungi", then count them
observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi = (
    observation_mushroom_raw_no_duplicates_columns_cleaned_renamed.loc[
        observation_mushroom_raw_no_duplicates_columns_cleaned_renamed['kingdom'].eq("Fungi")
    ]
)
nb_imgs_kingdom_is_fungi = len(observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi)


In [10]:
# Number of missing values per column in the Fungi-only frame
observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi.isnull().sum()

image_lien            0
label                 0
kingdom               0
family            27640
rank                  0
phylum             2242
species          197425
canonicalName         0
class              3830
genus             35129
order              5424
dtype: int64

### NULL values

In [11]:
# Discard every row that has at least one missing value
observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null = (
    observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi
    .dropna(axis=0, how="any")
)

In [12]:
# Row count after removing rows with missing values. The frame's own shape is
# read directly; building a boolean .isna() copy first is not needed for this.
number_imgs_no_null = observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null.shape[0]

### Keep only rows whose 'rank' is "SPECIES"

In [13]:
# Keep only the rows whose 'rank' equals 'SPECIES', then count them
observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null_sorted_rank = (
    observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null.loc[
        observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null['rank'].eq('SPECIES')
    ]
)
nb_imgs_sorted_rank = len(observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null_sorted_rank)

### Prepare data

Scientific species classification:
- Domain
- Kingdom
- Phylum
- Class
- Order
- Family
- Genus
- Species

In [14]:
# Work on an independent copy so later edits do not touch the filtered frame
cleaned_dataset = (
    observation_mushroom_raw_no_duplicates_columns_cleaned_renamed_only_fungi_no_null_sorted_rank
    .copy(deep=True)
)

In [15]:
# Drop the 'kingdom' and 'rank' columns, then remove rows that became exact
# duplicates (first occurrence kept).
# The result is rebound instead of mutated in place, and errors="ignore" turns
# the column drop into a no-op when the columns are already gone, so running
# this cell a second time no longer raises KeyError.
cleaned_dataset = (
    cleaned_dataset
    .drop(columns=["kingdom", "rank"], errors="ignore")
    .drop_duplicates(keep="first")
)

In [None]:
# Write the cleaned table to disk without the index column
cleaned_dataset.to_csv(
    path_or_buf="../../storage/datas/csv/clean/cleaned_dataset.csv",
    index=False,
)

## 2 - Get image data

### Lib

In [30]:
import os

import cv2
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

### Vars

In [25]:
# File URL
clean_csv_url = "../../storage/datas/csv/clean/cleaned_dataset.csv"
# Img URL: folder that holds the image files. Set the MUSHROOM_IMG_DIR
# environment variable to point at another location; when it is unset the
# original folder is used.
img_url = os.environ.get("MUSHROOM_IMG_DIR", "/home/guillaume/Téléchargements/mushroom-dataset/")

# Load CSV as DF
cleaned_dataset = pd.read_csv(clean_csv_url, low_memory=False)

# Change path to img in the dataset.
# os.path.join inserts a separator only when img_url does not already end with one.
cleaned_dataset['image_lien'] = cleaned_dataset['image_lien'].apply(lambda x: os.path.join(img_url, x))

In [43]:
# Preview the first five rows with the rewritten image paths
cleaned_dataset.head(n=5)

Unnamed: 0,image_lien,label,family,phylum,species,canonicalName,class,genus,order
0,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria polymorpha,Xylariaceae,Ascomycota,Xylaria polymorpha,Xylaria polymorpha,Sordariomycetes,Xylaria,Xylariales
1,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria magnoliae,Xylariaceae,Ascomycota,Xylaria magnoliae,Xylaria magnoliae,Sordariomycetes,Xylaria,Xylariales
2,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria hypoxylon,Xylariaceae,Ascomycota,Xylaria hypoxylon,Xylaria hypoxylon,Sordariomycetes,Xylaria,Xylariales
3,/home/guillaume/Téléchargements/mushroom-datas...,Xylaria hypoxylon,Xylariaceae,Ascomycota,Xylaria hypoxylon,Xylaria hypoxylon,Sordariomycetes,Xylaria,Xylariales
4,/home/guillaume/Téléchargements/mushroom-datas...,Xeromphalina campanella,Mycenaceae,Basidiomycota,Xeromphalina campanella,Xeromphalina campanella,Agaricomycetes,Xeromphalina,Agaricales


### Img features

### TODO: before computing the image features, use images_found.csv to keep only the images that are actually present in the dataset, then re-save cleaned_dataset.csv

In [46]:
# Load images_found.csv into a DataFrame
image_list = pd.read_csv(
    filepath_or_buffer="../../storage/datas/csv/clean/images_found.csv",
)

In [49]:
# Only the last expression of a cell is rendered, so the shape is what gets shown
image_list.head(n=5)
image_list.shape

(388355, 1)

In [50]:
# Only the last expression of a cell is rendered, so the shape is what gets shown
cleaned_dataset.head(n=5)
cleaned_dataset.shape

(388094, 9)

In [31]:
def addFeaturestoDF(df, image_url_col):
    """
    Adds image features to a DataFrame.

    Every path found in ``image_url_col`` is read with ``cv2.imread`` and
    summarised by its dimensions and per-channel colour means.

    Parameters:
    df (pandas.DataFrame): The DataFrame to which the features will be added.
        It is not modified; any index is supported.
    image_url_col (str): The name of the column in the DataFrame that contains the image URLs or file paths.

    Returns:
    pandas.DataFrame: A new DataFrame holding the columns of ``df`` followed by
        'width', 'height', 'red_color_mean', 'green_color_mean',
        'blue_color_mean' and 'all_color_mean', row-aligned with ``df``.
        Rows whose image could not be read contain NaN in all feature columns.

    """
    feature_columns = [
        'width',
        'height',
        'red_color_mean',
        'green_color_mean',
        'blue_color_mean',
    ]

    def extract_features(img_url):
        """
        Extracts features from an image.

        Parameters:
        img_url (str): The URL or file path of the image.

        Returns:
        dict: A dictionary containing the extracted features:
            - 'width': The width of the image.
            - 'height': The height of the image.
            - 'red_color_mean': The average red channel value of the image.
            - 'green_color_mean': The average green channel value of the image.
            - 'blue_color_mean': The average blue channel value of the image.
            Every value is NaN when the image cannot be read.
        """
        img = cv2.imread(img_url)
        if img is None:
            # cv2.imread signals a missing or undecodable file by returning
            # None rather than raising, so report "no features" for this row.
            return dict.fromkeys(feature_columns, np.nan)

        height, width = img.shape[:2]
        return {
            'width': width,
            'height': height,
            # The channel axis is indexed blue=0, green=1, red=2 (OpenCV order).
            'red_color_mean': np.mean(img[:, :, 2]),
            'green_color_mean': np.mean(img[:, :, 1]),
            'blue_color_mean': np.mean(img[:, :, 0]),
        }

    features_list = [extract_features(filepath) for filepath in df[image_url_col]]

    # Reusing df's index keeps the concat below row-aligned for any index
    # (e.g. after filtering); naming the columns keeps an empty input valid.
    features_df = pd.DataFrame(features_list, index=df.index, columns=feature_columns)
    features_df['all_color_mean'] = (
        features_df['red_color_mean']
        + features_df['green_color_mean']
        + features_df['blue_color_mean']
    ) / 3

    df_with_features = pd.concat([df, features_df], axis=1)
    return df_with_features

In [None]:
# Compute the per-image features and attach them as new columns
cleaned_dataset = addFeaturestoDF(cleaned_dataset, 'image_lien')

In [None]:
# Keep only the images that are more than 200 wide and more than 200 high
cleaned_dataset = cleaned_dataset.loc[
    cleaned_dataset['width'].gt(200) & cleaned_dataset['height'].gt(200)
]

In [None]:
# Save under csv/clean/, the folder the other cells of this notebook read the
# cleaned dataset from and write it to (the "csv/" path segment was missing here).
cleaned_dataset.to_csv('../../storage/datas/csv/clean/cleaned_dataset.csv', index=False)

### Summary

In [None]:
# Write the current cleaned table to disk without the index column
cleaned_dataset.to_csv(
    path_or_buf="../../storage/datas/csv/clean/cleaned_dataset.csv",
    index=False,
)

In [17]:
# Row counts recorded after each cleaning stage, one line per stage
print(
    f"Number of images, raw file: {initial_observation_numbers}",
    f"Number of images, duplicated removed: {observation_numbers_no_duplicated}",
    f"Number of images, only fungi: {nb_imgs_kingdom_is_fungi}",
    f"Number of images, no null values: {number_imgs_no_null}",
    f"Number of images, only species: {nb_imgs_sorted_rank}",
    sep="\n",
)

# Column counts before and after removing the unnecessary columns
print(f"columns number, old: {initial_columns_nb} ; new: {new_columns_nb} \n")

Number of images, raw file: 647623
Number of images, duplicated removed: 647615
Number of images, only fungi: 600439
Number of images, no null values: 398798
Number of images, only species: 388733
columns number, old: 33 ; new: 11 

