#PIPELINE to use classifier during the Cruise <br>
1-Convert roi to png (you will need pyifcb environment for this step)<br>
2-Make csv files with the png image paths<br>
3-Process the images for tensorflow classifier <br>
4-Load and apply the classifier<br>
5-Plot 20 random images and predicted labels with and without detritus<br>
6-Merge the classified csv files for a day <br>
7-Summarize the counts of groups per day 


In [None]:
# You will need a Python 3.10.12 environment to install the pyifcb package; it doesn't work with newer versions
pip install git+https://github.com/joefutrelle/pyifcb

In [None]:
#STEP 1 ROI TO PNG CONVERSION
import os
from PIL import Image
import glob
import ifcb  # Assuming ifcb is the correct module for opening .roi files
import numpy as np

# STEP 1 script: export every image stored in each selected .roi file to its
# own PNG, writing one output folder per .roi file next to the raw data.

# Directory containing the .roi files
directory_path = r"D:\Dyson-June2024-Data"

# Define the start and end filenames for the range you want to process
start_file = "D20240622T000637_IFCB165.roi"
end_file = "D20240622T062658_IFCB165.roi"

# Find all .roi files in the directory
all_roi_files = glob.glob(os.path.join(directory_path, '*.roi'))

# Keep the files whose names fall inside the inclusive [start_file, end_file]
# range. glob returns entries in arbitrary order, so sort them to make the
# processing order (and the log output) reproducible between runs.
filtered_roi_files = sorted(
    file for file in all_roi_files
    if start_file <= os.path.basename(file) <= end_file
)

# Iterate over each filtered .roi file
for roi_file in filtered_roi_files:
    try:
        # The .roi filename without its extension names the output folder
        base_filename = os.path.splitext(os.path.basename(roi_file))[0]
        new_folder_path = os.path.join(directory_path, base_filename)
        os.makedirs(new_folder_path, exist_ok=True)

        # Open the .roi file to access the image data
        with ifcb.open_raw(roi_file) as sample_bin:
            # Output files are numbered from 1 in the order the images are yielded
            for index, (image_name, image_data) in enumerate(sample_bin.images.items(), start=1):
                # Non-integer image data is rescaled to the 0-255 range of uint8
                if not np.issubdtype(image_data.dtype, np.integer):
                    peak = np.max(image_data)
                    if peak > 0:
                        image_data = (255 * (image_data / peak)).astype(np.uint8)
                    else:
                        # Dividing by a zero (or non-positive) maximum would
                        # yield NaN/inf and an undefined uint8 cast, so such
                        # an image is written as all black instead.
                        image_data = np.zeros(image_data.shape, dtype=np.uint8)

                # Convert the image data to a PIL Image object
                img = Image.fromarray(image_data)

                # e.g. <base_filename>.00001.png
                filename = f"{base_filename}.{index:05}.png"
                output_path = os.path.join(new_folder_path, filename)

                # Save, then release the image right away to keep memory flat
                img.save(output_path)
                img.close()
                del img

        print(f"Processed and saved images for: {roi_file}")

    except Exception as e:
        # A failure in one file is reported and the remaining files are still processed
        print(f"Error processing {roi_file}: {e}")

# Process complete
print("Processing complete for all files.")


In [None]:
#2- MAKE THE CSV FILES
#Parent directory, make csv file of each image folder 
import os
import pandas as pd

# STEP 2 script: for every sub-folder of parent_dir, write a one-column CSV
# ("Image_Path") listing the full path of each PNG found in that folder.
parent_dir = r"D:\Dyson-June2024-Data"

for folder_name in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, folder_name)

    # Entries that are not directories are ignored
    if not os.path.isdir(folder_path):
        continue

    # Full paths of the PNG files directly inside this folder
    image_paths = [
        os.path.join(folder_path, filename)
        for filename in os.listdir(folder_path)
        if filename.endswith(".png")
    ]

    # The CSV is stored inside the folder it describes and named after it
    csv_path = os.path.join(folder_path, f"{folder_name}_Image_Path.csv")
    df = pd.DataFrame({'Image_Path': image_paths})
    df.to_csv(csv_path, index=False)

    print(f"Image paths for folder '{folder_name}' saved to {csv_path}")


In [None]:
#3.1- PREPROCESS THE IMAGES DEFINE FUNCTIONS
#Preprocess the image and saving function
import os
import cv2
import numpy as np

def preprocess_input(image, fixed_size=128):
    """Resize an image to fit a square canvas and pad the rest with black.

    The longest side is scaled to ``fixed_size`` pixels while keeping the
    aspect ratio; the shorter side is then centred by adding constant black
    borders so the result is square.

    Args:
        image: Array whose first two axes are (height, width), as read from
            ``image.shape[:2]``.
        fixed_size: Side length, in pixels, of the square output. Defaults
            to 128, the value that was previously hard-coded.

    Returns:
        The resized and padded image, ``fixed_size`` x ``fixed_size`` pixels.
    """
    height, width = image.shape[:2]

    # The ratio needed to make the longest side of the image fixed_size pixels
    ratio = float(fixed_size) / max(height, width)

    # int() truncates, so a very elongated image (e.g. 1 x 300) would get a
    # 0-pixel short side, which cv2.resize rejects. Clamp each side to 1 px.
    new_height = max(1, int(height * ratio))
    new_width = max(1, int(width * ratio))

    # cv2.resize takes the target size as (width, height)
    img = cv2.resize(image, (new_width, new_height))

    # Split the missing pixels between the two sides; when the amount is odd
    # the extra pixel goes to the bottom / right border.
    delta_w = fixed_size - new_width
    delta_h = fixed_size - new_height
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)

    # Pad with a constant black border up to fixed_size x fixed_size
    color = [0, 0, 0]  # RGB = 0,0,0 -> Black
    rescaled_image = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)

    return rescaled_image

def process_and_save_images(folder_path, destination_folder):
    """Preprocess every PNG in a folder and store each result as a .npy file.

    A folder named ``<basename of folder_path>_processed`` is created inside
    ``destination_folder``. Each ``.png`` file directly inside ``folder_path``
    is read with OpenCV, passed through ``preprocess_input``, converted to
    grayscale and saved there under the same name with a ``.npy`` extension.
    Files that OpenCV cannot read are reported and skipped.

    Args:
        folder_path: Folder containing the source PNG images.
        destination_folder: Folder in which the ``*_processed`` output folder
            is created.

    Returns:
        None. The results are written to disk.
    """
    # Create a new destination folder for the processed images
    processed_folder_name = os.path.basename(folder_path) + "_processed"
    processed_folder_path = os.path.join(destination_folder, processed_folder_name)
    os.makedirs(processed_folder_path, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith(".png"):  # Assuming images are in PNG format
            continue

        img_path = os.path.join(folder_path, filename)

        # cv2.imread returns None instead of raising when a file cannot be read
        image = cv2.imread(img_path)
        if image is None:
            print(f"Warning: Could not load image {img_path}. Skipping this file.")
            continue

        # Resize/pad first, then collapse the colour channels to grayscale
        processed_image = preprocess_input(image)
        processed_gray = cv2.cvtColor(processed_image, cv2.COLOR_BGR2GRAY)

        # Swap only the extension: str.replace would also rewrite any other
        # ".png" occurring earlier in the file name.
        stem = os.path.splitext(filename)[0]
        npy_path = os.path.join(processed_folder_path, stem + '.npy')
        np.save(npy_path, processed_gray)

    print(f"Processed images saved as .npy files to {processed_folder_path}")





In [None]:
#3.2 ITERATE THROUGH FOLDERS AND APPLY PREPROCESSING (THIS CAN TAKE TIME)
# STEP 3.2 script: run process_and_save_images on every image folder found
# in parent_dir, collecting all outputs under one destination folder.

# Define the parent directory containing the folders
parent_dir = r"D:\EndSeason-DataDump22\DY22_06_IFCB\images_2022\D202205"

# Create the new destination directory for all processed folders
destination_folder = os.path.join(parent_dir, "D202205_class_processed")
os.makedirs(destination_folder, exist_ok=True)

# Iterate through each folder in the parent directory
for folder_name in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, folder_name)

    # destination_folder is created inside parent_dir, so it appears in this
    # listing too. Skip it: it holds outputs, not source images, and
    # processing it would only create an empty "<name>_processed" folder.
    if os.path.normcase(folder_path) == os.path.normcase(destination_folder):
        continue

    # Ensure the path is a directory (i.e., a folder)
    if os.path.isdir(folder_path):
        print(f"Processing folder: {folder_name}")
        process_and_save_images(folder_path, destination_folder)


In [None]:
#4.1. APPLYING THE SAME MODEL FOR STUCK IMAGES, FOR THE DATA AFTER 09/10/2024 <br>
# (DY2410_model_v6 is the same model as 1--natural_epcoh_model_v5; it just introduces the blurry images stuck in the flow cell as detritus and some tintinnid images as ciliate)

#3. LOAD THE MODEL AND MAP THE LABELS
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import load_model

# STEP 4.1 script: load the trained classifier and build the encoder used to
# turn predicted class indices back into label names.

# Load the pre-trained model
model_path = r"C:\Users\kurta\OneDrive - UW\Desktop\During_Fall24_Cruise\training_folder\DY2024_model_v6.h5"
model = load_model(model_path)

# Label name -> class index.
# Updated class mapping with 'Corethron' added #THIS WILL CHANGE FOR EACH CLASSIFIER
class_mapping = {
    'Chaetoceros': 0,
    'Ciliate': 1,
    'Corethron': 2,
    'Coscinodiscus': 3,
    'Cryptophyte': 4,
    'Detritus': 5,
    'Dictyocha': 6,
    'Dinoflagellate': 7,
    'Elongated': 8,
    'Euglena': 9,
    'Nanoflagellate': 10,
    'Pennate': 11,
    'Thalassiosira': 12
}

# Initialize the LabelEncoder and fit it with the classes
label_encoder = LabelEncoder()
label_encoder.fit(list(class_mapping.keys()))

# LabelEncoder never sees the integers above: it numbers the labels in sorted
# order. That only agrees with class_mapping while the mapping itself is
# 0..N-1 in alphabetical order, so fail loudly if an edited mapping breaks
# that assumption instead of silently attaching wrong names to predictions.
labels_by_index = sorted(class_mapping, key=class_mapping.get)
if (
    sorted(class_mapping.values()) != list(range(len(class_mapping)))
    or list(label_encoder.classes_) != labels_by_index
):
    raise ValueError(
        "class_mapping indices do not match the sorted label order used by "
        f"LabelEncoder: mapping order is {labels_by_index}, "
        f"encoder order is {list(label_encoder.classes_)}"
    )

In [None]:
#4.2 APPLY CLASSIFIER MODEL
#3.2a second working version
import os
import numpy as np
import pandas as pd

# STEP 4.2 script: classify the .npy images of every "*_processed" folder
# found directly inside parent_dir and write one result CSV per folder.
# NOTE(review): parent_dir, model and label_encoder come from earlier cells;
# parent_dir must point at the directory that holds the "*_processed" folders.

# All result tables are collected in one sub-folder of parent_dir
csv_output_folder = os.path.join(parent_dir, "classification_results")
os.makedirs(csv_output_folder, exist_ok=True)

for folder_name in os.listdir(parent_dir):
    folder_path = os.path.join(parent_dir, folder_name)

    # Only directories whose name ends in "_processed" are classified
    if not (os.path.isdir(folder_path) and folder_name.endswith("_processed")):
        continue

    print(f"Processing folder: {folder_name}")

    # Parallel lists: one loaded array and its source path per readable file
    image_data = []
    image_paths = []

    for filename in os.listdir(folder_path):
        if not filename.endswith(".npy"):
            continue

        npy_path = os.path.join(folder_path, filename)
        try:
            image = np.load(npy_path)
            # Add a trailing channel axis and a leading batch axis
            image = np.expand_dims(np.expand_dims(image, axis=-1), axis=0)
            image_data.append(image)
            image_paths.append(npy_path)
        except Exception as e:
            print(f"Error loading {npy_path}: {e}")

    # Nothing could be loaded from this folder
    if not image_data:
        print(f"No valid .npy files found in folder: {folder_name}. Skipping...")
        continue

    # Stack the single-image batches along the batch axis
    try:
        image_data = np.vstack(image_data)
    except ValueError as ve:
        print(f"Error concatenating images in folder {folder_name}: {ve}")
        continue

    # Highest-scoring class index per image, then back to the label name
    predictions = model.predict(image_data)
    predicted_classes = np.argmax(predictions, axis=1)
    predicted_labels = label_encoder.inverse_transform(predicted_classes)

    results_df = pd.DataFrame({
        'Image_Path': image_paths,
        'Predicted_Label': predicted_labels
    })

    # One CSV per processed folder, named after that folder
    output_csv_path = os.path.join(csv_output_folder, f"{folder_name}_classification_results.csv")
    results_df.to_csv(output_csv_path, index=False)

    print(f"Classification results saved to {output_csv_path}")

# Process complete
print("Processing complete for all folders.")


In [None]:
#5.1 ITERATE  THROUGH the CLASSIFIED FILES AND PLOT RANDOM 20 IMAGES
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# STEP 5.1 script: for every classification result CSV, plot up to 20
# randomly chosen images with their predicted label and save the figure as a
# PNG next to the CSV.

# Define the folder containing the classification results CSV files
classification_folder = r"D:\DY2410\D20240909\D20241017_processed\classification_results"

# Iterate through each CSV file in the classification folder
for csv_file in os.listdir(classification_folder):
    if not csv_file.endswith("_classification_results.csv"):
        continue

    # Construct the full path to the CSV file and read it
    csv_path = os.path.join(classification_folder, csv_file)
    results_df = pd.read_csv(csv_path)

    # Nothing to plot for a CSV without rows
    if results_df.empty:
        print(f"No images listed in {csv_file}. Skipping...")
        continue

    # DataFrame.sample raises when n exceeds the number of rows, so clamp n
    # for samples holding fewer than 20 images. random_state keeps the
    # selection reproducible.
    random_sample = results_df.sample(n=min(20, len(results_df)), random_state=42)

    # Create the plot
    plt.figure(figsize=(25, 10))

    for i, (index, row) in enumerate(random_sample.iterrows()):
        # Get the image path and predicted label
        image_path = row['Image_Path']
        predicted_label = row['Predicted_Label']

        # Load the image from the .npy file
        image = np.load(image_path)

        # Plot the image
        plt.subplot(4, 5, i + 1)  # 4 rows, 5 columns grid
        plt.imshow(image, cmap='gray')  # Assuming the image is grayscale
        plt.title(f"Predicted: {predicted_label}")
        plt.axis('off')

    plt.tight_layout()

    # Extract the base name (without the "_processed_classification_results.csv" part)
    base_name = csv_file.split("_processed_classification_results.csv")[0]

    # Define the path to save the PNG file
    output_png_path = os.path.join(classification_folder, f"{base_name}.png")

    # Save the plot as a PNG file
    plt.savefig(output_png_path)
    plt.close()  # Close the plot to free up memory

    print(f"Random 20 images plot saved as {output_png_path}")


In [None]:
#5.2 Exclude detritus images from the plots
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# STEP 5.2 script: same overview plot as step 5.1, but images predicted as
# "Detritus" are removed before sampling; figures get a "_non_det" suffix.
classification_folder = r"D:\DY2410\D20240922\D20241022_processed\classification_results"

for csv_file in os.listdir(classification_folder):
    # Only the per-sample classification tables are of interest
    if not csv_file.endswith("_classification_results.csv"):
        continue

    csv_path = os.path.join(classification_folder, csv_file)
    results_df = pd.read_csv(csv_path)

    # Keep every row whose predicted label is something other than Detritus
    keep_mask = results_df['Predicted_Label'] != "Detritus"
    filtered_df = results_df[keep_mask]

    if filtered_df.empty:
        print(f"No non-Detritus images found in {csv_file}. Skipping...")
        continue

    # Up to 20 rows; the fixed random_state makes the selection reproducible
    sample_size = min(20, len(filtered_df))
    random_sample = filtered_df.sample(n=sample_size, random_state=42)

    plt.figure(figsize=(25, 10))

    # Fill a 4 x 5 grid, one panel per sampled image (panels are 1-based)
    for panel, (index, row) in enumerate(random_sample.iterrows(), start=1):
        image_path = row['Image_Path']
        predicted_label = row['Predicted_Label']

        image = np.load(image_path)

        plt.subplot(4, 5, panel)
        plt.imshow(image, cmap='gray')  # Assuming the image is grayscale
        plt.title(f"Predicted: {predicted_label}")
        plt.axis('off')

    plt.tight_layout()

    # Text in front of "_processed_classification_results.csv" names the figure
    base_name = csv_file.split("_processed_classification_results.csv")[0]
    output_png_path = os.path.join(classification_folder, f"{base_name}_non_det.png")

    plt.savefig(output_png_path)
    plt.close()  # Close the plot to free up memory

    print(f"Random 20 non-Detritus images plot saved as {output_png_path}")



In [None]:
#6 Merge all the csv files for a day
import os
import pandas as pd
import glob

# STEP 6 script: merge all per-sample classification CSVs of one day into a
# single DataFrame (merged_df).

# Directory containing the CSV files
csv_directory = r"D:\DY2410\D20240922\D20241022_processed\classification_results"

# Match only the files written by the classification step
# ("<folder>_classification_results.csv") so that an unrelated CSV placed in
# this directory is not merged in; sort for a reproducible row order.
csv_files = sorted(glob.glob(os.path.join(csv_directory, '*_classification_results.csv')))

# pd.concat fails with an unhelpful message on an empty list, so report the
# actual problem instead.
if not csv_files:
    raise FileNotFoundError(
        f"No *_classification_results.csv files found in {csv_directory}"
    )

# One DataFrame per CSV file
df_list = [pd.read_csv(csv_file) for csv_file in csv_files]

# Concatenate all dataframes into one, renumbering the rows from 0
merged_df = pd.concat(df_list, ignore_index=True)



In [None]:
#7 summarize the results
import pandas as pd

# STEP 7 script: count how many images were assigned to each predicted label
# in merged_df (built in step 6) and write the table to a CSV file.

# Where the summary table is written
output_path = r"D:\DY2410\D20240922_summary.csv"

# Number of rows per distinct predicted label
label_summary = merged_df['Predicted_Label'].value_counts()

# Two-column table: the label and how often it occurs
summary_df = label_summary.rename_axis('Predicted_Label').reset_index(name='Count')

summary_df.to_csv(output_path, index=False)

print(f"Summary saved at: {output_path}")

