In [None]:
# Matplotlib is a very powerful plotting library; this magic enables us to plot within the notebook
%matplotlib widget
# Imports additional packages to interact with your operating system (os),
# computer vision tasks (cv2), to retrieve files/pathnames that match a specified pattern (glob)
import os, cv2, glob
# Imports NumPy package into notebook, essential for scientific computing
import numpy as np
# Imports PlantCV into notebook so that we can conduct plant phenotyping analyses
from plantcv import plantcv as pcv
# Imports PyPlot, which provides us with a MATLAB-like interface
from matplotlib import pyplot as plt
# Imports Pandas which supports R-like dataframes
import pandas as pd

## Locate the file paths of all the data saved out

In [None]:
# Get the current working directory (the folder this notebook is run from)
path = os.getcwd()

# Any file with the .csv file extension in that folder is treated as labeled
# training data. The file "unclassified_bean_data.csv" is written later in this
# notebook for the unlabeled beans, so it is skipped here; otherwise a second
# run of the notebook would train on the unlabeled beans as well.
# The list is sorted so that the order is the same on every run.
csv_filenames = sorted(
    csv_path
    for csv_path in glob.glob(os.path.join(path, "*.csv"))
    if os.path.basename(csv_path) != "unclassified_bean_data.csv"
)

In [None]:
# Display the list of CSV file paths that were found.
# Do you have as many data filenames as expected?
csv_filenames

In [None]:
# Read every CSV file into its own dataframe and collect the dataframes in a
# list called data_list (one entry per file, in the same order as csv_filenames)
data_list = [pd.read_csv(csv_path) for csv_path in csv_filenames]

In [None]:
# Concatenate (combine) all dataframes into a single dataframe.
# The rows of every file are stacked on top of each other, in the order of data_list.
all_data = pd.concat(data_list)

In [None]:
# Data wrangling steps
# a.k.a. reshape the long trait table into the numeric feature matrix used for training

# Traits kept as model features
kept_traits = ['area', 'convex_hull_area', 'solidity',
               'perimeter', 'width', 'height', 'ellipse_major_axis',
               'ellipse_minor_axis', 'ellipse_eccentricity',
               'hue_circular_mean', 'hue_median']
bean_features = all_data.loc[all_data['trait'].isin(kept_traits)]

# Color card chip measurements that travel with every sample
chip_columns = ["median_color_chip_size", "median_color_chip_width"]

# Pivot (transform) from "long" to "wide" format: one row per sample, one column
# per trait. The chip measurements are then moved out of the index into regular
# columns, which leaves 'sample' as the single index.
bean_features_wide = (
    bean_features
    .pivot(index=['sample'] + chip_columns, columns="trait", values="value")
    .reset_index(level=chip_columns)
)

# Scale area measurements by the chip size and length measurements by the chip width.
# NOTE(review): 'height' is kept as a feature but is not scaled here - confirm this is
# intended; the wrangling of the unclassified data further down must stay identical.
for area_trait in ("area", "convex_hull_area"):
    bean_features_wide[area_trait] = (
        bean_features_wide[area_trait] / bean_features_wide["median_color_chip_size"]
    )
for length_trait in ("ellipse_major_axis", "ellipse_minor_axis", "perimeter", "width"):
    bean_features_wide[length_trait] = (
        bean_features_wide[length_trait] / bean_features_wide["median_color_chip_width"]
    )

# The chip measurements are only needed for scaling, not as features
bean_features_wide = bean_features_wide.drop(columns=chip_columns)

# Cast (change data type) to numpy array
np_features = bean_features_wide.to_numpy()

In [None]:
# Investigate the formatted data (one row per sample, one column per kept trait)
bean_features_wide

In [None]:
# Extract the row index of the wide dataframe as a list.
# After the pivot and reset_index steps the index is 'sample', so despite the
# variable name these are the sample names (one per row), not the trait names.
trait_list = bean_features_wide.index.to_list()

In [None]:
# Collect the class label of every sample: the part of the sample name
# in front of the first underscore
labels = np.array([sample_name.split("_")[0] for sample_name in trait_list])

In [None]:
# Investigate the labels (one entry per row of the feature table)
labels

Feed the trait data into the Random Forest Classifier, giving it labeled data to train a model on.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Column names of the wide trait table, kept to label the importance plot
feature_names = bean_features_wide.columns.tolist()

# Training inputs: the trait values of every sample and the matching class labels
X_train, y_train = np_features, labels

# Build the classifier with a fixed random_state and train it on the labeled data
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

In [None]:
# Feature importances of the trained forest
importances = forest.feature_importances_

# Spread of the importances across the individual trees:
# one row per tree, one column per feature, standard deviation taken per column
per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)


In [None]:
# Create a bar plot of the feature importances

# One importance value per feature, labelled with the feature name
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
# yerr=std draws the standard deviation across the trees as an error bar on each
# bar, so the spread computed in the previous cell is actually shown
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()


# STOP 🛑  HERE !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

Below this point is an example of how to use a trained classifier on unlabeled data, collected from a bean scatter image that contains a mix of bean types.

<span style="color:purple">

# Take a bean scatter image and extract traits
    
</span>


In [None]:
# Turn debugging images on
pcv.params.debug = "plot"

# Read image

# Inputs:
#   filename - Image file to be read in (fill in the path to your bean scatter image)
#   mode - How to read in the image; either 'native' (default), 'rgb', 'gray', or 'csv'

# Read in bean scatter image.
# The second return value is stored as img_path rather than path, so that the
# working-directory variable `path` defined near the top of the notebook is not
# overwritten when this cell runs.
img, img_path, filename = pcv.readimage(filename="")


In [None]:
# Convert the RGB image into a grayscale image of the 'b' channel.
# TODO(review): explain here why the 'b' channel is extracted directly.
# Inputs:
#   rgb_img - original image
#   channel - desired colorspace ('l', 'a', or 'b')

gray = pcv.rgb2gray_lab(rgb_img=img, channel="b")


In [None]:
# Threshold the grayscale image with the Otsu method; the result is used as
# the mask in the following steps.
# Inputs:
#   gray_img    = grayscale image created from selected colorspace

auto_mask = pcv.threshold.otsu(gray_img=gray)

<span style="color:purple">

# Set Region Of Interest
    
</span>


In [None]:
# Define a rectangular region of interest (ROI).
# Inputs:
#   img         = RGB or grayscale image for plotting
#   x           = x coordinate of the ROI
#   y           = y coordinate of the ROI
#   h           = height of the ROI to get drawn
#   w           = width of the ROI to get drawn
# NOTE(review): confirm in the PlantCV documentation which point of the
# rectangle x and y refer to. The values below are specific to one image size.


roi = pcv.roi.rectangle(img=img, x=100, y=2000, h=1900, w=2800)


In [None]:
# Filter the mask with the region of interest defined in the previous step.
# Inputs:
#   mask         = Binary image
#   roi          = Region of interest, defined in an upstream step
#   roi_type     = 'cutto', 'partial' (for partially inside, default), or
#                 'largest' (keep only the largest contour)

filtered_mask = pcv.roi.filter(mask=auto_mask, roi=roi, roi_type="partial")


In [None]:
# Text size and thickness used for annotations; the same two settings are reused
# further down when the predicted class is written onto the image.
pcv.params.text_size = 5
pcv.params.text_thickness = 5

# Visualize the sizes of the objects in the filtered mask.
# Inputs:
#   img         = image to annotate (here the image that was read in above)
#   mask        = None (default), or mask
#   num_objects = Optional parameter to limit the number of objects that will get annotated.

sizes = pcv.visualize.obj_sizes(img=img, mask=filtered_mask, num_objects=50)


In [None]:
# Remove small objects from the mask. The object sizes shown by the previous
# cell can help with choosing the size value.
# Inputs:
#   bin_img - binary mask image
#   size - maximum size for objects that should be filled in as background (non-plant) pixels
fill = pcv.fill(bin_img=filtered_mask, size=1000)
#                                            ^
#                                           |
#                                 change this value if needed


In [None]:
# Flood fill: fill the holes in the mask produced by the previous step

# Inputs:
#   bin_img - binary mask image

clean_mask = pcv.fill_holes(bin_img=fill)


In [None]:
# Create a labeled mask from the cleaned mask. The two results (labeled_mask and
# num) are passed on as labeled_mask and n_labels to the analysis steps below.
# Inputs:
#    mask            = mask image
#    rois            = (Optional) list of multiple ROIs (from roi.multi or roi.auto_grid)
#    roi_type        = (Optional) type of filtering, either 'partial' (for partially inside, default),
#                       'cutto' (hard cut at boundary), 'largest' (keep only the largest contour)

labeled_mask, num = pcv.create_labels(mask=clean_mask)


In [None]:
# Extract size traits for every labeled object

# Inputs:
#   img          = RGB image for debugging
#   labeled_mask = Grayscale mask with unique pixel value per object of interest
#   n_labels     = Total number expected individual objects (default = 1).
#   label        = Modifies the variable name of observations recorded (default = "default").

shape_img = pcv.analyze.size(img=img, labeled_mask=labeled_mask, n_labels=num, label="default")


In [None]:
# Detect the color card in the image. The result is used in the next cell as the
# mask for extracting the color matrix of the image.
#
# Parameters
# ----------
# rgb_img : numpy.ndarray
#     Input RGB image data containing a color card.
# label : str, optional
#     modifies the variable name of observations recorded (default = pcv.params.sample_label).
# **kwargs
#     Other keyword arguments passed to cv2.adaptiveThreshold and cv2.circle.

#     Valid keyword arguments:
#     adaptive_method: 0 (mean) or 1 (Gaussian) (default = 1)
#     block_size: int (default = 51)
#     radius: int (default = 20)

cc_mask = pcv.transform.detect_color_card(rgb_img=img)

In [None]:
# These functions extract color matrices for the image and a standard set of values.
# Only the second return value of get_color_matrix is kept (as color_mat).
# NOTE(review): pos=3 presumably describes how the color card is positioned in
# the image - confirm the right value for your images in the PlantCV documentation.
_, color_mat = pcv.transform.get_color_matrix(rgb_img=img, mask=cc_mask)
std_mat = pcv.transform.std_color_matrix(pos=3)

In [None]:
# Color correct the image, using the color matrix of the image as the source
# and the standard color matrix as the target. The corrected image (cc_img) is
# the one used for the color analysis below.
cc_img = pcv.transform.affine_color_correction(rgb_img=img,
                                               source_matrix=color_mat,
                                               target_matrix=std_mat)

In [None]:
# Save the median chip area ("median_color_chip_size") and the median chip width
# ("median_color_chip_width") as metadata. Both values are read from the
# observations recorded under "default".
for chip_term in ("median_color_chip_size", "median_color_chip_width"):
    pcv.outputs.add_metadata(term=chip_term,
                             datatype=float,
                             value=pcv.outputs.observations["default"][chip_term]["value"])

In [None]:
# Extract color traits from each labeled object, using the color corrected image

# Inputs:
#   rgb_img      = RGB image (here the color corrected image)
#   labeled_mask = Grayscale mask with unique pixel value per object of interest
#   n_labels     = Total number expected individual objects (default = 1).
#   colorspaces  = 'all', 'rgb', 'lab', or 'hsv' (default = 'hsv').
#   label        = Modifies the variable name of observations recorded (default = "default").

color_img = pcv.analyze.color(rgb_img=cc_img, labeled_mask=labeled_mask, n_labels=num, label="default")


In [None]:
# Save out unclassified bean trait data as a CSV file; the next cell reads it back in.
# NOTE(review): the file name is a relative path, so the file presumably ends up in
# the working directory. The training data at the top of the notebook is collected
# with a "*.csv" pattern in the working directory - confirm that this file is not
# picked up as training data when the notebook is run again.
pcv.outputs.save_results("unclassified_bean_data.csv", "csv")


In [None]:
# Read in the trait data saved for the unclassified beans and shape it in the
# same way as the training data
f2 = "unclassified_bean_data.csv"
df2 = pd.read_csv(f2)

# Traits kept as model features
kept_traits2 = ['area', 'convex_hull_area', 'solidity',
                'perimeter', 'width', 'height', 'ellipse_major_axis',
                'ellipse_minor_axis', 'ellipse_eccentricity',
                'hue_circular_mean', 'hue_median']
bean_features2 = df2.loc[df2['trait'].isin(kept_traits2)]

# Color card chip measurements that travel with every sample
chip_columns2 = ["median_color_chip_size", "median_color_chip_width"]

# Pivot from "long" to "wide" format (one row per sample, one column per trait),
# then move the chip measurements out of the index so that 'sample' is the
# single index
bean_features_wide2 = (
    bean_features2
    .pivot(index=['sample'] + chip_columns2, columns="trait", values="value")
    .reset_index(level=chip_columns2)
)

# Scale area measurements by the chip size and length measurements by the chip width.
# NOTE(review): 'height' is kept as a feature but is not scaled here - confirm this is
# intended; it must stay identical to the wrangling of the training data.
for area_trait in ("area", "convex_hull_area"):
    bean_features_wide2[area_trait] = (
        bean_features_wide2[area_trait] / bean_features_wide2["median_color_chip_size"]
    )
for length_trait in ("ellipse_major_axis", "ellipse_minor_axis", "perimeter", "width"):
    bean_features_wide2[length_trait] = (
        bean_features_wide2[length_trait] / bean_features_wide2["median_color_chip_width"]
    )

# Remove the chip size features
bean_features_wide2 = bean_features_wide2.drop(columns=chip_columns2)

# Cast to numpy array
np_features2 = bean_features_wide2.to_numpy()

# Extract the row index as a list (these are the sample names)
trait_list2 = bean_features_wide2.index.to_list()

In [None]:
# Then predict instead of forest.fit.
# The columns are selected by name in the order used for training (feature_names),
# so the model never receives the traits in a different order than it was trained
# on; a trait that is missing from the new data raises a KeyError naming it.
X_class = bean_features_wide2[feature_names].to_numpy()

# One predicted class label per row (sample) of X_class
classifier = forest.predict(X_class)

In [None]:
# Investigate the predictions (the output of forest.predict, one entry per row of X_class)
classifier

In [None]:
# Combine the predictions with PlantCV data that marks the location of each bean in the image

# Start with one row per sample holding its predicted class
classes = pd.DataFrame({"sample": bean_features_wide2.index.tolist(), "class": classifier.tolist()})

# Attach the center of mass: first the "x" rows (stored as cmx), then the "y" rows
# (stored as cmy). Each merge joins on the columns the two tables have in common.
for axis_label, coordinate_column in (("x", "cmx"), ("y", "cmy")):
    center_rows = df2.loc[(df2["trait"] == "center_of_mass") & (df2["label"] == axis_label)]
    classes = (
        classes
        .merge(center_rows)
        .drop(["trait", "label"], axis=1)
        .rename({"value": coordinate_column}, axis=1)
    )

# Label the bean class next to each bean on a copy of the image
outimg = img.copy()
for _, bean in classes.iterrows():
    text_origin = (int(bean["cmx"]), int(bean["cmy"]))
    cv2.putText(img=outimg, text=bean["class"], org=text_origin,
                fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=pcv.params.text_size,
                color=(255, 255, 255), thickness=pcv.params.text_thickness)
pcv.plot_image(outimg)

In [None]:
# Show a table of the probability each bean belongs to each category/class:
# one row per sample (same row order as X_class) and one column per class
# known to the trained forest
pd.DataFrame(forest.predict_proba(X_class),
             index=bean_features_wide2.index,
             columns=forest.classes_)