# Regression Experiment
Fit regression models that predict the characterization metrics from the input parameters (sampledataframe, maestronetlist).

## Load Data

In [1]:
from frgpascal.analysis.processing import load_all
import matplotlib.pyplot as plt
from pathlib import Path
import json # load maestro logs
import pandas as pd
import numpy as np
import cv2
import tensorflow as tf
import sklearn
import os

In [2]:
from natsort import index_natsorted

def load_all_sorted(chardir):
    """
    Load a characterization directory and order both result frames naturally by sample name.

    Parameters:
    - chardir: the characterization directory, handed straight to `load_all`

    Returns:
    - metricdf: the metrics dataframe from `load_all`, rows in natural-sort order of 'name'
    - rawdf: the raw-data dataframe from `load_all`, ordered the same way

    Both dataframes are returned with a fresh 0..n-1 index.
    """

    def in_natural_name_order(frame):
        # index_natsorted gives the permutation that naturally sorts the names;
        # argsort of that permutation is each row's rank, which sort_values uses as the key
        ranked = frame.sort_values(
            by='name',
            key=lambda names: np.argsort(index_natsorted(names)),
        )
        return ranked.reset_index(drop=True)

    # wlmin/wlmax are forwarded to load_all through t_kwargs
    # (presumably a wavelength window -- confirm against load_all)
    metricdf, rawdf = load_all(chardir, t_kwargs={'wlmin': 700, 'wlmax': 900})

    return in_natural_name_order(metricdf), in_natural_name_order(rawdf)

In [3]:
# TODO: clean up this code, perhaps put into separate utilities script


def line_defect_p(img):
    '''
    Estimate how much of an image is covered by straight-line defects.

    Pipeline: grayscale -> Canny edges -> 5x5 dilation -> probabilistic Hough
    transform -> redraw the detected segments on a blank canvas.

    Parameters:
    - img: a 3-channel RGB image, either uint8 in [0,255] or float in [0,1]
      (float values outside [0,1] are clipped before conversion)

    Returns:
    - a float, representing the proportion of the image area taken up by lines
    - a numpy array (float64, same height/width as img), with the detected lines drawn as 255 on 0
    - a numpy array of shape (N, 1, 4) with one (x1, y1, x2, y2) segment per detected line;
      N is 0 when no lines were found
    '''

    # NOTE: this currently does not discriminate between lines and splotches (which are shaped as round-ish fractals)
    # will need to fix up to only select lines

    # if image channels were in float format [0,1], then
    # convert image channels to uint8 format [0,255]
    # manual scaling seems better than cv2.normalize function, which might map a float <1 to 255
    # which seems to produce line artifacts
    if np.issubdtype(img.dtype, np.floating):
        # clip first: values slightly outside [0,1] would otherwise overflow the uint8 cast
        # and show up as spurious dark/bright pixels
        img = np.uint8(255*np.clip(img, 0, 1))
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) # convert brightfield image to grayscale

    # apply canny edge detection
    edges = cv2.Canny(gray, 40, 60, apertureSize=3)
    edges_dilated = cv2.dilate(edges, np.ones((5,5), np.uint8), iterations=1) # dilate to expand the edges and connect lines

    # apply hough lines transform to find lines
    # may need to tune resolution and threshold parameters
    minLineLength = edges_dilated.shape[0]*0.03 # 3% of the image height (shape[0] is the number of rows)
    maxLineGap = 30 # higher means more lines combined together so there are less overall
    lines = cv2.HoughLinesP(edges_dilated, 1, np.pi/180, 100, minLineLength=minLineLength, maxLineGap=maxLineGap)
    if lines is None:
        # keep the same (N, 1, 4) layout HoughLinesP uses so callers can treat both cases alike
        lines = np.zeros((0, 1, 4), dtype=np.int32)

    # now visualize the lines
    line_img = np.zeros(gray.shape)
    for segment in lines:
        # each entry wraps a single [x1, y1, x2, y2] row; hand cv2 plain Python ints
        x1, y1, x2, y2 = (int(v) for v in segment[0])

        # cv2.line draws a line in line_img from the point (x1,y1) to (x2,y2).
        # 255 denotes the colour of the line to be drawn
        cv2.line(line_img, (x1, y1), (x2, y2), color=255, thickness=2)

    # mean of a 0/255 mask divided by 255 == fraction of pixels that belong to a line
    return line_img.sum()/line_img.size/255, line_img, lines


def defect_p(img):
    '''
    Measure how much of an image is covered by line defects and by splotch defects.

    Closed regions are found by drawing the contours of the dilated Canny edges and
    flood-filling the exterior; lines are found with line_defect_p. Each mask is then
    subtracted from the other so that a pixel is not counted as both kinds of defect.

    Parameters:
    - img: a 3-channel RGB image, either uint8 in [0,255] or float in [0,1]
      (float values outside [0,1] are clipped before conversion)

    Returns:
    - a tuple of floats, representing the proportion of the image area taken up by each defect (line, splotches)
    - a tuple of numpy arrays, representing the image run through defect detection algorithm (line, splotches)
    '''

    # if image channels were in float format [0,1], then
    # convert image channels to uint8 format [0,255]
    # manual scaling seems better than cv2.normalize function, which might map a float <1 to 255
    # which seems to produce line artifacts
    if np.issubdtype(img.dtype, np.floating):
        # clip first: values slightly outside [0,1] would otherwise overflow the uint8 cast
        img = np.uint8(255*np.clip(img, 0, 1))
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) # convert brightfield image to grayscale

    kernel = np.ones((5,5), np.uint8) # shared 5x5 structuring element for every dilate/erode below

    # apply canny edge detection
    edges = cv2.Canny(gray, 40, 60, apertureSize=3)
    edges_dilated = cv2.dilate(edges, kernel, iterations=1) # dilate to expand the edges and connect lines
    # now find the contours
    # findContours returns (contours, hierarchy) or (image, contours, hierarchy) depending on the
    # OpenCV version; the contours are the second-to-last element in both cases
    contours = cv2.findContours(edges_dilated, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    cont_img = np.zeros(edges_dilated.shape, np.uint8)
    cont_img = cv2.drawContours(cont_img, contours, -1, color=255, thickness=3)

    # now perform a floodfill to separate contour exterior from interior
    # anything "outside" a defect is connected, and anything "inside" a defect is connected

    # first add padding to outside so that any lines that bisect the image don't mess up the floodfill
    b = 1 # border padding amount
    cont_img = cv2.copyMakeBorder(cont_img, b, b, b, b, cv2.BORDER_CONSTANT, 0)
    h, w = cont_img.shape
    mask = np.zeros((h+2, w+2), np.uint8) # floodfill mask (floodFill requires it 2 px larger than the image)
    cv2.floodFill(cont_img, mask, (0,0), 123) # fill with grey so it doesn't obscure edges
    cont_img = cv2.inRange(cont_img, 122, 124) # threshold so that anything outside a defect is white and everything else is black
    cont_img = cv2.bitwise_not(cont_img) # now invert colors so that contour interior is white and exterior is black
    cont_img = cont_img[b:-b, b:-b] # now crop to remove border padding

    # now remove lines from the contour image so we're just left with splotches and circles
    remove_lines_img = line_defect_p(cv2.cvtColor(cont_img, cv2.COLOR_GRAY2RGB))[1]
    line_img_dilate = cv2.dilate(remove_lines_img, kernel, iterations=1) # dilate lines so that we're sure we remove them
    cont_img = np.maximum(0, cont_img - line_img_dilate) # subtract out lines and rectify (result is float, not uint8)
    cont_img = cv2.erode(cont_img, kernel, iterations=1) # erode contours to remove any leftover lines
    cont_img = cv2.dilate(cont_img, kernel, iterations=1) # now get back the area in the real contours that was eroded

    # now remove splotches from the line image so it only has lines
    line_img = line_defect_p(img)[1]
    cont_img_dilate = cv2.dilate(cont_img, kernel, iterations=1)
    line_img = np.maximum(0, line_img - cont_img_dilate) # subtract out splotches and rectify

    # compute proportion of image taken up by defects
    # (mean of a 0/255 mask divided by 255 == fraction of defect pixels)
    cont_p = cont_img.sum()/cont_img.size/255
    line_p = line_img.sum()/line_img.size/255

    return (line_p, cont_p), (line_img, cont_img)

### Load Characterization Metrics
This includes the fitted characterization metrics as well as additional metrics that we compute from the images.

In [9]:
# first load output data (characterization metrics)
# TODO: use a tf.data.Dataset to stream data in so we can load more without memory constraints

chardir_0 = ['data/output/20221011_B9-char_1/Characterization']

batch_frames = []  # one metrics dataframe per characterization directory
for chardir in chardir_0:  # deliberately not named `dir`, which would shadow the builtin
    print(f'Loading {chardir}')
    # load data from directory
    mdf, rdf = load_all_sorted(chardir)
    # record what batch it came from
    mdf['batch'] = chardir

    # instead of saving the images, extract metrics from them
    # TODO: also extract defect metrics from the 'df_0' and 'plimg_0' images
    bf_metrics = rdf['bf_0'].apply(lambda img: defect_p(img)[0])
    bf_metrics = pd.DataFrame(bf_metrics.to_list(), columns=['bf_linep_0', 'bf_splotchp_0'])

    # index-aligned join: assumes mdf and rdf list the same samples in the same order
    # (both come back from load_all_sorted with a fresh 0..n-1 index)
    batch_frames.append(mdf.join(bf_metrics))

    del mdf, rdf # explicitly delete to save memory cause HD images use a LOT

# combine all batches with a single concat instead of growing the frame inside the loop
metricdf = pd.concat(batch_frames)

# reset indices so the sample number doesn't interfere with indexing
metricdf = metricdf.reset_index().rename(columns={'index': 'sample_num'})
metricdf

Loading data/output/20221011_B9-char_1/Characterization


  a = -np.log10(t)
  a = -np.log10(t)
Loading data: 100%|██████████| 31/31 [00:16<00:00,  1.87sample/s]


In [29]:
# put the identifying columns in front, followed by the metric columns in alphabetical order
leading_cols = ['batch', 'sample_num', 'name']
# Index.difference returns the remaining labels already sorted
metric_cols = metricdf.columns.difference(leading_cols).tolist()
# plain column selection: no index-aligned join, so rows can never be duplicated or reordered
metricdf = metricdf[leading_cols + metric_cols]
metricdf

Unnamed: 0,batch,sample_num,name,bf_inhomogeneity_0,bf_linep_0,bf_splotchp_0,df_median_0,pl_fwhm_0,pl_intensity_0,pl_peakev_0,t_bandgap_0,t_samplepresent_0
0,data/output/20221011_B9-char_1/Characterization,0,sample0,0.154434,0.001388,0.034574,140.035843,0.096823,174.176938,1.677392,1.670885,True
1,data/output/20221011_B9-char_1/Characterization,1,sample1,0.034818,0.0,0.00345,140.845169,0.098287,175.216607,1.676216,1.669811,True
2,data/output/20221011_B9-char_1/Characterization,2,sample2,0.033021,0.0,0.002436,140.304779,0.097397,253.082279,1.677381,1.669848,True
3,data/output/20221011_B9-char_1/Characterization,3,sample3,0.036609,0.0,0.002884,139.916306,0.097168,183.368298,1.677104,1.664593,True
4,data/output/20221011_B9-char_1/Characterization,4,sample4,0.031844,0.0,0.002175,140.987106,0.096425,279.598707,1.677065,1.669493,True
5,data/output/20221011_B9-char_1/Characterization,5,sample5,0.12381,0.013432,0.019091,140.95224,0.095493,210.086825,1.67696,1.669777,True
6,data/output/20221011_B9-char_1/Characterization,6,sample6,0.032335,0.0,0.002328,141.803909,0.096805,235.359654,1.677107,1.669393,True
7,data/output/20221011_B9-char_1/Characterization,7,sample7,0.034108,0.0,0.002494,141.101654,0.096087,160.327952,1.676423,1.669312,True
8,data/output/20221011_B9-char_1/Characterization,8,sample8,0.052047,0.00477,0.008308,140.71817,0.098563,176.236739,1.676698,1.671236,True
9,data/output/20221011_B9-char_1/Characterization,9,sample9,0.036251,0.0,0.002654,140.060745,0.101346,83.560557,1.677186,1.669862,True
