# Extract the scale data from the original raw images

We need to be able to convert pixel measurements in the raw images to real-world scale values. This function extracts the scale data from the raw images using the `pytesseract` library, which performs OCR (Optical Character Recognition) on the black-and-white scale bar at the bottom of each TEM image.

If `verify` is `True`, the extracted scale data will be displayed on the image and saved into the `./images/scale_bars/verified` folder.

The CSV file will be saved to the filename you specify in `csv_filename`.

In [None]:
from gapfinder.scale import extract_scale_conversion_metadata

# Paths for the scale-extraction step.
csv_filename = "./metadata/image_scale_conversion.csv"  # CSV file the results are saved to
output_folder = "./images/scale_bars"  # base folder for scale-bar output
input_folder = "./images/raw_images"  # raw TEM images to read

# verify=True requests the verification output (extracted scale data shown on
# the image and saved under ./images/scale_bars/verified).
extract_scale_conversion_metadata(
    input_folder,
    output_folder,
    csv_filename,
    verify=True,
)

# Use the conversion data and the contours to calculate membrane/lumen widths, repeat distance, etc.

In [2]:
from gapfinder.data_processing import calculate_contour_parameters

# Input and output locations for the contour measurements.
conversion_df_filename = "./metadata/image_scale_conversion.csv"
metadata_filename = "./images/roi_images/roi_metadata.csv"
contour_base_path = "./output/processed_images"
output_folder = "./output"

# Passed through as clear_existing; the run log reports
# "Deleted existing grana data files" when this is enabled.
clear_existing = True

# Minimum peak width and minimum peak-to-peak distance for each structure.
# NOTE(review): units are not stated here (presumably pixels) -- confirm
# against calculate_contour_parameters.
min_membrane_width, min_membrane_peak_distance = 0.5, 1
min_lumen_width, min_lumen_peak_distance = 0.5, 1

# Membranes with identical width can be split if they are more than this
# distance apart.
max_membrane_width = 16.0

calculate_contour_parameters(
    contour_base_path,
    output_folder,
    metadata_filename,
    conversion_df_filename,
    # lumen thresholds
    min_lumen_width=min_lumen_width,
    min_lumen_peak_distance=min_lumen_peak_distance,
    # membrane thresholds
    min_membrane_width=min_membrane_width,
    min_membrane_peak_distance=min_membrane_peak_distance,
    max_membrane_width=max_membrane_width,
    # housekeeping
    clear_existing=clear_existing,
)

Deleted existing grana data files
Found 42 grana data items for 0_otsuOffset
Saving membrane data to ./output/processed_images/0_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/0_otsuOffset/grana_data_lumen.csv
Found 42 grana data items for 10_otsuOffset
Saving membrane data to ./output/processed_images/10_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/10_otsuOffset/grana_data_lumen.csv
Found 42 grana data items for 11_otsuOffset
Saving membrane data to ./output/processed_images/11_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/11_otsuOffset/grana_data_lumen.csv
Found 42 grana data items for 1_otsuOffset
Saving membrane data to ./output/processed_images/1_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/1_otsuOffset/grana_data_lumen.csv
Found 42 grana data items for 2_otsuOffset
Saving membrane data to ./output/processed_images/2_otsuOffset/grana_data

In [2]:
# combine the data from all the images
from gapfinder.data_processing import combine_data_files

# Folders handed to the combine step; base_path and output_folder both point
# at the same directory.
base_path = output_folder = "./output"
processed_image_path = "./output/processed_images"

# TODO: the mini peaks may need to be filtered out BEFORE the big peaks are
# split. At present the pipeline looks for the big peaks, tries to split them,
# and only then filters out the mini peaks; the filtering may belong earlier
# in the process.
combine_data_files(
    base_path,
    processed_image_path,
    output_folder,
)

process_names: ['0_otsuOffset', '10_otsuOffset', '11_otsuOffset', '1_otsuOffset', '2_otsuOffset', '3_otsuOffset', '4_otsuOffset', '5_otsuOffset', '6_otsuOffset', '7_otsuOffset', '8_otsuOffset', '9_otsuOffset']
First few entries of 'lumen_width':
0    1.833333
1    1.000000
2    4.237537
3    5.373866
4    6.101626
Name: lumen_width, dtype: float64

Data types in 'lumen_width' column:
[<class 'float'>]

Number of NaN values in 'lumen_width': 0

Data type of 'lumen_width' after conversion:
float64

First few results:
   lumen_width  nm_per_px  lumen_width_nm
0     1.833333   0.760456        1.394170
1     1.000000   0.760456        0.760456
2     4.237537   0.760456        3.222461
3     5.373866   0.760456        4.086590
4     6.101626   0.760456        4.640020
lumen_df shape: (4197, 16)
membrane_df shape: (4658, 17)
saved data: ./output/lumen.csv and ./output/membrane.csv


# create the 