# Extract the scale data from the original raw images

We need to be able to convert pixel distances in the raw images into physical distances (nanometres). This function extracts the scale data from the raw images using the Python-tesseract (`pytesseract`) library, which performs OCR (Optical Character Recognition) on the black-and-white scale bar at the bottom of each TEM image.

If `verify` is `True`, the extracted scale data is overlaid on each image and the result is saved into the `./images/scale_bars/verified` folder so that you can check it by eye.

The resulting CSV file is saved to the path you specify in `csv_filename`.

In [1]:
# Build the pixel <-> nm conversion table by reading the scale bar of every raw image.
from gapfinder.scale import extract_scale_conversion_metadata

# Folder holding the raw TEM images to scan.
input_folder = './images/raw_images'
# Folder for the scale-bar images (the `scalebar_filename` entries printed below point here).
output_folder = './images/scale_bars'
# CSV file that receives the conversion metadata; the contour-parameter step reads the same path.
csv_filename = './metadata/image_scale_conversion.csv'

# verify=True also overlays the extracted scale data on the image and saves it under
# ./images/scale_bars/verified for manual checking.
extract_scale_conversion_metadata(input_folder, output_folder, csv_filename, verify=True)

{'filename': './images/raw_images\\2022-1-10_Wild Type_500uE 1 hour_1_13_29k.png', 'scalebar_filename': './images/scale_bars\\2022-1-10_Wild Type_500uE 1 hour_1_13_29k.png', 'scale': 200, 'x0': np.int64(790), 'x1': np.int64(1053), 'scale_pixels': np.int64(263), 'nm_per_pixel': np.float64(0.7604562737642585), 'pixel_per_nm': np.float64(1.315)}
{'filename': './images/raw_images\\2022-1-10_Wild Type_500uE 1 hour_1_16_29k.png', 'scalebar_filename': './images/scale_bars\\2022-1-10_Wild Type_500uE 1 hour_1_16_29k.png', 'scale': 200, 'x0': np.int64(790), 'x1': np.int64(1053), 'scale_pixels': np.int64(263), 'nm_per_pixel': np.float64(0.7604562737642585), 'pixel_per_nm': np.float64(1.315)}
{'filename': './images/raw_images\\2022-1-10_Wild Type_500uE 1 hour_1_25_29k.png', 'scalebar_filename': './images/scale_bars\\2022-1-10_Wild Type_500uE 1 hour_1_25_29k.png', 'scale': 200, 'x0': np.int64(790), 'x1': np.int64(1053), 'scale_pixels': np.int64(263), 'nm_per_pixel': np.float64(0.7604562737642585), 

# Use the conversion data and the contours to calculate membrane/lumen widths, repeat distance, etc.

In [2]:
# Compute the membrane/lumen measurements for every processed image set.
from gapfinder.data_processing import calculate_contour_parameters


# Set this to False if you don't want to clear the existing data files.
# Starting from scratch on every run is usually the easier option.
clear_existing = True

# Root folder for the results.
output_folder = './output'
# Folder containing one sub-folder per threshold setting (e.g. 0_otsuOffset, 1_otsuOffset, ...);
# the per-setting grana_data_membrane.csv / grana_data_lumen.csv files are written there.
contour_base_path = './output/processed_images'
# CSV describing the ROI images.
metadata_filename = './images/roi_images/roi_metadata.csv'
# Scale-conversion CSV produced by the scale-extraction step above.
conversion_df_filename = './metadata/image_scale_conversion.csv'

# Lumen peak thresholds.
# NOTE(review): units are not shown here -- presumably nm after scale conversion; confirm.
min_lumen_width = 0.5
min_lumen_peak_distance = 1

# Membrane peak thresholds (same caveat about units as the lumen thresholds).
min_membrane_width = 0.5
min_membrane_peak_distance = 1

# Membranes with identical widths can be split if they are more than this distance apart.
# NOTE(review): exact splitting rule lives in calculate_contour_parameters -- confirm there.
max_membrane_width = 16.0

calculate_contour_parameters(
    contour_base_path,
    output_folder,
    metadata_filename,
    conversion_df_filename,
    min_lumen_width=min_lumen_width,
    min_lumen_peak_distance=min_lumen_peak_distance,
    min_membrane_width=min_membrane_width,
    min_membrane_peak_distance=min_membrane_peak_distance,
    max_membrane_width=max_membrane_width,
    clear_existing=clear_existing
)

Deleted existing grana data files
Found 13 grana data items for 0_otsuOffset
Saving membrane data to ./output/processed_images/0_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/0_otsuOffset/grana_data_lumen.csv
Found 13 grana data items for 10_otsuOffset
Saving membrane data to ./output/processed_images/10_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/10_otsuOffset/grana_data_lumen.csv
Found 13 grana data items for 11_otsuOffset
Saving membrane data to ./output/processed_images/11_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/11_otsuOffset/grana_data_lumen.csv
Found 13 grana data items for 1_otsuOffset
Saving membrane data to ./output/processed_images/1_otsuOffset/grana_data_membrane.csv
Saving lumen data to ./output/processed_images/1_otsuOffset/grana_data_lumen.csv
Found 13 grana data items for 2_otsuOffset
Saving membrane data to ./output/processed_images/2_otsuOffset/grana_data

# Combine the individual data files

The next step combines all of the individual data files into a few combined CSV files, for ease of use.

In [5]:
# Combine the data from all the images into a few top-level CSV files.
from gapfinder.data_processing import combine_data_files

# Note: base_path and output_folder both point at ./output.
base_path = './output'
# Folder holding the per-threshold sub-folders written by the contour-parameter step.
processed_image_path = './output/processed_images'
# Destination for the combined CSV files; the listing cell below globs this folder.
output_folder = './output'

# TODO: may need to filter out the mini peaks BEFORE splitting the big peaks.
# Currently the pipeline looks for the big peaks, tries to split them, and only then
# filters out the mini peaks; this may need to happen earlier in the process.
combine_data_files(base_path, processed_image_path, output_folder)


In [6]:
# Print the combined CSV files that were written to the output folder.
import glob
import os

# glob.glob returns matches in arbitrary, filesystem-dependent order, so sort
# the result to get the same listing on every run and platform.
output_files = sorted(glob.glob(os.path.join(output_folder, '*.csv')))

print('Destination files:')
for file in output_files:
    print(file)

Destination files:
./output\lumen.csv
./output\membrane.csv
./output\threshold_metadata.csv


# Next steps

Now that we have the combined files, you can use them however you normally would.