In this notebook we will use boto3 to load each image from the bucket and store its metadata. We will create an inventory file which contains some useful properties of the images (such as dimensions, size, etc.). This notebook should be run from an EC2 instance; otherwise one will be charged for data egress. Ideally, we should find a solution which does not require downloading the files, but only extracts the metadata.

In [1]:
import boto3

In [2]:
import pandas as pd

In [3]:
import os

In [4]:
import tempfile

In [5]:
from skimage.external import tifffile

In [6]:
import json

In [7]:
session = boto3.Session(profile_name='default')
s3_resource = session.resource('s3')

In [8]:
cornea_bucket = s3_resource.Bucket('cornea-bucket')

In [9]:
# get a collection of all objects in the bucket
# (the loops below iterate over it and read each object's name from its .key attribute)
files = cornea_bucket.objects.all()

In [8]:
# local system names
# TEMP_UPLOAD_SPACE: directory in which the temporary downloads are created
TEMP_UPLOAD_SPACE = '/home/ubuntu/test_mount'
# METADATA: folder which receives the per-file metadata dumps and the inventory
METADATA = os.path.join('data','metadata')
INVENTORY_FILE = os.path.join(METADATA,'inventory.csv')

# makedirs also creates the parent 'data' folder when it is missing (os.mkdir would fail),
# and exist_ok makes the cell safe to re-run
os.makedirs(METADATA, exist_ok=True)

In [11]:
# a function to extract metadata from a tif file

def read_metadata(filename):
    """
    read_metadata extracts metadata from a tif file and stores it into a dictionary.
    
    Each tag becomes a key in the dictionary, with its value converted to a string.
    All pages are written into the same dictionary, so a tag that occurs on
    several pages keeps the value of the last page.
    
    Sometimes the image_description contains extra information in separate
    `key=value` lines. Those are split into their own dictionary fields
    (see _parse_image_description).
    The formats of the important fields are converted from strings to the corresponding types.
    
    images,channels,slices: integer
    hyperstack: boolean
    strip_byte_counts: renamed to sizeInGB; its first entry is converted from bytes to Gigabytes.
    
    Input: filename, passed unchanged to tifffile.TiffFile
    Output: d dictionary
    
    """
    
    d = {} #dictionary to store the metadata
    
    with tifffile.TiffFile(filename) as tif:
        for page in tif:
            for tag in page.tags.values():
                if tag.name == 'image_description':
                    # the image_description field contains extra imagej info
                    d.update(_parse_image_description(tag.value))
                elif tag.name == 'strip_byte_counts':
                    d['sizeInGB'] = float(tag.value[0]) / 1e9
                else:
                    d[tag.name] = str(tag.value)
    return d


def _parse_image_description(description):
    """
    Split the text of an image_description tag into a dictionary of fields.
    
    Lines without an '=' are skipped (this includes the empty piece after a
    trailing newline). Only the first '=' separates key from value, so a value
    may itself contain '='.
    
    hyperstack is True only for the text 'true', '1' or 'yes' (case-insensitive);
    bool() cannot be used here because every non-empty string, including
    'false', is truthy.
    """
    if isinstance(description, bytes):
        description = description.decode('ASCII')
    
    fields = {}
    # a trailing NUL terminator would otherwise end up in the last value
    for line in description.rstrip('\x00').split('\n'):
        key, separator, value = line.partition('=')
        if not separator:
            continue
        
        # converting the variables to the correct type (not converting all variables)
        if key in ('images', 'channels', 'slices'):
            fields[key] = int(value)
        elif key == 'hyperstack':
            fields[key] = value.strip().lower() in ('true', '1', 'yes')
        else:
            fields[key] = value
    return fields

In [12]:
# a function to extract the metadata from the tif file and convert it to a dataframe row

def process_tif(filename, label):
    """
    process_tif extracts metadata from a tif file and converts it to a one-row dataframe
    
          |image_width | image_length | channels | slices | hyperstack | sizeInGB
    ------|-----------------------------------------------------------------------
    label |            |              |          |        |            |       
    
    if a field is missing in the metadata dictionary for a particular file, the cell is left empty.
    
    The label provided becomes an index to the row.
    
    As a side effect the complete metadata dictionary is dumped as json into the
    METADATA folder. The dump is named after the whole label (folder separators
    replaced by '_', final extension removed), so that files which share a
    basename in different folders (e.g. 'a/Fused.tif' and 'b/Fused.tif') do not
    overwrite each other's dump.
    
    Input: filename - path of the tif file to read
           label    - name used for the row index and for the dump file name
    Output: df pandas dataframe
    
    """
    
    d = read_metadata(filename)
    
    # dump the metadata to a json file
    stem = os.path.splitext(label)[0].replace('/', '_')
    metadata_filename = os.path.join(METADATA, stem + '_metadata.txt')
    with open(metadata_filename, 'w') as file:
        json.dump(d, file)
    
    # create a small dictionary with only a few important fields (None when a field is missing)
    important_keys = ['image_width','image_length','channels','slices', 'hyperstack', 'sizeInGB']
    small_d = {key: d.get(key) for key in important_keys}
    
    # convert dictionary to data frame
    df = pd.DataFrame(small_d, index=[label])
    
    return df


In [13]:
# we will exclude the folders which have tiff sequences
exclude_list = ['Lepto_fovea', 'Lepto_head_lowres', 'Holco_Scan53_fly1']

In [14]:
# list of all tif filenames
tif_files = []
tif_files_done = []

In [None]:
%%time
# a loop which reads through all tif files (except from the sequences)
# TODO: test tif_files_done if connections is inte

for file in files:
    
    # exclude files in the exclude list
    exclude = False
    for name in exclude_list:
        if name in file.key:
            exclude = True
            
    if not exclude:
        if file.key.endswith('tif'):
            tif_files.append(file.key)
            
            # we will store the temporary files a mounted volume which has more space
            with tempfile.NamedTemporaryFile(dir=TEMP_UPLOAD_SPACE) as temp_file:
                if file.key not in tif_files_done:
                    
                    # download files from s3 bucket (should find a way to avoid downloading)
                    cornea_bucket.download_file(file.key, temp_file.name)
                    tif = tifffile.TiffFile(temp_file.name)
                    print(tif.info())
                    
                    # process the metadata
                    df = process_tif(temp_file.name, file.key)
                    if os.path.isfile(INVENTORY_FILE): 
                        df.to_csv(INVENTORY_FILE, mode = 'a', header = None)
                    else:
                        df.to_csv(INVENTORY_FILE)
                        
                    tif_files_done.append(file.key)
             

In [13]:
inventory = pd.read_csv(INVENTORY_FILE, index_col = 0)
inventory[:26]

Unnamed: 0,image_width,image_length,channels,slices,hyperstack,sizeInGB
Droso_Obscura_45_left_eye/ZSeries-05272015-1754-045_001.tif,512,512,,,,2.5e-05
Droso_Obscura_47_right_eye/ZSeries-05272015-1754-047_001.tif,512,512,,,,2.5e-05
Droso_Obscura_49/ZSeries-05272015-1754-049_8bit_Opt_Dobs_Reye.tif,512,512,3.0,737.0,True,1.159201
Fused_ZSeries-11072016-1000-053/Fused_ZSeries-11072016-1000-053.tif,2271,1400,2.0,1071.0,True,13.62055
Good2sort_Scan48_Dmel_lefteye/ZSeries-05272015-1754-048_8bit_Opt_Dmel_Leye.tif,512,512,3.0,670.0,True,1.053819
Good2sort_Scan48_Dmel_righteye/ZSeries-05272015-1754-048_8bit_Opt_Dmel_Reye.tif,512,512,3.0,670.0,True,1.053819
Good2sort_Scan49_Dmel_lefteye/ZSeries-05272015-1754-049_8bit_Opt_Dmel_Leye.tif,512,512,3.0,737.0,True,1.159201
Holco_030/CompositeFused.tif,1266,856,3.0,678.0,True,2.204238
Holco_scan027/Fused.tif,1264,1050,3.0,648.0,True,2.580077
Holco_scan029/Fused.tif,1480,1066,3.0,624.0,True,2.953417


----

Extra stuff below:

In [None]:
%%time
# a loop which reads through the tif files of the sequences only:
# because of the `if exclude:` test below, a key is processed only when it
# contains one of the names in exclude_list
# TODO: test tif_files_done if connections is inte

for file in files:
    
    # flag files whose key contains a name from the exclude list
    exclude = False
    for name in exclude_list:
        if name in file.key:
            exclude = True
            
    if exclude:
        if file.key.endswith('tif'):
            tif_files.append(file.key)
            
            # we will store the temporary files a mounted volume which has more space
            with tempfile.NamedTemporaryFile(dir=TEMP_UPLOAD_SPACE) as temp_file:
                if file.key not in tif_files_done:
                    
                    # download files from s3 bucket (should find a way to avoid downloading)
                    cornea_bucket.download_file(file.key, temp_file.name)
                    # NOTE(review): this TiffFile is never closed explicitly
                    tif = tifffile.TiffFile(temp_file.name)
                    print(tif.info())
                    
                    # process the metadata and append one row to the inventory
                    # (the header is written only when the file does not exist yet)
                    df = process_tif(temp_file.name, file.key)
                    if os.path.isfile(INVENTORY_FILE): 
                        df.to_csv(INVENTORY_FILE, mode = 'a', header = None)
                    else:
                        df.to_csv(INVENTORY_FILE)
                        
                    tif_files_done.append(file.key)
             

In [None]:
# tempfile will not work as this will mess up the structure of the tiff sequence
# what I need is the number 
tifffile.TiffSequence()

In [None]:
# print('s3-amazonaws.com/cornea-bucket/'+test_name)

In [None]:
#tif = open('s3-amazonaws.com/cornea-bucket/'+test_name)
#tif = open('s3://cornea-bucket/'+'home/ubuntu/'+test_name)
tif = tifffile.TiffFile('http://cornea-bucket.s3.amazonaws.com/Scan050/Fused_ZSeries-11062016-1603-050.tif')
#tif = tifffile.TiffFile('s3://cornea-bucket/Scan050/Fused_ZSeries-11062016-1603-050.tif')

In [None]:
tif = tifffile.TiffFile('s3://cornea-bucket/Scan050/Fused_ZSeries-11062016-1603-050.tif')

In [None]:
image.imread('s3://cornea-bucket/Scan050/Fused_ZSeries-11062016-1603-050.tif')

With the scikit-image `imread` function we can read the file directly from the web address, but it takes a long time to read it.

In [None]:
# try using requests: fetch the whole file into memory and let tifffile read it from a buffer
import requests
# BytesIO is imported by name because `io` is bound to skimage.io elsewhere in this notebook
from io import BytesIO

resp = requests.get('http://cornea-bucket.s3.amazonaws.com/Scan050/Fused_ZSeries-11062016-1603-050.tif')
# fail here on an HTTP error instead of handing an error page to tifffile
resp.raise_for_status()
tif = tifffile.TiffFile(BytesIO(resp.content))

# note: seems to work but it takes very long time

In [None]:
from skimage import io
# im = io.imread('http://cornea-bucket.s3.amazonaws.com/Scan050/Fused_ZSeries-11062016-1603-050.tif')

Looking carefully at the imread code, it seems that it creates a temporary file within the reading context. 

With dask imread function we cannot read directly from the web address:

In [None]:
import dask
from dask.array import image

In [None]:
im = image.imread('http://cornea-bucket.s3.amazonaws.com/Scan050/Fused_ZSeries-11062016-1603-050.tif')

What about the new `dask-image` package?

In [None]:
!conda install -c conda-forge --yes dask-image

In [None]:
from dask_image import imread

In [None]:
im = imread.imread('s3://cornea-bucket/Scan050/Fused_ZSeries-11062016-1603-050.tif')

In [None]:
url = s3.generate_presigned_url(ClientMethod='get_object', Params=params)

In [None]:
# NOTE(review): process_tif is defined above with two required arguments (filename, label),
# so this one-argument call raises a TypeError as written - confirm what was intended here.
df = process_tif(file.key)
# appends the row to 'inventory.csv' in the working directory
df.to_csv('inventory.csv',mode = 'a')

In [None]:
exclude = False
for name in exclude_list:
    if name in 'Holco_Scan53_fly1_eye1/Fused_ZSeries-11072016-1000-053_crop8bit0007.tif':
        exclude = True

In [None]:
test_name = 'data/Scan050/Fused_ZSeries-11062016-1603-050.tif'

In [None]:
# process_tif requires both a filename and a label; the local path doubles as the row label here
process_tif(test_name, test_name)

In [None]:
tif = tifffile.TiffFile(test_name)
print(len(tif.series))
for s in tif.series:
    print(s[0])
    
    

In [None]:
with tifffile.TiffFile(test_name) as tif:

    d = {}
    for page in tif:
        for tag in page.tags.values():
            if tag.name == 'image_description':
                for item in tag.value.decode('ASCII').split('\n')[:-1]:
                    key, value = item.split('=')
                    d[key] = value
            else:
                d[tag.name] = tag.value
                    
d

In [None]:
metadata = read_metadata(test_name)
metadata


In [None]:
small_d = {}
for key in ['image_width','image_length','channels','slices']:
    small_d[key] = d[key]

In [None]:
pd.DataFrame(small_d,index=[test_name])

How should I store the metadata? As a csv? The name of the file and its size would be nice to have.

In [None]:
for item in d['image_description'].decode('ASCII').split('\n'):
    key, value = item.split('=')
    d[key] = value
    

In [None]:
# something is wrong with reading 
# A_Fused_ZSeries-11242017-0953-233_Crop.tif
# invalid TIFF file error

In [None]:
# what is rows per strip???
# the number of images is image_description tag within the string

In [None]:
#!pip install exifread #not available in conda

In [None]:
import exifread

# Return Exif tags; the with-block closes the file once the tags have been read
with open(test_name, 'rb') as f:
    tags = exifread.process_file(f)

# Print the tag/ value pairs (thumbnails and the maker note are skipped)
for tag in tags.keys():
    if tag not in ('JPEGThumbnail', 'TIFFThumbnail', 'EXIF MakerNote'):
        print ("Key: %s, value %s" % (tag, tags[tag]))

In [None]:
# I would also like the order xyzc, the dimensions, memmappable

In [None]:
from skimage.io import imread

In [None]:
# im = imread(test_name) # memory error

Variables I want:

* Image ImageWidth (image_width)
* Image ImageLength (image_length)
* Image RowsPerStrip
* channel?

This sort of defines the order
* memmappable?


multipage? hyperstack?


ImageJ channels have colors: how do I extract them?
