In [1]:
# Author and date: print the current OS user name and the current local timestamp
import datetime, getpass
print(getpass.getuser(), datetime.datetime.today())

dimitri 2019-11-03 15:36:53.412184


# Attachments and configurable blobs (new in datajoint 0.12.0)

This notebook demonstrates the storage and retrieval of complex data types (blobs) and file attachments in DataJoint.

A **blob** refers to an attribute in a table that can store complex data structures such as numeric arrays.

An **attachment** refers to an attribute that can store an entire file with its filename, etc.

Both blobs and attachments can be stored directly in the tables of the relational database or in configurable external "stores" such as network-attached storage servers or object storage systems such as [Amazon S3](https://aws.amazon.com/s3/) and [Minio](https://min.io/).

Many of these features existed in prior releases of DataJoint but have been substantially expanded in version 0.12.0.

In [2]:
%matplotlib inline
from IPython import display
from matplotlib import pyplot as plt
import os
import imageio
import requests
import numpy as np

import datajoint as dj

In [3]:
# show the installed DataJoint version (the features shown here are described as new in 0.12.0)
dj.__version__

'0.12.1'

## Configure stores
The following is a configuration defining two external stores. This should only be done once for all users and the configuration file must be saved and provided to all users.

The first store is named `"shared"` and is hosted on an S3 endpoint. 

The second store is named `"local"` and it uses the local path `./dj-store`.

Now these repositories can be used for blobs and attachments.

In [4]:
## Storage configuration

# set up stores
# The S3 credentials default to the demo values used throughout this notebook,
# but can be overridden through environment variables so that real keys never
# have to be written into (and saved with) the notebook.
dj.config['stores'] = {
    'shared': {  # store in an S3 bucket
        'protocol': 's3',
        'endpoint': 'localhost:9000',
        'access_key': os.environ.get('DJ_DEMO_S3_ACCESS_KEY', 'datajoint'),
        'secret_key': os.environ.get('DJ_DEMO_S3_SECRET_KEY', 'datajoint'),
        'bucket': 'datajoint-demo',
        'location': ''
    },
    'local': {  # store in files
        'protocol': 'file',
        # absolute path, resolved against the current working directory
        'location': os.path.abspath('./dj-store')
    }}

In [6]:
# create a schema for this demo
# NOTE(review): in the recorded run this call prompted for the DataJoint username and
# password and then connected to the database — confirm that this is where the connection is opened
schema = dj.schema('test_attach')

Please enter DataJoint username: dimitri
Please enter DataJoint password: ········
Connecting dimitri@localhost:3306


In [8]:
# Prompt for the database password instead of hard-coding it, so the secret is
# never saved with the notebook or committed to version control.
# (getpass is imported in the first cell.)
dj.config['database.password'] = getpass.getpass('Please enter DataJoint password: ')

In [10]:
# set the database user name used for the connection
# NOTE(review): hard-coded to the author's account; other users must change this value
dj.config['database.user'] = 'dimitri'

In [11]:
# display the current configuration
# NOTE(review): dj.config contains database.password and the stores' secret_key values;
# clear or redact this cell's output before sharing the notebook
dj.config

{   'connection.charset': '',
    'connection.init_function': None,
    'database.host': 'localhost',
    'database.password': 'dosaafdosaaf',
    'database.port': 3306,
    'database.reconnect': True,
    'database.use_tls': None,
    'database.user': 'dimitri',
    'display.limit': 12,
    'display.show_tuple_count': True,
    'display.width': 14,
    'enable_python_native_blobs': False,
    'fetch_format': 'array',
    'loglevel': 'INFO',
    'safemode': True,
    'stores': {   'local': {   'location': '/home/dimitri/dev/dj-python-101/ch1/dj-store',
                               'protocol': 'file'},
                  'shared': {   'access_key': 'datajoint',
                                'bucket': 'datajoint-demo',
                                'endpoint': 'localhost:9000',
                                'location': '',
                                'protocol': 's3',
                                'secret_key': 'datajoint'}}}

In [9]:
# display the current configuration
# NOTE(review): same statement as the preceding cell but recorded at an earlier execution
# count (database.user was still None); consider removing it. The output also contains
# database.password and secret_key values — clear or redact before sharing
dj.config

{   'connection.charset': '',
    'connection.init_function': None,
    'database.host': 'localhost',
    'database.password': 'dosaafdosaaf',
    'database.port': 3306,
    'database.reconnect': True,
    'database.use_tls': None,
    'database.user': None,
    'display.limit': 12,
    'display.show_tuple_count': True,
    'display.width': 14,
    'enable_python_native_blobs': False,
    'fetch_format': 'array',
    'loglevel': 'INFO',
    'safemode': True,
    'stores': {   'local': {   'location': '/home/dimitri/dev/dj-python-101/ch1/dj-store',
                               'protocol': 'file'},
                  'shared': {   'access_key': 'datajoint',
                                'bucket': 'datajoint-demo',
                                'endpoint': 'localhost:9000',
                                'location': '',
                                'protocol': 's3',
                                'secret_key': 'datajoint'}}}

In [None]:
# start the demo from a clean slate
schema.drop()  # drop if exists to create anew

In [None]:
# create a schema for this demo (re-created here after the drop in the previous cell)
schema = dj.schema('test_attach')

# A Minimal example of blobs and configurable blobs
Let's declare the table `Test` with blobs and attachments stored internally and externally.

In [None]:
@schema
class Test(dj.Manual):
    """Manual table with one blob and one attachment column per storage option.

    Per the column comments in ``definition``: b0/a0 are stored internally in
    the table, b1/a1 in the "shared" store (S3) and b2/a2 in the "local" store
    (file system).
    """
    definition = """
    # Test blob and attachments
    id : int
    ---
    b0 : longblob       # a python object stored internally in the table
    b1 : blob@shared    # a python object stored on S3
    b2 : blob@local     # a python object store on the file system
    a0 : attach         # a file attachment stored internally in the table
    a1 : attach@shared  # a file attachment stored on s3
    a2 : attach@local   # a file attachment stored on the file system
    """

In [None]:
# Create three numpy arrays of different shapes and save each one to its own .npy file
q0 = np.random.randn(3, 4)
q1 = np.random.randn(7)
q2 = np.random.randn(2, 3, 4)
f0, f1, f2 = './outfile0.npy', './outfile1.npy', './outfile2.npy'
for npy_path, array in zip((f0, f1, f2), (q0, q1, q2)):
    np.save(npy_path, array)

In [None]:
# empty the table so that the insert of id=1 in the next cell starts from a clean state
Test.delete()

In [None]:
# insert the blobs and the attachments into the table:
# the arrays q0-q2 go into the blob columns, the file paths f0-f2 into the attachment columns
Test.insert1(dict(id=1, b0=q0, b1=q1, b2=q2, a0=f0, a1=f1, a2=f2))

In [None]:
# preview the contents of the table
Test()

In [None]:
# delete the local copies of the attached files; the data is fetched back from the table below
for attached_path in (f0, f1, f2):
    os.remove(attached_path)

In [None]:
# now fetch them and verify that they were retrieved correctly
# (with as_dict=True each row is accessed below as a dict keyed by attribute name)
result = Test.fetch(as_dict=True)

In [None]:
# the internally stored blob b0 as fetched from the table
result[0]['b0']

In [None]:
# the original array that was inserted as b0, for visual comparison
q0

In [None]:
# programmatic check that the fetched blob equals the original array
np.array_equal(q0, result[0]['b0'])

In [None]:
# the fetched value of attachment a1; it is passed to np.load in the next cell,
# so it is presumably the path of the locally restored file — confirm
result[0]['a1']

In [None]:
# load the array back from the fetched attachment file
np.load(result[0]['a1'])

In [None]:
# the original array that was saved to the file attached as a1, for visual comparison
q1

In [None]:
# show the schema's external tables (indexed by store name in the following cells)
schema.external

In [None]:
# show the external table of the 'local' store
schema.external['local']

In [None]:
# show the external table of the 'shared' store
schema.external['shared']

In [None]:
# list the external paths recorded for the 'local' store
# NOTE(review): the exact return format is not shown in this notebook — see the DataJoint docs
schema.external['local'].fetch_external_paths()

In [None]:
# list the external paths recorded for the 'shared' store
# NOTE(review): the exact return format is not shown in this notebook — see the DataJoint docs
schema.external['shared'].fetch_external_paths()

In [None]:
# entries of the 'shared' external table that are presumably still referenced by table rows — confirm
schema.external['shared'].used()

In [None]:
# entries of the 'local' external table that are presumably still referenced by table rows — confirm
schema.external['local'].used()

In [None]:
# entries of the 'shared' external table that are presumably no longer referenced by any table row — confirm
schema.external['shared'].unused()

In [None]:
# delete all rows of Test; as explained later in this notebook, deleting from a data
# table does not by itself remove the data from external storage
Test.delete()

In [None]:
# inspect the external table of the 'shared' store after the delete
schema.external['shared']

In [None]:
# clean up the 'shared' store: delete the external table's unused entries together with their external files
schema.external['shared'].delete(delete_external_files=True)

In [None]:
# complete cleanup of all external stores:
# run the same delete on the external table of every configured store
for external_table in schema.external.values():
    external_table.delete(delete_external_files=True)

## Lookup of images on the web
We create a lookup table, `WebImage`, that points to some images available on the web.

In [None]:
@schema
class WebImage(dj.Lookup):
    """Lookup table of images available on the web, one row per image."""

    definition = """
    # A reference to a web image
    image_number : int
    ---
    image_name : varchar(30)
    image_description : varchar(1024)
    image_url : varchar(1024)
    
    unique index(image_name)
    """

    # Each row is (image_number, image_name, image_description, image_url).
    # Adjacent string literals inside parentheses are concatenated by Python,
    # so the long descriptions are split across lines without backslashes.
    contents = [
        (
            0,
            "pyramidal",
            (
                'Coronal section containing the chronically imaged pyramidal neuron "dow" '
                '(visualized by green GFP) does not stain for GABA (visualized by antibody staining in red). '
                'Confocal image stack, overlay of GFP and GABA channels. Scale bar: 100 um'
            ),
            "https://upload.wikimedia.org/wikipedia/commons/d/dc/PLoSBiol4.e126.Fig6fNeuron.jpg",
        ),
        (
            1,
            "striatal",
            (
                "Mouse spiny striatal projection neuron expressing a transgenic fluorescent protein "
                "(colored yellow) delivered by a recombinant virus (AAV). "
                "The striatal interneuron are stainerd in green for the neurokinin-1 receptor."
            ),
            "https://upload.wikimedia.org/wikipedia/commons/e/e8/Striatal_neuron_in_an_interneuron_cage.jpg",
        ),
    ]

### Preview the images directly from the web

In [None]:
# Preview image 0 directly from its URL.
# Uses IPython's display module (imported at the top of the notebook); the bare
# name `Image` is never imported here and would raise NameError on a fresh kernel.
display.Image(url=(WebImage & 'image_number=0').fetch1('image_url'))

In [None]:
# Preview image 1 directly from its URL.
# Uses IPython's display module (imported at the top of the notebook); the bare
# name `Image` is never imported here and would raise NameError on a fresh kernel.
display.Image(url=(WebImage & 'image_number=1').fetch1('image_url'))

## Define a table with attachments
Now we can use the stores to define attachment attributes in the form `attribute_name : attach@store  # comment` where the store is either `@local` or `@shared` as defined above.

Let's define the table `OriginalFile` to automatically download and attach files from `WebImage` and store the attachments in the shared store.

In [None]:
@schema
class OriginalFile(dj.Imported):
    """Image files downloaded from the URLs in WebImage, stored as attachments in the "shared" store."""
    definition = """
    -> WebImage
    ---
    image_file : attach@shared
    """

    def make(self, key):
        """Download the image referenced by ``key`` and insert it as an attachment.

        :param key: primary key of the WebImage entry to process
        :raises requests.HTTPError: if the server answers with an error status
        :raises requests.Timeout: if the server does not respond in time
        """
        # get the URL
        url = (WebImage & key).fetch1('image_url')

        # download the file from the web; fail loudly on HTTP errors so that an
        # error page is never stored in place of the image
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # temporary local copy, named after the last component of the URL
        local_file = os.path.join(os.path.abspath('.'), url.split('/')[-1])
        try:
            with open(local_file, 'wb') as f:
                f.write(response.content)

            # attach the file
            self.insert1(dict(key, image_file=local_file))
        finally:
            # delete the downloaded file, even if writing or inserting failed
            if os.path.exists(local_file):
                os.remove(local_file)

In [None]:
# draw the diagram of the schema's tables (presumably including their dependencies — confirm)
dj.Diagram(schema)

In [None]:
# perform the download
# (populate presumably calls OriginalFile.make for each WebImage entry not yet processed — confirm)
OriginalFile.populate()

In [None]:
# preview the contents of the table
OriginalFile()

In [None]:
# preview downloaded attachment
# fetch1 restores the attachment of image 1 to a local file, whose path is kept in `file`
file = (OriginalFile & 'image_number=1').fetch1('image_file')
# Render it with IPython's display module (imported at the top of the notebook);
# the bare name `Image` is never imported here and would raise NameError.
display.Image(filename=file)

In [None]:
# delete the local copy of the attachment that was fetched for the preview
os.remove(file)

## Extract images into blobs
Now let's define another class that extracts images from attached files and stores them as blobs in the local store.

In [None]:
# Declare a table with a configurable blob
@schema
class Slide(dj.Computed):
    """Image arrays read from the OriginalFile attachments, stored as blobs in the "local" store."""
    definition = """
    -> OriginalFile
    ---
    image_array : blob@local  # array in specified store
    """

    def make(self, key):
        """Read the attached image file for ``key`` and insert its pixel data as a blob.

        :param key: primary key of the OriginalFile entry to process
        """
        # get the attached file
        file = (OriginalFile & key).fetch1('image_file')

        try:
            # save image data
            self.insert1(dict(key, image_array=imageio.imread(file)))
        finally:
            # remove the downloaded file, even if reading or inserting failed
            os.remove(file)

In [None]:
# compute the slides (populate presumably calls Slide.make for each OriginalFile entry not yet processed — confirm)
Slide.populate()

In [None]:
# preview the contents of the table
Slide()

In [None]:
# show the schema's external tables now that attachments and blobs have been stored
schema.external

In [None]:
# show the external table of the 'local' store, which holds the Slide blobs (blob@local)
schema.external['local']

In [None]:
# Plot the image from a blob: fetch the array of image 0 and show it
# (the trailing semicolon suppresses the text repr of the returned object)
plt.imshow((Slide & 'image_number=0').fetch1('image_array'));

## Caching
By default, the data from blobs and attachments are retrieved from remote stores with every fetch command. 
For repeated queries, a cache folder may be specified to improve performance and reduce cost of operations.
After the first fetch of a given blob or attachment, it will be read from the cache. 

In [None]:
# configure the cache: a folder given relative to the current working directory
dj.config['cache'] = './dj-cache'

In [None]:
# clear the cache for the timing test:
# remove the whole cache folder, but only if it already exists
import shutil

cache_dir = dj.config['cache']
if os.path.isdir(cache_dir):
    shutil.rmtree(cache_dir)

In [None]:
%%timeit -n1 -r1

# first time no cache: time a single fetch (one loop, one repeat) right after the cache was cleared
files = OriginalFile.fetch('image_file')

In [None]:
%%timeit -n1 -r1

# now with cache: the same single fetch again, for comparison with the timing of the previous cell
files = OriginalFile.fetch('image_file')

## Deleting
Deleting from tables using external storage is just as simple and transaction-safe as with all other kinds of attributes. Simply use the `delete` method:

In [None]:
# show the schema's external tables before deleting
schema.external

In [None]:
# show the external table of the 'shared' store, which holds the OriginalFile attachments (attach@shared)
schema.external['shared']

In [None]:
# unused entries of the 'shared' store before the delete (presumably none at this point — confirm)
schema.external['shared'].unused()

In [None]:
# delete image 0 from WebImage; OriginalFile and Slide reference WebImage (->),
# so their matching rows are presumably deleted as well — confirm
(WebImage & 'image_number=0').delete()

In [None]:
# unused entries of the 'shared' store after the delete, for comparison with the listing before it
schema.external['shared'].unused()

# Deleting

For the sake of performance, deleting from the data tables does not remove the data from external storage. 

The `delete` method of the external table deletes its **unused** entries and their corresponding external files.

In [None]:
# show the schema's external tables before cleaning them up
schema.external

You may clean up the external table using its `delete` method. It is a transaction-safe operation and can be performed at any time.

In [None]:
# clean up the 'local' store: delete the external table's unused entries together with their external files
schema.external['local'].delete(delete_external_files=True)

In [None]:
# clean up the 'shared' store: delete the external table's unused entries together with their external files
schema.external['shared'].delete(delete_external_files=True)

In [None]:
# the same cleanup for every configured store in one loop
for external_table in schema.external.values():
    external_table.delete(delete_external_files=True)

In [None]:
# entries of the 'shared' store that remain in use after the cleanup (presumably those of image 1 — confirm)
schema.external['shared'].used()

In [None]:
# unused entries of the 'shared' store after the cleanup (expected to be empty — confirm)
schema.external['shared'].unused()