Copy Wayne's
```
/allen/aibs/technology/waynew/behavior/behavior_only_nwb/20210227_visual_behavior_big_table.csv
```
into `data/`

To run this off-premises, you will also need to set up an ssh tunnel connecting your local machine to the test warehouse, i.e.
```
ssh -L 8080:testwarehouse:9000 your.username@synapse.corp.alleninstitute.org
```

Define a class (that really just wraps pandas) to do the manipulation of the metadata

In [1]:
import pandas as pd
import requests
from requests_toolbelt.downloadutils import stream
import hashlib
import json
import os
from contextlib import closing
import warnings
import time

In [2]:
def get_md5_checksum(file_path):
    """
    Compute the md5 checksum of a file on disk

    file_path -- path to the file whose contents will be hashed

    Returns
    -------
    The md5 digest of the file's contents as a hexadecimal str
    """
    # read the file in fixed-size pieces so that large files never
    # need to be held in memory all at once
    block_size = 1000000
    md5_hasher = hashlib.md5()
    with open(file_path, 'rb') as handle:
        # iter(callable, sentinel) keeps calling read() until it
        # returns the empty bytes object, i.e. end of file
        for block in iter(lambda: handle.read(block_size), b''):
            md5_hasher.update(block)
    return md5_hasher.hexdigest()

In [3]:
class MetadataCache(object):
    """
    Thin wrapper around a pandas DataFrame of file metadata that also
    manages a local, checksum-verified cache of downloaded files.

    Downloaded files are stored in cache_dir/<file_version>/ and are
    recorded in cache_dir/manifest.json, a dict of the form

        {file_version: {file_url: {'file_path': ..., 'md5_checksum': ...}}}
    """

    def update_manifest(self):
        """
        Write contents of self._manifest to self._manifest_path
        """
        with open(self._manifest_path, 'w') as out_file:
            out_file.write(json.dumps(self._manifest, indent=2, sort_keys=True))

    def _download_file(self, file_url, file_path):
        """
        Actually download the file from file_url and save
        it to file_path

        If anything goes wrong (including the user interrupting the
        download) any partially written file is removed and the
        original exception is re-raised.
        """
        try:
            with closing(requests.get(file_url, stream=True)) as response:
                response.raise_for_status()
                with open(file_path, 'wb') as out_file:
                    stream.stream_response_to_file(response, path=out_file)
        except BaseException:
            # deliberately broad: a KeyboardInterrupt must also clean up
            # the partial file, otherwise a truncated file would block
            # the next call to get_file; the exception is always re-raised
            if os.path.exists(file_path):
                os.unlink(file_path)
            raise

    def _forget_file(self, file_url, file_version):
        """
        Remove the manifest entry for (file_version, file_url) and
        persist the manifest to disk
        """
        self._manifest[file_version].pop(file_url)
        self.update_manifest()

    def _check_for_file(self, file_url, file_version):
        """
        Returns True if the file has already been downloaded;
        False otherwise
        (Will also return False if the file listed in the manifest is
        no longer on disk, or if the md5 checksum of the file on
        disk is not what the manifest says to expect. In both cases
        the stale entry is removed from the manifest.)
        """
        if file_version not in self._manifest:
            return False

        if file_url not in self._manifest[file_version]:
            return False

        entry = self._manifest[file_version][file_url]
        target_checksum = entry['md5_checksum']
        file_path = entry['file_path']

        # the manifest can outlive the file (e.g. the user cleaned out
        # the cache directory by hand); treat that as "not downloaded"
        # rather than crashing when trying to checksum a missing file
        if not os.path.isfile(file_path):
            warnings.warn('file listed in manifest is missing from disk; '
                          'it will be re-downloaded')
            self._forget_file(file_url, file_version)
            return False

        checksum = get_md5_checksum(file_path)

        if checksum != target_checksum:
            warnings.warn('checksum changed; deleting old file')
            os.unlink(file_path)
            self._forget_file(file_url, file_version)
            return False
        return True

    def get_file(self, file_url, file_version):
        """
        Return the local path to a file, downloading it if it is not
        already in the cache

        file_url -- url to file
        file_version -- string denoting version of the file to download
                        (not actually sure how we are going to do versioning)

        Returns
        -------
        Path to the downloaded file on the local system

        Raises
        ------
        RuntimeError if the version sub-directory exists but is not a
        directory, or if the destination file already exists on disk
        without a valid manifest entry
        """

        if self._check_for_file(file_url=file_url, file_version=file_version):
            return self._manifest[file_version][file_url]['file_path']

        print(f"Actually downloading {file_url} version {file_version}")

        # create a sub directory for files with this version tag
        version_dir = os.path.join(self.cache_dir, file_version)
        if not os.path.exists(version_dir):
            os.makedirs(version_dir)
        if not os.path.isdir(version_dir):
            raise RuntimeError(f"\n{version_dir}\nis not a directory")

        # the path where we will actually store the downloaded file
        # (named after the last component of the url)
        file_path = os.path.join(version_dir,
                                 os.path.basename(file_url)+'.nwb')

        # make sure it does not exist yet
        if os.path.exists(file_path):
            raise RuntimeError(f"\ntrying to create {file_path}\nbut it already exists")

        # download the file
        self._download_file(file_url, file_path)

        checksum = get_md5_checksum(file_path)

        # update the manifest
        if file_version not in self._manifest:
            self._manifest[file_version] = {}

        self._manifest[file_version][file_url] = {'file_path': file_path,
                                                  'md5_checksum': checksum}
        self.update_manifest()

        return file_path

    def get_all_files(self, file_dataframe, file_version='v0'):
        """
        Get all of the files in a dataframe (hopefully a dataframe
        that is a result of querying the full dataframe of metadata
        associated with this cache)

        file_dataframe -- a dataframe with a 'wkf' column of file urls
        file_version -- version tag passed through to get_file

        Returns
        -------
        A dict mapping each url (after 'testwarehouse:9000' has been
        replaced with self._warehouse_host) to the local path of the
        corresponding file
        """
        output = {}
        for file_url in file_dataframe.wkf:
            # point the url at whichever host we are actually talking to
            file_url = file_url.replace('testwarehouse:9000', self._warehouse_host)
            path = self.get_file(file_url, file_version)
            output[file_url] = path
        return output

    @property
    def dataframe(self):
        """
        The pandas DataFrame read from the metadata csv file
        """
        return self._dataframe

    def __init__(self,
                 metadata_file = 'data/20210227_visual_behavior_big_table.csv',
                 cache_dir = None):
        """
        metadata_file -- path to the metadata csv file
        cache_dir -- directory where downloaded files and the manifest
                     of downloaded files (manifest.json) will be kept;
                     if None, defaults to data/cache under the current
                     working directory

        Raises
        ------
        RuntimeError if cache_dir exists but is not a directory, or if
        the manifest path exists but is not a file
        """

        # host substituted for 'testwarehouse:9000' in get_all_files
        self._warehouse_host = 'localhost:8080'

        self._dataframe = pd.read_csv(metadata_file)

        if cache_dir is None:
            this_dir = os.path.abspath('')
            cache_dir = os.path.join(this_dir, 'data/cache')
        self.cache_dir = cache_dir

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        if not os.path.isdir(self.cache_dir):
            raise RuntimeError(f"\ncache_dir\n{self.cache_dir}\nis not a dir")

        self._manifest_path = os.path.join(self.cache_dir, 'manifest.json')
        if os.path.exists(self._manifest_path):
            if not os.path.isfile(self._manifest_path):
                raise RuntimeError(f"\nmanifest_path\n{self._manifest_path}\nis not a file")
            with open(self._manifest_path, 'rb') as in_file:
                self._manifest = json.load(in_file)
        else:
            warnings.warn("Creating new manifest; all data will be re-downloaded")
            self._manifest = {}
            self.update_manifest()

    def _all_possible(self, column_name):
        """
        List all possible values for a column

        column_name -- a str; the name of the column

        Returns an unsorted set of possible values; returns None if column
        does not exist
        """
        if column_name not in self.dataframe.columns:
            return None
        values = set(self.dataframe[column_name])
        return values

    def get_all_reporter_lines(self):
        """
        Set of all values in the 'reporter_line' column (None if absent)
        """
        return self._all_possible('reporter_line')

    def get_all_targeted_structures(self):
        """
        Set of all values in the 'targeted_structure' column (None if absent)
        """
        return self._all_possible('targeted_structure')

    def get_all_genotypes(self):
        """
        Set of all values in the 'full_genotype' column (None if absent)
        """
        return self._all_possible('full_genotype')

In [4]:
# Uses the defaults: metadata is read from
# 'data/20210227_visual_behavior_big_table.csv' and, because cache_dir is
# None, downloads are cached in data/cache under the current working directory
cache = MetadataCache()

Try to download an arbitrary file

In [None]:
# NOTE(review): get_file() uses this URL verbatim; only get_all_files()
# rewrites 'testwarehouse:9000' to the tunnelled host. Off-premises this
# presumably needs to be 'localhost:8080' instead -- TODO confirm
uri = 'http://testwarehouse:9000/api/v2/well_known_file_download/1085755199'

In [None]:
cache.get_file(file_url=uri, file_version='v0')

Try to download it again; note that it just returns the path where the file was saved, without downloading it a second time

In [None]:
cache.get_file(file_url=uri, file_version='v0')

List the values for some columns in the metadata dataframe

In [None]:
cache.get_all_targeted_structures()

In [None]:
cache.get_all_reporter_lines()

In [5]:
cache.get_all_genotypes()

{'Slc17a7-IRES2-Cre/wt;Camk2a-tTA/wt;Ai93(TITL-GCaMP6f)/wt',
 'Slc17a7-IRES2-Cre/wt;Camk2a-tTA/wt;Ai94(TITL-GCaMP6s)/wt',
 'Sst-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt',
 'Vip-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt'}

Select a subset of rows from the dataframe

In [6]:
# Boolean-mask selection: keep only the rows whose full_genotype column
# equals this exact genotype string
subset = cache.dataframe.loc[cache.dataframe.full_genotype=='Vip-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt']

In [7]:
len(subset)

1476

In [8]:
set(subset.full_genotype)

{'Vip-IRES-Cre/wt;Ai148(TIT2L-GC6f-ICL-tTA2)/wt'}

In [9]:
subset = subset[:5]

In [10]:
len(subset)

5

Download all of the files in that subset

In [11]:
# Fetch every URL in subset.wkf (files already in the cache are not
# downloaded again) using the default file_version='v0'; the result maps
# each host-rewritten URL to the local path of its file
path_map = cache.get_all_files(subset)

Actually downloading http://localhost:8080/api/v2/well_known_file_download/1085755589 version v0
Actually downloading http://localhost:8080/api/v2/well_known_file_download/1085755636 version v0
Actually downloading http://localhost:8080/api/v2/well_known_file_download/1085755699 version v0
Actually downloading http://localhost:8080/api/v2/well_known_file_download/1085755676 version v0
Actually downloading http://localhost:8080/api/v2/well_known_file_download/1085755697 version v0
