update

flatironinstitute · Oct 23, 2018 · b6bec40 · b6bec40
1 parent f3100d0
commit b6bec40
Show file tree

Hide file tree

Showing 3 changed files with 132 additions and 71 deletions.
diff --git a/bin/kbucket-download b/bin/kbucket-download
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import sys
+from kbucket import client as kb
+import argparse
+import os
+
+# NOTE(review): this module-level parser and its -a/-b/-c options are never
+# used for parsing; the __main__ block at the bottom of this script rebinds
+# `parser` to a new ArgumentParser before parse_args() is called.
+# Looks like leftover sample code -- confirm before removing.
+parser = argparse.ArgumentParser(description='Download a file or directory from kbucket')
+
+parser.add_argument('-a', action="store_true", default=False)
+parser.add_argument('-b', action="store", dest="b")
+parser.add_argument('-c', action="store", dest="c", type=int)
+
+def print_usage():
+  """Print the command-line usage summary to stdout, one form per line."""
+  usage_lines=(
+    'Usage:',
+    'kbucket-download kbucket://[share-id]/[path] [output_file]',
+    'kbucket-download sha1://[sha1]/[path] [output_file]',
+  )
+  for usage_line in usage_lines:
+    print(usage_line)
+
+def download(path,output_path,args):
+  """Download a single file from kbucket to output_path.
+
+  path        -- path of the file on kbucket (passed to kb.findFile)
+  output_path -- destination path on the local computer
+  args        -- parsed command-line arguments; this function reads
+                 args.overwrite and args.max_file_size_mb
+
+  Exits the process with status -1 when output_path already exists and
+  args.overwrite is not set, or when the file cannot be found. Returns
+  without downloading when the file resolves to an http(s) URL and is
+  larger than args.max_file_size_mb megabytes.
+  """
+  if os.path.exists(output_path) and not args.overwrite:
+    # This is the single-file case, so the message refers to a file
+    # rather than a directory.
+    print('Cannot download file... output file already exists. Use --overwrite flag to force.')
+    sys.exit(-1)
+  path0=kb.findFile(path)
+  if not path0:
+    print('Unable to find file: '+path)
+    sys.exit(-1)
+  # The size limit applies only when findFile resolved to an http(s) URL.
+  if is_url(path0):
+    size0=kb.getFileSize(path)
+    max_bytes=args.max_file_size_mb*1024*1024
+    # Guard against getFileSize returning None (assumed possible when the
+    # size is unknown -- TODO confirm); None > number raises TypeError.
+    if size0 is not None and size0>max_bytes:
+      print('Ignoring large file ({} > {}): {}'.format(size0/(1024*1024),args.max_file_size_mb,output_path))
+      return
+  kb.realizeFile(path,target_path=output_path)
+
+def is_url(path):
+  """Return True when path begins with 'http://' or 'https://'."""
+  return path.startswith(('http://','https://'))
+
+def download_dir(path,output_path,args):
+  """Recursively download a kbucket directory into output_path.
+
+  path        -- path of the directory on kbucket (passed to kb.readDir)
+  output_path -- local destination directory; created if it does not exist
+  args        -- parsed command-line arguments; reads args.overwrite here
+                 and is forwarded to download() for every file
+
+  Exits the process with status -1 when the directory cannot be read, or
+  when output_path already exists and args.overwrite is not set.
+  """
+  try:
+    listing=kb.readDir(path)
+  except Exception:
+    # Exception (not a bare except) so Ctrl-C and sys.exit still propagate.
+    listing=None
+  # readDir may also signal failure by returning None instead of raising.
+  if listing is None:
+    print('Unable to read directory: '+path)
+    sys.exit(-1)
+  if os.path.exists(output_path):
+    if not args.overwrite:
+      print('Cannot download directory... output directory already exists. Use --overwrite flag to force.')
+      sys.exit(-1)
+  else:
+    os.mkdir(output_path)
+  # readDir returns a plain dict: {'files': {name: info}, 'dirs': {name: info}},
+  # so entries are addressed by their key rather than by a .name attribute.
+  for file_name in listing['files']:
+    print(output_path+'/'+file_name)
+    download(path+'/'+file_name,output_path+'/'+file_name,args)
+  for dir_name in listing['dirs']:
+    print(output_path+'/'+dir_name+'/')
+    download_dir(path+'/'+dir_name,output_path+'/'+dir_name,args)
+
+if __name__ == "__main__":
+  # Command-line entry point: parse the arguments, then dispatch to the
+  # file or directory downloader.
+  parser = argparse.ArgumentParser(description = 'Download a file or directory from kbucket')
+  parser.add_argument('path', help='The path of the file (or directory) on kbucket')
+  parser.add_argument('output_path', help='The path of the destination file on the local computer')
+  parser.add_argument('--dir', action='store_true', help='Download a directory rather than a file')
+  parser.add_argument('--overwrite', action='store_true', help='Allow overwriting existing files')
+  parser.add_argument('--max_file_size_mb', help='Ignore files larger than this size in megabytes', type=float, default=10)
+
+  args = parser.parse_args()
+
+  # --dir selects the directory downloader; otherwise a single file is fetched.
+  fetch = download_dir if args.dir else download
+  fetch(args.path,args.output_path,args)
+
+
diff --git a/kbucket/kbucketclient.py b/kbucket/kbucketclient.py
@@ -55,46 +55,60 @@ def getFileSize(self, path=None,*,sha1=None,share_ids=None,key=None,collection=N
   def moveFileToCache(self,path):
     return self._sha1_cache.moveFileToCache(path)
 
-  def readDir(self,path):
+  def readDir(self,path,recursive=False,include_sha1=True):
+    """List the contents of a directory.
+
+    A path starting with 'kbucket://' is split into a share id and a
+    sub-path and read via _read_kbucket_dir; any other path is read via
+    _read_file_system_dir. The recursive and include_sha1 flags are passed
+    through unchanged, and the helper's result is returned as-is.
+    """
     if path.startswith('kbucket://'):
       list=path.split('/')
       share_id=_filter_share_id(list[2])
       path0='/'.join(list[3:])
-      obj=self._read_kbucket_dir(share_id=share_id,path=path0)
-      if not obj:
-        return None
-      ret=KBucketClientDirectory()
-      for a in obj['files']:
-        ff=KBucketClientDirectoryFile()
-        ff.name=a['name']
-        ff.size=a['size']
-        ff.path=path+'/'+ff.name
-        ff.sha1=a['prv']['original_checksum']
-        ret.files.append(ff)
-      for a in obj['dirs']:
-        ff=KBucketClientDirectoryDir()
-        ff.name=a['name']
-        ff.path=path+'/'+ff.name
-        ret.dirs.append(ff)
-      return ret
+      ret=self._read_kbucket_dir(share_id=share_id,path=path0,recursive=recursive,include_sha1=include_sha1)
     else:
-      ret=KBucketClientDirectory()
+      ret=self._read_file_system_dir(path=path,recursive=recursive,include_sha1=include_sha1)
+    return ret
+
+  def _read_file_system_dir(self,*,path,recursive,include_sha1):
+      """Build a listing of a local directory.
+
+      Returns dict(files=..., dirs=...). Each regular file maps its name to
+      a dict holding 'size' (os.path.getsize) and, when include_sha1 is
+      set, 'sha1' (self.computeFileSha1). Each subdirectory maps its name
+      to {} or, when recursive is set, to its own nested listing. Entries
+      that are neither regular files nor directories are omitted.
+      """
+      ret=dict(
+        files={},
+        dirs={}
+      )
       list=os.listdir(path)
-      for fname in list:
-        if os.path.isfile(fname):
-          ff=KBucketClientDirectoryFile()
-          ff.name=fname
-          ff.path=path+'/'+ff.name
-          ff.size=os.path.getsize(ff.path)
-          ff.sha1=None
-          ret.files.append(ff)
-        elif os.path.isdir(fname):
-          ff=KBucketClientDirectoryDir()
-          ff.name=fname
-          ff.path=path+'/'+ff.name
-          ret.dirs.append(ff)
+      for name0 in list:
+        path0=path+'/'+name0
+        if os.path.isfile(path0):
+          ret['files'][name0]=dict(
+            size=os.path.getsize(path0)
+          )
+          if include_sha1:
+            ret['files'][name0]['sha1']=self.computeFileSha1(path0)
+        elif os.path.isdir(path0):
+          ret['dirs'][name0]={}
+          if recursive:
+            ret['dirs'][name0]=self._read_file_system_dir(path=path0,recursive=recursive,include_sha1=include_sha1)
       return ret
 
+  def _read_kbucket_dir(self,*,share_id,path,recursive,include_sha1):
+    """Build a listing of a directory on a kbucket share.
+
+    share_id     -- id of the share to query
+    path         -- directory path within the share
+    recursive    -- when set, each subdirectory entry holds its own listing
+    include_sha1 -- when set, each file entry carries a 'sha1' value taken
+                    from the server's prv.original_checksum field
+
+    Returns dict(files={name: {...}}, dirs={name: {...}}), or None when the
+    readdir request reports no success. With recursive set, a subdirectory
+    whose own request fails is stored as None.
+    """
+    url=self._config['url']+'/'+share_id+'/api/readdir/'+path
+    obj=_http_get_json(url)
+    if not obj['success']:
+      return None
+
+    ret=dict(
+      files={},
+      dirs={}
+    )
+    for file0 in obj['files']:
+      name0=file0['name']
+      ret['files'][name0]=dict(
+        size=file0['size']
+      )
+      if include_sha1:
+        ret['files'][name0]['sha1']=file0['prv']['original_checksum']
+    for dir0 in obj['dirs']:
+      name0=dir0['name']
+      ret['dirs'][name0]={}
+      if recursive:
+        # Avoid a leading '/' when listing the share root (empty path).
+        subpath=path+'/'+name0 if path else name0
+        # The method must be reached through self, and all of its
+        # parameters are keyword-only.
+        ret['dirs'][name0]=self._read_kbucket_dir(share_id=share_id,path=subpath,recursive=recursive,include_sha1=include_sha1)
+    return ret
+
   def computeFileSha1(self,path):
     if path.startswith('sha1://'):
       list=path.split('/')
@@ -106,6 +120,10 @@ def computeFileSha1(self,path):
     else:
       return self._sha1_cache.computeFileSha1(path)
 
+  def computeDirHash(self,path):
+    """Return _sha1_of_object applied to the recursive listing (sha1s included) of path."""
+    listing=self.readDir(path=path,recursive=True,include_sha1=True)
+    dir_hash=_sha1_of_object(listing)
+    return dir_hash
+
   def uploadFile(self,path,share_id=None,upload_token=None):
     if not share_id:
       share_id=self._config['upload_share_id']
@@ -197,7 +215,7 @@ def _find_file_helper(self,*,path,sha1,share_ids,key,collection,local=True):
     if key is not None:
       sha1=pairio.get(key=key,collection=collection)
       if not sha1:
-        raise Exception('Unable to find file SHA-1 for this key.')
+        return (None,None,None)
     if path is not None:
       if sha1 is not None:
         raise Exception('Cannot specify both path and sha1 in find file')
@@ -257,13 +275,6 @@ def _find_in_share(self,*,sha1,share_id):
         return (url0,size0)
     return (None,None)
 
-  def _read_kbucket_dir(self,*,share_id,path):
-    url=self._config['url']+'/'+share_id+'/api/readdir/'+path
-    obj=_http_get_json(url)
-    if not obj['success']:
-      return None
-    return obj
-
   def _get_cas_upload_url_for_share(self,share_id):
     node_info=self.getNodeInfo(share_id)
     if not node_info:
@@ -272,41 +283,19 @@ def _get_cas_upload_url_for_share(self,share_id):
 
 class KBucketClientDirectory:
+  """Directory listing held as two name-keyed dicts: files and dirs."""
   def __init__(self):
-    self.files=[]
-    self.dirs=[]
+    self.files=dict()
+    self.dirs=dict()
   def toDict(self):
+    """Return dict(files=..., dirs=...) mapping each name to its entry's toDict() result."""
     ret=dict(
-      files=[],
-      dirs=[]
+      files={},
+      dirs={}
     )
-    for file in self.files:
-      ret['files'].append(file.toDict())
-    for dir in self.dirs:
-      ret['dirs'].append(dir.toDict())
+    for name in self.files:
+      ret['files'][name]=self.files[name].toDict()
+    for name in self.dirs:
+      # Look the entry up by name: the loop variable is `name`, so a bare
+      # `dir` here would resolve to the builtin function.
+      ret['dirs'][name]=self.dirs[name].toDict()
     return ret
 
-class KBucketClientDirectoryFile:
-  def __init__(self):
-    self.name=''
-    self.path=''
-    self.size=None
-    self.sha1=None
-  def toDict(self):
-    return dict(
-      name=self.name,
-      size=self.size,
-      sha1=self.sha1
-    )
-
-class KBucketClientDirectoryDir:
-  def __init__(self):
-    self.name=''
-    self.path=''
-  def toDict(self):
-    return dict(
-      name=self.name
-    )
-
 def _http_get_json(url):
   return json.load(urllib.request.urlopen(url))
 

diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 setuptools.setup(
     name=pkg_name,
-    version="0.11.5",
+    version="0.11.6",
     author="Jeremy Magland",
     author_email="jmagland@flatironinstitute.org",
     description="Python client for kbucket",