-
Notifications
You must be signed in to change notification settings - Fork 979
/
library_upload_dir.py
176 lines (148 loc) · 6.77 KB
/
library_upload_dir.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python
import argparse
import os
import sys
from bioblend import galaxy
class Uploader:
    """Upload files named on stdin into a folder of a Galaxy data library.

    The directory part of every input path is mirrored as a chain of library
    folders below the target folder.  ``memo_path`` caches the mapping from a
    path relative to the target folder (components joined with ``/``) to the
    ID of the library item stored under that path, so folders are created at
    most once and files that are already present are not uploaded again.
    """

    def __init__(self, url, api, library_id, folder_id, should_link, non_local):
        """Connect to Galaxy and load the current contents of the target folder.

        :param url: Galaxy URL.
        :param api: Galaxy API key.
        :param library_id: ID of the data library to upload into.
        :param folder_id: ID of the target folder; ``None`` selects the
            library's root folder.
        :param should_link: link files instead of copying them (only used
            when ``non_local`` is false).
        :param non_local: upload the file content from this machine instead
            of asking Galaxy to read the path from its own filesystem.
        """
        self.gi = galaxy.GalaxyInstance(url=url, key=api)
        self.library_id = library_id
        self.folder_id = folder_id
        self.should_link = should_link
        self.non_local = non_local

        # Relative path ("a/b/c") -> ID of the library item at that path.
        self.memo_path = {}
        self.prepopulate_memo()

    def prepopulate_memo(self):
        """Fill ``memo_path`` with everything already below the target folder.

        Because the Galaxy Data Libraries API/system does not act like any
        other file system in existence, and allows multiple files/folders with
        identical names in the same parent directory, we have to prepopulate
        the memoization cache with everything currently in the target
        directory.

        Because the Galaxy Data Libraries API does not work from a perspective
        of "show me what is in this directory", we are forced to get the entire
        contents of the data library, and then filter out things that are
        interesting to us based on a folder prefix.

        When ``folder_id`` is ``None`` the library's root folder is used and
        ``self.folder_id`` is updated to its ID.
        NOTE(review): this assumes the root folder is listed with the name
        "/" -- confirm against the Galaxy version in use.

        :raises Exception: if the target folder is not part of the library.
        """
        existing = self.gi.libraries.show_library(self.library_id, contents=True)

        if self.folder_id is None:
            candidates = [x for x in existing if x["name"] == "/" and x.get("type", "folder") == "folder"]
        else:
            candidates = [x for x in existing if x["id"] == self.folder_id]
        if not candidates:
            raise Exception(f"Unknown folder [{self.folder_id}] in library [{self.library_id}]")

        target = candidates[0]
        if self.folder_id is None:
            self.folder_id = target["id"]

        # Only items strictly below "<target>/" belong to us.  A bare
        # startswith(target_name) would also match siblings such as "/foobar"
        # when the target is "/foo".
        prefix = target["name"].rstrip("/") + "/"
        for item in existing:
            name = item["name"]
            if name != target["name"] and name.startswith(prefix):
                self.memo_path[name[len(prefix):]] = item["id"]

        # The empty relative path is always the target folder itself, even if
        # another library item happens to share its name.
        self.memo_path[""] = target["id"]

    def memoized_path(self, path_parts, base_folder=None):
        """Return the folder ID for the folder path given by ``path_parts``.

        Folders that are already known (found by ``prepopulate_memo`` or
        created earlier by this object) are reused.  Missing folders are
        created below the deepest known ancestor and added to the cache.

        :param path_parts: iterable of folder names, relative to the target
            folder.
        :param base_folder: folder ID to create below when no ancestor is
            cached; defaults to the target folder.
        :returns: ID of the folder for the full path.
        """
        if base_folder is None:
            base_folder = self.folder_id

        path_parts = tuple(path_parts)
        full_key = "/".join(path_parts)
        if full_key in self.memo_path:
            return self.memo_path[full_key]
        if not path_parts:
            # Nothing to create: the empty path is the base folder itself.
            return base_folder

        # Find the longest proper prefix that is already cached (the full
        # path was ruled out above).
        known = ()
        missing = path_parts
        for end in range(len(path_parts) - 1, 0, -1):
            prefix_key = "/".join(path_parts[:end])
            if prefix_key in self.memo_path:
                known = path_parts[:end]
                missing = path_parts[end:]
                base_folder = self.memo_path[prefix_key]
                break

        # Cache keys for every folder that is about to be created.
        new_keys = ["/".join(known + missing[: i + 1]) for i in range(len(missing))]

        # Create the missing folders below base_folder, collecting the ID of
        # each folder per path component ...
        ids = self.recursively_build_path(missing, base_folder)
        # ... and remember them under their relative paths.
        for key, fid in zip(new_keys, ids):
            self.memo_path[key] = fid
        return ids[-1]

    def recursively_build_path(self, path_parts, parent_folder_id, ids=None):
        """Create a chain of nested folders below ``parent_folder_id``.

        :param path_parts: iterable of folder names, outermost first.
        :param parent_folder_id: ID of the folder to create the first one in.
        :param ids: optional list that the new folder IDs are appended to.
        :returns: ``ids`` (a new list if none was given) holding one folder
            ID per path component, in order.
        """
        if ids is None:
            ids = []
        for folder_name in path_parts:
            created = self.gi.libraries.create_folder(
                self.library_id, folder_name, base_folder_id=parent_folder_id
            )
            # Each new folder becomes the parent of the next component.
            parent_folder_id = created[0]["id"]
            ids.append(parent_folder_id)
        return ids

    # Originally adapted from
    # http://stackoverflow.com/questions/13505819/python-split-path-recursively/13505966#13505966
    def rec_split(self, s):
        """Split path ``s`` into a tuple of its components.

        The filesystem root, empty components and ``.`` components are
        dropped, so ``"/"``, ``"."`` and ``""`` all give ``()`` and
        ``"/a/./b"`` gives ``("a", "b")``.
        """
        parts = []
        remaining = s
        while remaining:
            head, tail = os.path.split(remaining)
            if tail and tail != ".":
                parts.append(tail)
            if head == remaining:
                # os.path.split() no longer makes progress ("/" or "//"):
                # the root has been reached.
                break
            remaining = head
        return tuple(reversed(parts))

    def upload(self):
        """Upload every existing path read from stdin (one per line).

        Blank lines and paths that do not exist are skipped.  A path whose
        relative location is already present in the library (according to
        ``memo_path``) is reported but not uploaded again.
        """
        all_files = [line.strip() for line in sys.stdin]

        for idx, path in enumerate(all_files):
            if not path or not os.path.exists(path):
                continue
            # os.path.split() also copes with bare file names and with files
            # located directly in the filesystem root.
            dir_name, fname = os.path.split(path)

            # Figure out what the memo key will be early
            basepath = self.rec_split(dir_name)
            memo_key = "/".join(basepath + (fname,))
            # So that we can check if it really needs to be uploaded.
            already_uploaded = memo_key in self.memo_path

            fid = self.memoized_path(basepath, base_folder=self.folder_id)
            print(f"[{idx + 1}/{len(all_files)}] {fid}/{fname} uploaded={already_uploaded}")
            if already_uploaded:
                continue

            if self.non_local:
                self.gi.libraries.upload_file_from_local_path(
                    self.library_id,
                    path,
                    folder_id=fid,
                )
            else:
                self.gi.libraries.upload_from_galaxy_filesystem(
                    self.library_id,
                    path,
                    folder_id=fid,
                    link_data_only="link_to_files" if self.should_link else "copy_files",
                )
if __name__ == "__main__":
    # Command-line entry point: collect the connection and target settings,
    # then upload every path listed on stdin.
    arg_parser = argparse.ArgumentParser(description="Upload a directory into a data library")

    # Connection and target library (all mandatory).
    arg_parser.add_argument("-u", "--url", required=True, help="Galaxy URL")
    arg_parser.add_argument("-a", "--api", required=True, help="API Key")
    arg_parser.add_argument("-l", "--lib", dest="library_id", required=True, help="Library ID")

    # Optional target folder inside the library.
    arg_parser.add_argument(
        "-f",
        "--folder",
        dest="folder_id",
        help="Folder ID. If not specified, will go to root of library.",
    )

    # Upload mode switches; both are off unless given.
    arg_parser.add_argument(
        "--nonlocal",
        dest="non_local",
        action="store_true",
        help="Set this flag if you are NOT running this script on your Galaxy head node with access to the full filesystem",
    )
    arg_parser.add_argument(
        "--link",
        dest="should_link",
        action="store_true",
        help="Link datasets only, do not upload to Galaxy. ONLY Avaialble if you run 'locally' relative to your Galaxy head node/filesystem ",
    )

    options = arg_parser.parse_args()
    uploader = Uploader(
        url=options.url,
        api=options.api,
        library_id=options.library_id,
        folder_id=options.folder_id,
        should_link=options.should_link,
        non_local=options.non_local,
    )
    uploader.upload()