/
SpeedBagIt.java
428 lines (391 loc) · 18.3 KB
/
SpeedBagIt.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/**
* This work was created by participants in the DataONE project, and is
* jointly copyrighted by participating institutions in DataONE. For
* more information on DataONE, see our web site at http://dataone.org.
*
* Copyright ${year}
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* $Id$
*/
package org.dataone.speedbagit;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PipedInputStream;
import java.io.PipedOutputStream;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * The main interface for creating a BagIt compliant zip file. The SpeedBagIt class
 * manages SpeedFile objects, which represent files that will be written to the output
 * stream.
 * This class is also responsible for generating the tagmanifest and manifest files along
 * with the default BagIt files: bagit.txt and bag-info.txt.
 *
 * NOTE(review): both manifests are stored as maps keyed by checksum, so two files
 * with identical content (and therefore identical checksums) overwrite each other's
 * manifest line. Changing that would alter the meaning of the public map fields, so
 * it is only flagged here.
 */
public class SpeedBagIt {
    private final static Log logger = LogFactory.getLog(SpeedBagIt.class);

    // Classpath resource holding the string constants (file names, tag keys)
    private static final String PROPERTIES_RESOURCE = "speed-bagit.properties";

    // Timestamp layout used in the start/finish log lines; DateTimeFormatter is thread-safe
    private static final DateTimeFormatter LOG_TIMESTAMP_FORMAT =
            DateTimeFormatter.ofPattern("yyyy.MM.dd.HH.mm.ss");

    // A shared ExecutorService that runs the piped stream writers in other threads.
    // Sized to one less than the number of available processors, but never below one.
    private static final ExecutorService executor =
            Executors.newFixedThreadPool(Math.max(1, Runtime.getRuntime().availableProcessors() - 1));

    // The properties file holding string constants
    private final Properties properties;
    // Version that the bag is (0.97, 1.0, etc)
    public double version;
    // Contents of tagmanifest-{algo}.txt file, keyed by checksum with the path as value
    public Map<String, String> tagManifestFile;
    // Contents of manifest-{algo}.txt file, keyed by checksum with the path as value
    public Map<String, String> dataManifestFile;
    // The name of the algorithm. Should be compatible with the MessageDigest class
    public String checksumAlgorithm;
    // Map of key-values that go in the bagit.txt file
    public Map<String, String> bagitMetadata;
    // The payload files in the bag
    private final List<SpeedFile> dataFiles;
    // The tag files in the bag
    private final List<SpeedFile> tagFiles;

    /**
     * Creates a new instance of a SpeedBagIt. This constructor supports adding
     * additional metadata to the bagit.txt file, when created.
     *
     * @param version The bag version (0.97, 1.0, etc)
     * @param checksumAlgorithm The name of the algorithm used to checksum the files
     * @param bagitMetadata A key-value mapping of metadata that belongs in bagit.txt;
     *                      {@code null} is treated as "no extra metadata"
     * @throws IOException If the speed-bagit.properties resource cannot be read
     * @throws NullPointerException If the speed-bagit.properties resource is not on the classpath
     */
    public SpeedBagIt(double version,
                      String checksumAlgorithm,
                      Map<String, String> bagitMetadata) throws IOException {
        this.version = version;
        this.checksumAlgorithm = checksumAlgorithm;
        this.dataFiles = new ArrayList<>();
        this.tagFiles = new ArrayList<>();
        // A null map used to surface later as a NullPointerException in generateBagitTxt()
        this.bagitMetadata = (bagitMetadata != null) ? bagitMetadata : new HashMap<>();
        this.dataManifestFile = new HashMap<>();
        this.tagManifestFile = new HashMap<>();
        this.properties = loadProperties(this.getClass().getClassLoader());
    }

    /**
     * Creates a new SpeedBagIt object. This constructor requires the bare
     * minimum arguments to make a valid bag.
     *
     * @param version The bag version (0.97, 1.0, etc)
     * @param checksumAlgorithm The name of the algorithm used to checksum the files
     * @throws IOException If the speed-bagit.properties resource cannot be read
     * @throws NullPointerException If the speed-bagit.properties resource is not on the classpath
     */
    public SpeedBagIt(double version,
                      String checksumAlgorithm) throws IOException {
        this(version, checksumAlgorithm, new HashMap<>());
    }

    /**
     * Loads the string constants from the classpath, closing the resource stream afterwards.
     *
     * @param loader The class loader used to locate the resource
     * @return The loaded properties
     * @throws IOException If the resource cannot be read
     */
    private static Properties loadProperties(ClassLoader loader) throws IOException {
        Properties loaded = new Properties();
        try (InputStream resource = Objects.requireNonNull(
                loader.getResourceAsStream(PROPERTIES_RESOURCE),
                "Missing classpath resource: " + PROPERTIES_RESOURCE)) {
            loaded.load(resource);
        }
        return loaded;
    }

    /**
     * Adds a stream of data to the bag.
     *
     * @param file The stream representing a file or data that will be placed in the bag
     * @param bagPath The path, relative to the bag root where the file belongs
     * @param checksum A MessageDigest object that will hold the checksum
     * @param isTagFile Boolean set to True when the file is a tag file
     * @throws SpeedBagException If a file of the same kind (tag or data) already uses bagPath
     */
    public void addFile(InputStream file, String bagPath, MessageDigest checksum, boolean isTagFile)
            throws SpeedBagException {
        logger.debug(String.format("Adding %s to the bag", bagPath));
        // Check to see if there's a path conflict
        if (this.hasPathCollisions(bagPath, isTagFile)) {
            throw new SpeedBagException(
                    String.format("The tag file with path %s conflicts with another file.", bagPath)
            );
        }
        SpeedFile newFile = new SpeedFile(new SpeedStream(file, checksum), bagPath, isTagFile);
        if (isTagFile) {
            this.tagFiles.add(newFile);
        } else {
            this.dataFiles.add(newFile);
        }
    }

    /**
     * Checks whether a path collides with a previously added file of the same
     * type (tag files are only compared with tag files, data files with data files).
     *
     * @param path Path being checked against the previously added files
     * @param isTagFile True to compare against the tag files, false to compare against the data files
     * @return True when a file of the same type already uses the path
     */
    private boolean hasPathCollisions(String path, boolean isTagFile) {
        List<SpeedFile> sameKindFiles = isTagFile ? this.tagFiles : this.dataFiles;
        for (SpeedFile existing : sameKindFiles) {
            if (Objects.equals(existing.getPath(), path)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds a stream of data to the bag, checksummed with a fresh digest of the
     * bag's configured algorithm.
     *
     * @param file The stream representing a file or data that will be placed in the bag
     * @param bagPath The path, relative to the bag root where the file belongs
     * @param isTagFile Boolean set to True when the file is a tag file
     * @throws NoSuchAlgorithmException If the bag's checksum algorithm is not supported
     * @throws SpeedBagException If a file of the same kind (tag or data) already uses bagPath
     */
    public void addFile(InputStream file, String bagPath, boolean isTagFile)
            throws NoSuchAlgorithmException, SpeedBagException {
        MessageDigest newDigest = MessageDigest.getInstance(this.checksumAlgorithm);
        this.addFile(file, bagPath, newDigest, isTagFile);
    }

    /**
     * Generates a bagit.txt file. The user supplied metadata entries are written
     * first, followed by the version line and the character encoding line.
     *
     * @return A string representing the bagit.txt file.
     */
    public String generateBagitTxt() {
        logger.debug("Creating the bagit.txt file");
        StringBuilder bagitFile = new StringBuilder();
        // The field is public, so guard against it having been reset to null
        if (this.bagitMetadata != null) {
            for (Map.Entry<String, String> entry : this.bagitMetadata.entrySet()) {
                appendTagLine(bagitFile, entry.getKey(), entry.getValue());
            }
        }
        appendTagLine(bagitFile,
                this.properties.getProperty("tag.file.version"),
                String.valueOf(version));
        appendTagLine(bagitFile,
                this.properties.getProperty("tag.file.character.encoding.name"),
                this.properties.getProperty("tag.file.character.encoding.value"));
        return bagitFile.toString();
    }

    /**
     * Appends a single "key: value" line, terminated by a line feed.
     *
     * @param target The builder receiving the line
     * @param key The tag name
     * @param value The tag value
     */
    private static void appendTagLine(StringBuilder target, String key, String value) {
        target.append(key).append(": ").append(value).append('\n');
    }

    /**
     * Takes a size and returns B, KB, MB, GB, etc. Taken from
     * https://stackoverflow.com/questions/3758606/how-can-i-convert-byte-size-into-a-human-readable-format-in-java
     *
     * @param size The size being converted, in bytes
     * @return The size as 5 B, 5.0 KB, 5.0 GB, etc
     */
    public static String formatSize(long size) {
        if (size < 1024) {
            return size + " B";
        }
        // Index of the binary unit: 1 = K, 2 = M, ... (each unit spans 10 bits)
        int unitIndex = (63 - Long.numberOfLeadingZeros(size)) / 10;
        // Locale.ROOT keeps the decimal separator a '.' regardless of the JVM's default locale
        return String.format(Locale.ROOT, "%.1f %sB",
                (double) size / (1L << (unitIndex * 10)), " KMGTPE".charAt(unitIndex));
    }

    /**
     * Generates the bag-info.txt file contents.
     *
     * @param payloadOxum The payload oxum of the bag
     * @param bagSize The size of the bag, in bytes
     * @return A text string with the file contents
     */
    public String generateBagInfoTxt(String payloadOxum, int bagSize) {
        return generateBagInfoTxt(payloadOxum, (long) bagSize);
    }

    /**
     * Generates the bag-info.txt file contents: the bagging date, the payload
     * oxum and the human readable bag size, one per line.
     *
     * @param payloadOxum The payload oxum of the bag
     * @param bagSize The size of the bag, in bytes; a long so that bags over 2 GiB are reported correctly
     * @return A text string with the file contents
     */
    public String generateBagInfoTxt(String payloadOxum, long bagSize) {
        logger.debug("Generating bag-info.txt");
        DateTimeFormatter dateFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
        StringBuilder bagInfo = new StringBuilder();
        appendTagLine(bagInfo,
                this.properties.getProperty("bag.info.date"),
                dateFormat.format(LocalDateTime.now()));
        appendTagLine(bagInfo,
                this.properties.getProperty("bag.info.payloadOxum"),
                payloadOxum);
        appendTagLine(bagInfo,
                this.properties.getProperty("bag.info.bagSize"),
                formatSize(bagSize));
        return bagInfo.toString();
    }

    /**
     * Records a tag file in the tag manifest. The entry is later rendered by
     * {@link #bagFileToString(Map)} as a "checksum path" line.
     *
     * NOTE(review): the map is keyed by checksum, so a second file with the same
     * checksum replaces the earlier entry.
     *
     * @param path The path to the file, relative to the bag root
     * @param checksum The checksum of the file
     */
    public void writeToTagManifest(String path, String checksum) {
        logger.debug(String.format("Writing line to the tag-manifest %s %s", path, checksum));
        tagManifestFile.put(checksum, path);
    }

    /**
     * Records a data file in the data manifest. The entry is later rendered by
     * {@link #bagFileToString(Map)} as a "checksum path" line.
     *
     * NOTE(review): the map is keyed by checksum, so a second file with the same
     * checksum replaces the earlier entry.
     *
     * @param path The path to the file, relative to the bag root
     * @param checksum The checksum of the file
     */
    public void writeToDataManifest(String path, String checksum) {
        logger.debug(String.format("Writing line to the data manifest %s %s", path, checksum));
        dataManifestFile.put(checksum, path);
    }

    /**
     * Streams an individual file into its own zip entry.
     *
     * @param zos The output stream that the file is being written to
     * @param streamingFile The file stream that's being written to the output stream
     * @throws IOException If the entry cannot be created or the copy fails
     */
    private void streamFile(ZipOutputStream zos, SpeedFile streamingFile) throws IOException {
        zos.putNextEntry(new ZipEntry(streamingFile.getPath()));
        try {
            IOUtils.copy(streamingFile.getStream(), zos);
        } finally {
            zos.closeEntry();
        }
    }

    /**
     * Writes the files to a stream under the BagIt specification. The manifest, bagit.txt,
     * and bag-info.txt are generated inside. The zip is produced on the shared executor
     * and handed to the caller through a pipe, so the returned stream must be read for
     * the writer to make progress.
     *
     * If writing fails, the error is logged and the pipe is closed without finishing the
     * zip, so the reader reaches end-of-stream on a truncated archive instead of blocking.
     *
     * The generated tag files are added to this bag as a side effect, so a second call
     * on the same instance fails when those paths are added again.
     *
     * @return The read end of the pipe carrying the zipped bag
     * @throws IOException Throws when something went wrong with streaming the bag
     * @throws NoSuchAlgorithmException Thrown when an unsupported checksum algorithm is used
     */
    public InputStream stream()
            throws IOException, NoSuchAlgorithmException {
        PipedOutputStream ps = new PipedOutputStream();
        PipedInputStream is = new PipedInputStream(ps);
        ZipOutputStream zos = new ZipOutputStream(ps);
        executor.execute(() -> writeBag(ps, zos));
        return is;
    }

    /**
     * Writes the whole bag to the zip stream: data files, generated tag files,
     * previously added tag files and finally the tag manifest.
     *
     * @param pipe The write end of the pipe that backs the zip stream
     * @param zos The zip stream receiving the bag
     */
    private void writeBag(PipedOutputStream pipe, ZipOutputStream zos) {
        boolean completed = false;
        try {
            logger.info(String.format("Streaming bag at %s",
                    LOG_TIMESTAMP_FORMAT.format(LocalDateTime.now())));

            // Stream all of the files in the root 'data' directory
            long totalSize = streamDataFiles(zos);
            String payloadOxum = String.format("%s.%s", totalSize, dataFiles.size());

            // Generate and add the bagit.txt file
            addGeneratedTagFile(properties.getProperty("bagit.file.name"), generateBagitTxt());
            // Generate and add the bag-info.txt file
            addGeneratedTagFile(properties.getProperty("bag.info.file.name"),
                    generateBagInfoTxt(payloadOxum, totalSize));

            // BagIt requires checksum filenames to be lower cased and without dashes.
            // Locale.ROOT avoids locale specific case mappings (e.g. the Turkish dotless i).
            String sanitizedChecksum = checksumAlgorithm.toLowerCase(Locale.ROOT)
                    .replaceAll("[^A-Za-z0-9]", "");

            // Generate and add the data manifest file
            addGeneratedTagFile(String.format("manifest-%s.txt", sanitizedChecksum),
                    bagFileToString(dataManifestFile));

            // Write all of the tag files
            for (SpeedFile streamingFile : tagFiles) {
                try {
                    streamFile(zos, streamingFile);
                    writeToTagManifest(streamingFile.getPath(), streamingFile.getStream().getChecksum());
                } finally {
                    streamingFile.getStream().close();
                }
            }

            // Create the tag manifest and stream it; it is not listed in any manifest itself
            InputStream tagManifestBytes = new ByteArrayInputStream(
                    bagFileToString(tagManifestFile).getBytes(StandardCharsets.UTF_8));
            SpeedFile tagManifestStreamFile = new SpeedFile(
                    new SpeedStream(tagManifestBytes, MessageDigest.getInstance(checksumAlgorithm)),
                    String.format("tagmanifest-%s.txt", sanitizedChecksum),
                    true);
            try {
                streamFile(zos, tagManifestStreamFile);
            } finally {
                tagManifestStreamFile.getStream().close();
            }

            zos.close();
            completed = true;
            logger.info(String.format("Finished streaming bag at %s",
                    LOG_TIMESTAMP_FORMAT.format(LocalDateTime.now())));
        } catch (Exception e) {
            // Boundary of the background task: nothing upstream can receive the exception
            logger.error("Failed to stream the bag", e);
        } finally {
            if (!completed) {
                abortStream(pipe, zos);
            }
        }
    }

    /**
     * Streams every data file into the zip, recording each in the data manifest.
     *
     * @param zos The zip stream receiving the files
     * @return The summed size of the streamed data files
     * @throws IOException If a file cannot be written
     */
    private long streamDataFiles(ZipOutputStream zos) throws IOException {
        // A long, so payloads above Integer.MAX_VALUE bytes do not overflow
        long totalSize = 0;
        for (SpeedFile streamingFile : dataFiles) {
            try {
                streamFile(zos, streamingFile);
                writeToDataManifest(streamingFile.getPath(), streamingFile.getStream().getChecksum());
                totalSize += streamingFile.getStream().getSize();
            } finally {
                streamingFile.getStream().close();
            }
        }
        return totalSize;
    }

    /**
     * Adds generated text as a UTF-8 encoded tag file.
     *
     * @param bagPath The path of the tag file, relative to the bag root
     * @param contents The text of the tag file
     * @throws NoSuchAlgorithmException If the bag's checksum algorithm is not supported
     * @throws SpeedBagException If another tag file already uses bagPath
     */
    private void addGeneratedTagFile(String bagPath, String contents)
            throws NoSuchAlgorithmException, SpeedBagException {
        InputStream contentStream = new ByteArrayInputStream(contents.getBytes(StandardCharsets.UTF_8));
        addFile(contentStream, bagPath, MessageDigest.getInstance(checksumAlgorithm), true);
    }

    /**
     * Ends a failed stream. The pipe is closed first so the reader is released
     * without the zip being finished (a finished but incomplete archive would look
     * valid); the zip stream is then closed on a best-effort basis.
     *
     * @param pipe The write end of the pipe
     * @param zos The zip stream wrapped around the pipe
     */
    private static void abortStream(PipedOutputStream pipe, ZipOutputStream zos) {
        try {
            pipe.close();
        } catch (IOException e) {
            logger.warn("Unable to close the bag pipe after a failure", e);
        }
        try {
            zos.close();
        } catch (IOException ignored) {
            // Expected: finishing the zip writes to the pipe that was just closed
        }
    }

    /**
     * Returns the number of data files in the bag
     *
     * @return The number of data files
     */
    public int getPayloadFileCount() {
        return this.dataFiles.size();
    }

    /**
     * Returns all of the tag files that have been added to
     * the bag. The returned list is the bag's own list, not a copy.
     *
     * @return List of tag files
     */
    public List<SpeedFile> getTagFiles() {
        return this.tagFiles;
    }

    /**
     * Returns a list of the data files that have been added
     * to the bag. These are the files that belong under data/.
     * The returned list is the bag's own list, not a copy.
     *
     * @return List of data files
     */
    public List<SpeedFile> getDataFiles() {
        return this.dataFiles;
    }

    /**
     * Renders a manifest map as text: one "key value" line per entry, each
     * terminated by the platform line separator.
     *
     * @param mapFile The manifest entries
     * @return The manifest text; empty when the map has no entries
     */
    public static String bagFileToString(Map<String, String> mapFile) {
        String lineSeparator = System.getProperty("line.separator");
        StringBuilder builder = new StringBuilder();
        for (Map.Entry<String, String> line : mapFile.entrySet()) {
            builder.append(line.getKey())
                    .append(' ')
                    .append(line.getValue())
                    .append(lineSeparator);
        }
        return builder.toString();
    }
}