Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DURACLOUD-1268: refactor the RetrievalTool to create full list of contentIds, including chunked content, when a list-file is used #143

Merged
merged 6 commits into from
Jul 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions retrievaltool/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
<version>${project.version}</version>
</dependency>

<!-- DuraCloud manifest module (org.duracloud:manifest). -->
<!-- NOTE(review): the version is hard-coded here, while the neighbouring dependency uses ${project.version}; confirm whether this one should follow the project version as well. -->
<dependency>
<groupId>org.duracloud</groupId>
<artifactId>manifest</artifactId>
<version>7.1.0-SNAPSHOT</version>
</dependency>

<dependency>
<groupId>org.duracloud</groupId>
<artifactId>storeclient</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,30 +49,38 @@ public DuraStoreRetrievalSource(ContentStore store,
}
}

this.spaceIds = verifySpaceIds(spaces);
}

protected Iterator<String> verifySpaceIds(List<String> spaces) throws RuntimeException {
Iterator<String> verifiedSpaceIds = null;

if (spaces != null && spaces.size() > 0) {
try {
// check if provided spaces exist
List<String> spaceList = store.getSpaces();
List<String> nonExistantSpaces = new ArrayList<String>();
List<String> spaceList = this.contentStore.getSpaces();
List<String> nonExistentSpaces = new ArrayList<String>();
for (String space : spaces) {
if (!spaceList.contains(space)) {
nonExistantSpaces.add(space);
nonExistentSpaces.add(space);
}
}
if (!nonExistantSpaces.isEmpty()) {
if (!nonExistentSpaces.isEmpty()) {
String error = "The following provided spaces do not exist: " +
StringUtils.join(nonExistantSpaces, ", ");
StringUtils.join(nonExistentSpaces, ", ");
throw new DuraCloudRuntimeException(error);
}

spaceIds = spaces.iterator();
verifiedSpaceIds = spaces.iterator();
} catch (ContentStoreException cse) {
throw new DuraCloudRuntimeException("Error retrieving spaces list", cse);
}
} else {
throw new RuntimeException("Spaces list is empty, there is " +
"no content to retrieve");
}

return verifiedSpaceIds;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,25 @@
*/
package org.duracloud.retrieval.source;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.duracloud.chunk.manifest.ChunksManifest;
import org.duracloud.chunk.util.ChunkUtil;
import org.duracloud.client.ContentStore;
import org.duracloud.common.constant.ManifestFormat;
import org.duracloud.common.error.DuraCloudRuntimeException;
import org.duracloud.common.model.ContentItem;
import org.duracloud.domain.Content;
import org.duracloud.error.ContentStoreException;
import org.duracloud.retrieval.mgmt.RetrievalListener;
import org.duracloud.stitch.error.MissingContentException;
import org.duracloud.manifest.ManifestFormatter;
import org.duracloud.manifest.impl.ManifestFormatterFactory;
import org.duracloud.mill.db.model.ManifestItem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -39,6 +47,8 @@ public class DuraStoreSpecifiedRetrievalSource extends DuraStoreStitchingRetriev

private Iterator<String> specifiedContentIds;

private ChunkUtil chunkUtil = null;

public DuraStoreSpecifiedRetrievalSource(ContentStore store,
List<String> singleSpaceList,
Iterator<String> specifiedContentIds) {
Expand All @@ -54,7 +64,99 @@ public DuraStoreSpecifiedRetrievalSource(ContentStore store,
"DuraStoreSpecifiedRetrievelSource must contain only 1 space ID.");
}

this.chunkUtil = new ChunkUtil();
this.specifiedContentIds = specifiedContentIds;
this.reviewSpecifiedContentIdsForChunkedContent(singleSpaceList);
}

/**
 * Rebuilds {@code specifiedContentIds} after comparing the content IDs from
 * the list-file against the TSV manifest of the retrieval space.
 * <p>
 * A list-file content ID that appears in the space only as a chunk manifest
 * (as decided by {@code chunkUtil}) is replaced by the chunk manifest's
 * content ID. IDs found directly in the space manifest are kept unchanged.
 * IDs not found at all are also kept, but a warning is logged.
 * <p>
 * Failures to fetch or read the space manifest are logged and not rethrown;
 * in that case the list-file IDs are passed through as they are.
 *
 * @param singleSpaceList list holding the ID of the space to retrieve from;
 *                        only the first entry is examined
 */
private void reviewSpecifiedContentIdsForChunkedContent(List<String> singleSpaceList) {
    log.debug("enter reviewSpecifiedContentIdsForChunkedContent()");
    System.out.println("Reviewing space manifest for content IDs in list-file.");

    // Drain the caller-supplied iterator so the IDs can be re-examined.
    List<String> retrievalContentIds = new ArrayList<String>();
    while (specifiedContentIds.hasNext()) {
        retrievalContentIds.add(specifiedContentIds.next());
    }
    log.debug("total contentIds in list-file: " + retrievalContentIds.size());

    // Maps a content ID to the ID of its chunk manifest, or to null when the
    // content ID itself is listed in the space manifest.
    Map<String, String> retrievalSpaceContentIds = new HashMap<String, String>();

    Iterator<String> retrievalSpaceIds = verifySpaceIds(singleSpaceList);
    if (retrievalSpaceIds.hasNext()) {
        String currentRetrievalSpaceId = retrievalSpaceIds.next();
        log.debug("searching for contentIds in space: " + currentRetrievalSpaceId);
        try {
            InputStream manifest = contentStore.getManifest(currentRetrievalSpaceId, ManifestFormat.TSV);
            ManifestFormatter formatter = new ManifestFormatterFactory().create(ManifestFormat.TSV);
            String header = formatter.getHeader();

            // try-with-resources: closing the reader also closes the
            // underlying manifest stream, even when reading fails part-way.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(manifest))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // ignore any whitespace
                    if (line.trim().length() == 0) {
                        continue;
                    }

                    // ignore header line
                    if (line.equals(header)) {
                        continue;
                    }

                    ManifestItem item;
                    try {
                        item = formatter.parseLine(line);
                    } catch (ParseException e) {
                        // funnel parse failures into the read-error handling below
                        throw new IOException(e);
                    }

                    String spaceContentId = item.getContentId();

                    // check if spaceContentId is for chunk manifest
                    if (chunkUtil.isChunkManifest(spaceContentId)) {
                        String rootContentId = chunkUtil.preChunkedContentId(spaceContentId);
                        log.debug("found chunk manifest for contentId from list-file: " + spaceContentId);
                        retrievalSpaceContentIds.put(rootContentId, spaceContentId);
                    } else {
                        retrievalSpaceContentIds.put(spaceContentId, null);
                    }
                }
            } catch (IOException ex) {
                // best effort: keep whatever was read so far, but record the cause
                log.error("Error reading space manifest.", ex);
            }
        } catch (ContentStoreException cse) {
            log.error("Unable to retrieve space manifest. If files-list.txt contains chunked files and the " +
                      "retrieval fails the local content dir will need to be empty.", cse);
        }
    }

    // check if contentIds in list-file are in space manifest, potentially as chunk manifests
    List<String> retrievalContentIdsFinal = new ArrayList<String>();
    for (String retrievalContentId : retrievalContentIds) {
        String chunkManifestContentId = retrievalSpaceContentIds.get(retrievalContentId);
        if (chunkManifestContentId != null) {
            // add chunk-manifest contentId
            log.debug("replacing {} with chunk manifest {} in list of contentIds to retrieve.",
                      retrievalContentId, chunkManifestContentId);
            retrievalContentIdsFinal.add(chunkManifestContentId);
        } else {
            // a null value means either "listed as-is in the manifest" or
            // "not listed at all"; only the latter deserves a warning
            if (!retrievalSpaceContentIds.containsKey(retrievalContentId)) {
                log.warn("Content ID {} in list-file is not present in the retrieval space manifest.",
                         retrievalContentId);
            }
            retrievalContentIdsFinal.add(retrievalContentId);
        }
    }

    System.out.println("Finished reviewing space manifest for contentIDs in list-file.\n");
    this.specifiedContentIds = retrievalContentIdsFinal.iterator();
}

@Override
Expand All @@ -64,25 +166,4 @@ protected void getNextSpace() {
currentContentList = specifiedContentIds;
}
}

/**
 * Fetches the content for the given item directly from the content store.
 * If the store reports an error, a second attempt is made using the item's
 * content ID with the chunk manifest suffix appended, via
 * {@code doGetContentFromManifest}.
 *
 * @param item     the space ID / content ID pair to retrieve
 * @param listener passed through to the manifest-based retrieval
 * @return the retrieved content
 */
@Override
protected Content doGetContent(ContentItem item, RetrievalListener listener) {
    Content content;
    try {
        content = contentStore.getContent(item.getSpaceId(), item.getContentId());
    } catch (ContentStoreException storeError) {
        log.info("Error retrieving content ID: " + item.getContentId() +
                 ". Trying to get this content again by checking for " +
                 "a chunk manifest for this content ID.");
        // Build the item that addresses the chunk manifest of the requested content.
        ContentItem chunkManifestItem =
            new ContentItem(item.getSpaceId(),
                            item.getContentId() + ChunksManifest.manifestSuffix);
        // A MissingContentException raised here simply propagates to the caller.
        content = doGetContentFromManifest(chunkManifestItem, listener);
    }
    return content;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,28 @@
package org.duracloud.retrieval.source;

import static org.duracloud.retrieval.source.DuraStoreSpecifiedRetrievalSourceTest.ContentType.BASIC;

import static org.duracloud.retrieval.source.DuraStoreSpecifiedRetrievalSourceTest.ContentType.CHUNK;
import static org.duracloud.retrieval.source.DuraStoreSpecifiedRetrievalSourceTest.ContentType.MANIFEST;
import static org.easymock.EasyMock.expect;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import junit.framework.AssertionFailedError;
import org.duracloud.chunk.manifest.ChunksManifest;
import org.duracloud.client.ContentStore;
import org.duracloud.common.constant.ManifestFormat;
import org.duracloud.common.error.DuraCloudRuntimeException;
import org.duracloud.common.model.ContentItem;
import org.duracloud.manifest.ManifestFormatter;
import org.duracloud.manifest.impl.TsvManifestFormatter;
import org.duracloud.mill.db.model.ManifestItem;
import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.Before;
Expand All @@ -32,6 +45,8 @@ public class DuraStoreSpecifiedRetrievalSourceTest {
private ContentStore store;
private List<String> spaces;
private List<String> specifiedContentIds;
private List<String> manifestContentIds;
private List<String> verifiedContentIds;

private final static String spaceId0 = "space-0";
private final static String spaceId1 = "space-1";
Expand All @@ -41,13 +56,46 @@ public class DuraStoreSpecifiedRetrievalSourceTest {
// Builds the fixtures shared by the tests: a mocked ContentStore, the content
// IDs named in the list-file, a temporary TSV space manifest, and the content
// IDs the retrieval source is expected to hand out.
public void setUp() throws Exception {
    spaces = new ArrayList<String>();
    spaces.add(spaceId0);

    store = EasyMock.createMock("ContentStore", ContentStore.class);
    // NOTE(review): two lookups are expected, presumably because the space
    // list is verified both by the base source and again while the list-file
    // is reviewed against the manifest; confirm against the constructors.
    EasyMock.expect(store.getSpaces()).andReturn(spaces).times(2);

    specifiedContentIds = new ArrayList<String>();
    specifiedContentIds.add(BASIC.getContentId(0));
    specifiedContentIds.add(BASIC.getContentId(1));
    specifiedContentIds.add(BASIC.getContentId(2));
    specifiedContentIds.add(BASIC.getContentId(4));

    manifestContentIds = new ArrayList<String>();
    manifestContentIds.add(BASIC.getContentId(0));
    manifestContentIds.add(BASIC.getContentId(1));
    manifestContentIds.add(BASIC.getContentId(2));
    // NOTE(review): the same CHUNK id is added twice; confirm this is intentional.
    manifestContentIds.add(CHUNK.getContentId(2));
    manifestContentIds.add(CHUNK.getContentId(2));
    manifestContentIds.add(MANIFEST.getContentId(2));
    manifestContentIds.add(BASIC.getContentId(4));

    TsvManifestFormatter formatter = new TsvManifestFormatter();
    File unstitchedManifest = File.createTempFile("unstitched", "tsv");
    unstitchedManifest.deleteOnExit();

    // try-with-resources so the writer is flushed and closed even if a write fails
    try (BufferedWriter writer =
             new BufferedWriter(new OutputStreamWriter(new FileOutputStream(unstitchedManifest)))) {
        writer.write(formatter.getHeader() + "\n");

        for (String manifestContentId : manifestContentIds) {
            write(writer, formatter, spaceId0, manifestContentId);
        }
    }

    expect(store.getManifest(spaceId0, ManifestFormat.TSV)).andReturn(new FileInputStream(unstitchedManifest));

    verifiedContentIds = new ArrayList<String>();
    verifiedContentIds.add(BASIC.getContentId(0));
    verifiedContentIds.add(BASIC.getContentId(1));
    verifiedContentIds.add(MANIFEST.getContentId(2));
    verifiedContentIds.add(BASIC.getContentId(4));
}

@Test
Expand All @@ -63,7 +111,7 @@ public void testGetNextContentItem() throws Exception {
retrievalSource = new DuraStoreSpecifiedRetrievalSource(store,
spaces,
specifiedContentIds.iterator());
verifyContents(spaceId0, specifiedContentIds);
verifyContents(spaceId0, verifiedContentIds);

ContentItem item = retrievalSource.getNextContentItem();
Assert.assertNull(item);
Expand All @@ -82,24 +130,35 @@ public void testSpecifiedRetrievalWithMultipleSpaces() {
specifiedContentIds.iterator());
}

private void verifyContents(String spaceId, List<String> specifiedContentIds) {
private void verifyContents(String spaceId, List<String> verifiedContentIds) {
ContentItem item = null;
List<ContentItem> retrievedItems = new ArrayList<ContentItem>();
int i = 0;
while ((item = retrievalSource.getNextContentItem()) != null) {
retrievedItems.add(item);
Assert.assertNotNull(item);
Assert.assertEquals(spaceId, item.getSpaceId());
Assert.assertEquals(specifiedContentIds.get(i), item.getContentId());
Assert.assertEquals(verifiedContentIds.get(i), item.getContentId());
i++;
}
Assert.assertEquals(specifiedContentIds.size(), retrievedItems.size());
Assert.assertEquals(verifiedContentIds.size(), retrievedItems.size());
}

/**
 * Puts the mocked content store into replay state so that the expectations
 * recorded on it take effect.
 */
private void replayMocks() {
    EasyMock.replay(this.store);
}

/**
 * Appends one manifest line for the given space and content ID to the writer.
 * The checksum is a fixed placeholder value.
 *
 * @param writer    destination of the formatted line
 * @param formatter formatter used to render the manifest item
 * @param spaceId   space ID recorded on the manifest item
 * @param contentId content ID recorded on the manifest item
 * @throws IOException if the line cannot be written
 */
private void write(BufferedWriter writer,
                   ManifestFormatter formatter,
                   String spaceId,
                   String contentId) throws IOException {
    ManifestItem manifestEntry = new ManifestItem();
    manifestEntry.setSpaceId(spaceId);
    manifestEntry.setContentId(contentId);
    manifestEntry.setContentChecksum("checksum-md5");

    String formattedLine = formatter.formatLine(manifestEntry);
    writer.write(formattedLine + "\n");
}

/**
* This inner class helps define types of test content items.
*/
Expand Down