Refactor the results assemblers
Refactor the multi-origin assembler and the individual result writer classes so they can be created during the regional analysis creation process, allowing the Broker and MultiOriginAssembler to be simpler and depend on fewer components.
trevorgerhardt committed Jan 29, 2023
1 parent c6bcd28 commit f53df35
Showing 16 changed files with 251 additions and 406 deletions.
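
In outline, the commit moves assembler construction out of the Broker: callers now build the Job and the MultiOriginAssembler themselves and hand both to the broker. A minimal sketch of the new wiring, using only constructors and signatures visible in the diff below (the EnqueueSketch wrapper class and its enqueue method are illustrative, not part of the commit):

import com.conveyal.analysis.components.broker.Broker;
import com.conveyal.analysis.components.broker.Job;
import com.conveyal.analysis.components.broker.WorkerTags;
import com.conveyal.analysis.models.RegionalAnalysis;
import com.conveyal.analysis.results.MultiOriginAssembler;
import com.conveyal.analysis.results.RegionalResultWriter;

import java.util.List;

class EnqueueSketch {
    /** Build the job and its result assembler up front, then hand both to the broker. */
    static void enqueue (Broker broker, RegionalAnalysis regionalAnalysis, List<RegionalResultWriter> writers) {
        // Template-task cloning and scenario stripping now happen inside Job's constructor.
        Job job = new Job(regionalAnalysis.request, WorkerTags.fromRegionalAnalysis(regionalAnalysis));
        // The assembler is created by the caller (e.g. RegionalAnalysisController), not the Broker.
        MultiOriginAssembler assembler = new MultiOriginAssembler(job, writers);
        // The Broker now only registers the pair and launches workers; it no longer needs FileStorage.
        broker.enqueueTasksForRegionalJob(job, assembler);
    }
}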
src/main/java/com/conveyal/analysis/components/LocalBackendComponents.java
@@ -34,7 +34,7 @@ public LocalBackendComponents () {
 authentication = new LocalAuthentication();
 // TODO add nested LocalWorkerComponents here, to reuse some components, and pass it into the LocalWorkerLauncher?
 workerLauncher = new LocalWorkerLauncher(config, fileStorage, gtfsCache, osmCache);
-broker = new Broker(config, fileStorage, eventBus, workerLauncher);
+broker = new Broker(config, eventBus, workerLauncher);
 censusExtractor = new SeamlessCensusGridExtractor(config);
 // Instantiate the HttpControllers last, when all the components except the HttpApi are already created.
 List<HttpController> httpControllers = standardHttpControllers();
83 changes: 9 additions & 74 deletions src/main/java/com/conveyal/analysis/components/broker/Broker.java
@@ -6,17 +6,11 @@
 import com.conveyal.analysis.components.eventbus.EventBus;
 import com.conveyal.analysis.components.eventbus.RegionalAnalysisEvent;
 import com.conveyal.analysis.components.eventbus.WorkerEvent;
-import com.conveyal.analysis.models.RegionalAnalysis;
 import com.conveyal.analysis.results.MultiOriginAssembler;
-import com.conveyal.analysis.util.JsonUtil;
-import com.conveyal.file.FileStorage;
-import com.conveyal.file.FileStorageKey;
-import com.conveyal.file.FileUtils;
 import com.conveyal.r5.analyst.WorkerCategory;
 import com.conveyal.r5.analyst.cluster.RegionalTask;
 import com.conveyal.r5.analyst.cluster.RegionalWorkResult;
 import com.conveyal.r5.analyst.cluster.WorkerStatus;
-import com.conveyal.r5.analyst.scenario.Scenario;
 import com.conveyal.r5.util.ExceptionUtils;
 import com.google.common.collect.ListMultimap;
 import com.google.common.collect.MultimapBuilder;
@@ -27,8 +21,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -42,7 +34,6 @@
 import static com.conveyal.analysis.components.eventbus.WorkerEvent.Action.REQUESTED;
 import static com.conveyal.analysis.components.eventbus.WorkerEvent.Role.REGIONAL;
 import static com.conveyal.analysis.components.eventbus.WorkerEvent.Role.SINGLE_POINT;
-import static com.conveyal.file.FileCategory.BUNDLES;
 import static com.google.common.base.Preconditions.checkNotNull;
 
 /**
@@ -93,7 +84,6 @@ public interface Config {
 private Config config;
 
 // Component Dependencies
-private final FileStorage fileStorage;
 private final EventBus eventBus;
 private final WorkerLauncher workerLauncher;
 
@@ -143,9 +133,8 @@ public interface Config {
 public TObjectLongMap<WorkerCategory> recentlyRequestedWorkers =
 TCollections.synchronizedMap(new TObjectLongHashMap<>());
 
-public Broker (Config config, FileStorage fileStorage, EventBus eventBus, WorkerLauncher workerLauncher) {
+public Broker (Config config, EventBus eventBus, WorkerLauncher workerLauncher) {
 this.config = config;
-this.fileStorage = fileStorage;
 this.eventBus = eventBus;
 this.workerLauncher = workerLauncher;
 }
@@ -154,83 +143,29 @@ public Broker (Config config, FileStorage fileStorage, EventBus eventBus, Worker
 * Enqueue a set of tasks for a regional analysis.
 * Only a single task is passed in, which the broker will expand into all the individual tasks for a regional job.
 */
-public synchronized void enqueueTasksForRegionalJob (RegionalAnalysis regionalAnalysis) {
-
-// Make a copy of the regional task inside the RegionalAnalysis, replacing the scenario with a scenario ID.
-RegionalTask templateTask = templateTaskFromRegionalAnalysis(regionalAnalysis);
-
-LOG.info("Enqueuing tasks for job {} using template task.", templateTask.jobId);
-if (findJob(templateTask.jobId) != null) {
-LOG.error("Someone tried to enqueue job {} but it already exists.", templateTask.jobId);
-throw new RuntimeException("Enqueued duplicate job " + templateTask.jobId);
+public synchronized void enqueueTasksForRegionalJob (Job job, MultiOriginAssembler assembler) {
+LOG.info("Enqueuing tasks for job {} using template task.", job.jobId);
+if (findJob(job.jobId) != null) {
+LOG.error("Someone tried to enqueue job {} but it already exists.", job.jobId);
+throw new RuntimeException("Enqueued duplicate job " + job.jobId);
 }
-WorkerTags workerTags = WorkerTags.fromRegionalAnalysis(regionalAnalysis);
-Job job = new Job(templateTask, workerTags);
 jobs.put(job.workerCategory, job);
 
 // Register the regional job so results received from multiple workers can be assembled into one file.
-// TODO encapsulate MultiOriginAssemblers in a new Component
 // Note: if this fails with an exception we'll have a job enqueued, possibly being processed, with no assembler.
 // That is not catastrophic, but the user may need to recognize and delete the stalled regional job.
-MultiOriginAssembler assembler = new MultiOriginAssembler(regionalAnalysis, job, fileStorage);
-resultAssemblers.put(templateTask.jobId, assembler);
+resultAssemblers.put(job.jobId, assembler);
 
 if (config.testTaskRedelivery()) {
 // This is a fake job for testing, don't confuse the worker startup code below with null graph ID.
 return;
 }
 
 if (workerCatalog.noWorkersAvailable(job.workerCategory, config.offline())) {
-createOnDemandWorkerInCategory(job.workerCategory, workerTags);
+createOnDemandWorkerInCategory(job.workerCategory, job.workerTags);
 } else {
 // Workers exist in this category, clear out any record that we're waiting for one to start up.
 recentlyRequestedWorkers.remove(job.workerCategory);
 }
-eventBus.send(new RegionalAnalysisEvent(templateTask.jobId, STARTED).forUser(workerTags.user, workerTags.group));
-}
-
-/**
- * The single RegionalTask object represents a lot of individual accessibility tasks at many different origin
- * points, typically on a grid. Before passing that RegionalTask on to the Broker (which distributes tasks to
- * workers and tracks progress), we remove the details of the scenario, substituting the scenario's unique ID
- * to save time and bandwidth. This avoids repeatedly sending the scenario details to the worker in every task,
- * as they are often quite voluminous. The workers will fetch the scenario once from S3 and cache it based on
- * its ID only. We protectively clone this task because we're going to null out its scenario field, and don't
- * want to affect the original object which contains all the scenario details.
- * TODO Why is all this detail added after the Persistence call?
- * We don't want to store all the details added below in Mongo?
- */
-private RegionalTask templateTaskFromRegionalAnalysis (RegionalAnalysis regionalAnalysis) {
-RegionalTask templateTask = regionalAnalysis.request.clone();
-// First replace the inline scenario with a scenario ID, storing the scenario for retrieval by workers.
-Scenario scenario = templateTask.scenario;
-templateTask.scenarioId = scenario.id;
-// Null out the scenario in the template task, avoiding repeated serialization to the workers as massive JSON.
-templateTask.scenario = null;
-String fileName = String.format("%s_%s.json", regionalAnalysis.bundleId, scenario.id);
-FileStorageKey fileStorageKey = new FileStorageKey(BUNDLES, fileName);
-try {
-File localScenario = FileUtils.createScratchFile("json");
-JsonUtil.objectMapper.writeValue(localScenario, scenario);
-// FIXME this is using a network service in a method called from a synchronized broker method.
-// Move file into storage before entering the synchronized block.
-fileStorage.moveIntoStorage(fileStorageKey, localScenario);
-} catch (IOException e) {
-LOG.error("Error storing scenario for retrieval by workers.", e);
-}
-// Fill in all the fields in the template task that will remain the same across all tasks in a job.
-// I am not sure why we are re-setting all these fields, it seems like they are already set when the task is
-// initialized by AnalysisRequest.populateTask. But we'd want to thoroughly check that assumption before
-// eliminating or moving these lines.
-templateTask.jobId = regionalAnalysis._id;
-templateTask.graphId = regionalAnalysis.bundleId;
-templateTask.workerVersion = regionalAnalysis.workerVersion;
-templateTask.height = regionalAnalysis.height;
-templateTask.width = regionalAnalysis.width;
-templateTask.north = regionalAnalysis.north;
-templateTask.west = regionalAnalysis.west;
-templateTask.zoom = regionalAnalysis.zoom;
-return templateTask;
+eventBus.send(new RegionalAnalysisEvent(job.jobId, STARTED).forUser(job.workerTags.user, job.workerTags.group));
 }
 
 /**
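One behavior the slimmed-down method keeps is the duplicate-job guard. A hedged usage sketch (the DuplicateJobSketch wrapper is illustrative; only the enqueueTasksForRegionalJob signature comes from the diff above):

import com.conveyal.analysis.components.broker.Broker;
import com.conveyal.analysis.components.broker.Job;
import com.conveyal.analysis.results.MultiOriginAssembler;

class DuplicateJobSketch {
    /** Enqueueing the same jobId twice fails fast; the first registration, including its assembler, stays in place. */
    static void demonstrate (Broker broker, Job job, MultiOriginAssembler assembler) {
        broker.enqueueTasksForRegionalJob(job, assembler);
        try {
            broker.enqueueTasksForRegionalJob(job, assembler); // same jobId is already registered
        } catch (RuntimeException expected) {
            // "Enqueued duplicate job <jobId>" -- no partial re-registration occurred
        }
    }
}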
43 changes: 25 additions & 18 deletions src/main/java/com/conveyal/analysis/components/broker/Job.java
@@ -13,7 +13,6 @@
 import java.util.Set;
 
 import static com.conveyal.r5.common.Util.notNullOrEmpty;
-import static com.google.common.base.Preconditions.checkNotNull;
 
 /**
 * A Job is a collection of tasks that represent all the origins in a regional analysis. All the
@@ -61,13 +60,13 @@ public class Job {
 * The number of remaining tasks can be derived from the deliveredTasks BitSet, but as an
 * optimization we keep a separate counter to avoid constantly scanning over that whole bitset.
 */
-protected int nTasksCompleted;
+protected int nTasksCompleted = 0;
 
 /**
 * The total number of task deliveries that have occurred. A task may be counted more than
 * once if it is redelivered.
 */
-protected int nTasksDelivered;
+protected int nTasksDelivered = 0;
 
 /** Every task in this job will be based on this template task, but have its origin coordinates changed. */
 public final RegionalTask templateTask;
@@ -128,23 +127,31 @@ private RegionalTask makeOneTask (int taskNumber) {
 */
 public final Set<String> errors = new HashSet();
 
-public Job (RegionalTask templateTask, WorkerTags workerTags) {
-this.jobId = templateTask.jobId;
-this.templateTask = templateTask;
-this.workerCategory = new WorkerCategory(templateTask.graphId, templateTask.workerVersion);
-this.nTasksCompleted = 0;
-this.nextTaskToDeliver = 0;
-
-if (templateTask.originPointSetKey != null) {
-checkNotNull(templateTask.originPointSet);
-this.nTasksTotal = templateTask.originPointSet.featureCount();
-} else {
-this.nTasksTotal = templateTask.width * templateTask.height;
-}
-
-this.completedTasks = new BitSet(nTasksTotal);
+public Job (RegionalTask task, WorkerTags workerTags) {
+templateTask = templateTaskFromRegionalTask(task);
+jobId = task.jobId;
+workerCategory = new WorkerCategory(task.graphId, task.workerVersion);
+nTasksTotal = task.getTasksTotal();
+completedTasks = new BitSet(nTasksTotal);
 this.workerTags = workerTags;
 }
 
+/**
+ * The single RegionalTask object represents a lot of individual accessibility tasks at many different origin
+ * points, typically on a grid. Before passing that RegionalTask on to the Broker (which distributes tasks to
+ * workers and tracks progress), we remove the details of the scenario, substituting the scenario's unique ID
+ * to save time and bandwidth. This avoids repeatedly sending the scenario details to the worker in every task,
+ * as they are often quite voluminous. The workers will fetch the scenario once from S3 and cache it based on
+ * its ID only. We protectively clone this task because we're going to null out its scenario field, and don't
+ * want to affect the original object which contains all the scenario details.
+ */
+private static RegionalTask templateTaskFromRegionalTask(RegionalTask task) {
+RegionalTask templateTask = task.clone();
+// First replace the inline scenario with a scenario ID, storing the scenario for retrieval by workers.
+templateTask.scenarioId = templateTask.scenario.id;
+// Null out the scenario in the template task, avoiding repeated serialization to the workers as massive JSON.
+templateTask.scenario = null;
+return templateTask;
+}
 
 public boolean markTaskCompleted(int taskId) {
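The scenario-stripping invariant documented in the javadoc above can be spelled out in a short hedged sketch; the TemplateTaskSketch helper is illustrative and relies only on public fields shown in this diff (templateTask, scenario, scenarioId):

import com.conveyal.analysis.components.broker.Job;
import com.conveyal.analysis.components.broker.WorkerTags;
import com.conveyal.r5.analyst.cluster.RegionalTask;

class TemplateTaskSketch {
    /** Checks the invariant established by templateTaskFromRegionalTask inside Job's constructor. */
    static Job checkScenarioStripped (RegionalTask task, WorkerTags tags) {
        Job job = new Job(task, tags);
        // The template task carries only the scenario ID, so per-origin tasks stay small on the wire.
        assert job.templateTask.scenario == null;
        assert job.templateTask.scenarioId.equals(task.scenario.id);
        // The constructor cloned the incoming task, so the caller's full scenario is untouched.
        assert task.scenario != null;
        return job;
    }
}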
@@ -3,11 +3,13 @@
 import com.conveyal.analysis.components.BackendComponents;
 import com.conveyal.analysis.components.LocalBackendComponents;
 import com.conveyal.analysis.models.RegionalAnalysis;
+import com.conveyal.analysis.results.MultiOriginAssembler;
 import com.conveyal.r5.analyst.cluster.RegionalTask;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.UUID;
 
 /**
@@ -66,7 +68,9 @@ private static void sendFakeJob(Broker broker) {
 templateTask.scenarioId = "FAKE";
 RegionalAnalysis regionalAnalysis = new RegionalAnalysis();
 regionalAnalysis.request = templateTask;
-broker.enqueueTasksForRegionalJob(regionalAnalysis);
+var job = new Job(templateTask, WorkerTags.fromRegionalAnalysis(regionalAnalysis));
+var assembler = new MultiOriginAssembler(job, new ArrayList<>());
+broker.enqueueTasksForRegionalJob(job, assembler);
 }
 
 public static String compactUUID() {
src/main/java/com/conveyal/analysis/controllers/RegionalAnalysisController.java
@@ -4,12 +4,20 @@
 import com.conveyal.analysis.SelectingGridReducer;
 import com.conveyal.analysis.UserPermissions;
 import com.conveyal.analysis.components.broker.Broker;
+import com.conveyal.analysis.components.broker.Job;
 import com.conveyal.analysis.components.broker.JobStatus;
+import com.conveyal.analysis.components.broker.WorkerTags;
 import com.conveyal.analysis.models.AnalysisRequest;
 import com.conveyal.analysis.models.OpportunityDataset;
 import com.conveyal.analysis.models.RegionalAnalysis;
 import com.conveyal.analysis.persistence.Persistence;
+import com.conveyal.analysis.results.AccessCsvResultWriter;
 import com.conveyal.analysis.results.CsvResultType;
+import com.conveyal.analysis.results.GridResultWriter;
+import com.conveyal.analysis.results.MultiOriginAssembler;
+import com.conveyal.analysis.results.PathCsvResultWriter;
+import com.conveyal.analysis.results.RegionalResultWriter;
+import com.conveyal.analysis.results.TimeCsvResultWriter;
 import com.conveyal.analysis.util.JsonUtil;
 import com.conveyal.file.FileStorage;
 import com.conveyal.file.FileStorageFormat;
@@ -20,6 +28,7 @@
 import com.conveyal.r5.analyst.PointSet;
 import com.conveyal.r5.analyst.PointSetCache;
 import com.conveyal.r5.analyst.cluster.RegionalTask;
+import com.conveyal.r5.analyst.scenario.Scenario;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.google.common.primitives.Ints;
 import com.mongodb.QueryBuilder;
@@ -44,6 +53,7 @@
 import static com.conveyal.analysis.util.JsonUtil.toJson;
 import static com.conveyal.file.FileCategory.BUNDLES;
 import static com.conveyal.file.FileCategory.RESULTS;
+import static com.conveyal.r5.common.Util.notNullOrEmpty;
 import static com.conveyal.r5.transit.TransportNetworkCache.getScenarioFilename;
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Preconditions.checkNotNull;
@@ -506,17 +516,64 @@ private RegionalAnalysis createRegionalAnalysis (Request req, Response res) thro
 // This assigns it creation/update time stamps and an ID, which is needed to name any output CSV files.
 regionalAnalysis = Persistence.regionalAnalyses.create(regionalAnalysis);
 
+// Create the regional job
+var regionalJob = new Job(task, WorkerTags.fromRegionalAnalysis(regionalAnalysis));
+
+// Create the result writers. Store their result file paths in the database.
+var resultWriters = new ArrayList<RegionalResultWriter>();
+if (!task.makeTauiSite) {
+if (task.recordAccessibility) {
+if (task.originPointSet != null) {
+var accessWriter = new AccessCsvResultWriter(task, fileStorage);
+resultWriters.add(accessWriter);
+regionalAnalysis.resultStorage.put(accessWriter.resultType(), accessWriter.getFileName());
+} else {
+resultWriters.addAll(GridResultWriter.createWritersFromTask(regionalAnalysis, task, fileStorage));
+}
+}
+
+if (task.recordTimes) {
+var timesWriter = new TimeCsvResultWriter(task, fileStorage);
+resultWriters.add(timesWriter);
+regionalAnalysis.resultStorage.put(timesWriter.resultType(), timesWriter.getFileName());
+}
+
+if (task.includePathResults) {
+var pathsWriter = new PathCsvResultWriter(task, fileStorage);
+resultWriters.add(pathsWriter);
+regionalAnalysis.resultStorage.put(pathsWriter.resultType(), pathsWriter.getFileName());
+}
+checkArgument(notNullOrEmpty(resultWriters), "A regional analysis should always create at least one grid or CSV file.");
+}
+
+// Create the multi-origin assembler with the writers.
+var assembler = new MultiOriginAssembler(regionalJob, resultWriters);
+
+// Stored scenario is needed by workers. Must be done ahead of enqueueing the job.
+storeScenarioJson(task.graphId, task.scenario);
+
 // Register the regional job with the broker, which will distribute individual tasks to workers and track progress.
-broker.enqueueTasksForRegionalJob(regionalAnalysis);
+broker.enqueueTasksForRegionalJob(regionalJob, assembler);
 
 // Flush to the database any information added to the RegionalAnalysis object when it was enqueued.
-// This includes the paths of any CSV files that will be produced by this analysis.
-// TODO verify whether there is a reason to use regionalAnalyses.modifyWithoutUpdatingLock() or put().
+// This includes the paths of any CSV files that will be produced by this analysis. The regional analysis was
+// created in this method and therefore we can bypass the nonce / permission checking.
 Persistence.regionalAnalyses.modifiyWithoutUpdatingLock(regionalAnalysis);
 
 return regionalAnalysis;
 }
 
+/**
+ * Store the regional analysis scenario as JSON for retrieval by the workers.
+ */
+private void storeScenarioJson(String graphId, Scenario scenario) throws IOException {
+String fileName = getScenarioFilename(graphId, scenario.id);
+FileStorageKey fileStorageKey = new FileStorageKey(BUNDLES, fileName);
+File localScenario = FileUtils.createScratchFile("json");
+JsonUtil.objectMapper.writeValue(localScenario, scenario);
+fileStorage.moveIntoStorage(fileStorageKey, localScenario);
+}
+
 private RegionalAnalysis updateRegionalAnalysis (Request request, Response response) throws IOException {
 RegionalAnalysis regionalAnalysis = JsonUtil.objectMapper.readValue(request.body(), RegionalAnalysis.class);
 return Persistence.regionalAnalyses.updateByUserIfPermitted(regionalAnalysis, UserPermissions.from(request));
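To summarize the writer-selection block in createRegionalAnalysis above: a freeform-origin analysis (originPointSet set) with all three recording flags enabled ends up with three CSV writers, while a gridded analysis gets grid writers in place of the access CSV. A hedged sketch of the freeform case, mirroring only calls shown in the diff (the FreeformWritersSketch wrapper is illustrative, not part of the commit):

import com.conveyal.analysis.components.broker.Job;
import com.conveyal.analysis.components.broker.WorkerTags;
import com.conveyal.analysis.results.AccessCsvResultWriter;
import com.conveyal.analysis.results.MultiOriginAssembler;
import com.conveyal.analysis.results.PathCsvResultWriter;
import com.conveyal.analysis.results.RegionalResultWriter;
import com.conveyal.analysis.results.TimeCsvResultWriter;
import com.conveyal.file.FileStorage;
import com.conveyal.r5.analyst.cluster.RegionalTask;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class FreeformWritersSketch {
    /** Writers for a freeform-origin task with recordAccessibility, recordTimes and includePathResults all set. */
    static MultiOriginAssembler assemble (RegionalTask task, WorkerTags tags, FileStorage fileStorage)
            throws IOException {
        List<RegionalResultWriter> writers = new ArrayList<>();
        writers.add(new AccessCsvResultWriter(task, fileStorage)); // freeform origins: access goes to CSV, not grids
        writers.add(new TimeCsvResultWriter(task, fileStorage));   // task.recordTimes
        writers.add(new PathCsvResultWriter(task, fileStorage));   // task.includePathResults
        return new MultiOriginAssembler(new Job(task, tags), writers);
    }
}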
