Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Stratified cross validation split for classification #54087

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,8 @@ private void refreshIndices(String jobId) {
);
refreshRequest.indicesOptions(IndicesOptions.lenientExpandOpen());

LOGGER.debug("[{}] Refreshing indices {}", jobId, Arrays.toString(refreshRequest.indices()));
LOGGER.debug(() -> new ParameterizedMessage("[{}] Refreshing indices {}",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@benwtrent I forgot to address your comment earlier, so I squeezed this change in here.

jobId, Arrays.toString(refreshRequest.indices())));

try (ThreadContext.StoredContext ignore = client.threadPool().getThreadContext().stashWithOrigin(ML_ORIGIN)) {
client.admin().indices().refresh(refreshRequest).actionGet();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,20 @@ public CrossValidationSplitterFactory(Client client, DataFrameAnalyticsConfig co

/**
 * Picks the cross validation splitter that matches the configured analysis.
 * A {@code Regression} analysis gets a random splitter and a {@code Classification}
 * analysis gets a stratified splitter. Any other analysis gets a splitter that only
 * runs the training-docs callback for each row.
 *
 * @return the splitter to apply to each row
 */
public CrossValidationSplitter create() {
if (config.getAnalysis() instanceof Regression) {
Regression regression = (Regression) config.getAnalysis();
return new RandomCrossValidationSplitter(
fieldNames, regression.getDependentVariable(), regression.getTrainingPercent(), regression.getRandomizeSeed());
return createRandomSplitter();
}
if (config.getAnalysis() instanceof Classification) {
return createStratifiedSplitter((Classification) config.getAnalysis());
}
// Fallback for every other analysis type: the row is left alone and only incrementTrainingDocs is run
return (row, incrementTrainingDocs, incrementTestDocs) -> incrementTrainingDocs.run();
}

/**
 * Builds a {@link RandomCrossValidationSplitter} from the regression analysis held in {@code config},
 * passing on its dependent variable, training percent and randomize seed together with {@code fieldNames}.
 * The analysis is cast to {@code Regression} without a check, so callers must only invoke this
 * when {@code config.getAnalysis()} is a {@code Regression}.
 */
private CrossValidationSplitter createRandomSplitter() {
Regression regression = (Regression) config.getAnalysis();
return new RandomCrossValidationSplitter(
fieldNames, regression.getDependentVariable(), regression.getTrainingPercent(), regression.getRandomizeSeed());
}

private CrossValidationSplitter createStratifiedSplitter(Classification classification) {
String aggName = "dependent_variable_terms";
SearchRequestBuilder searchRequestBuilder = client.prepareSearch(config.getDest().getIndex())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,11 @@ class RandomCrossValidationSplitter implements CrossValidationSplitter {
}

/**
 * Returns the position of {@code dependentVariable} within {@code fieldNames}.
 * If the name is not in the list, the exception built by {@code ExceptionsHelper.serverError}
 * is thrown with a message naming the missing variable and listing the available fields.
 *
 * @param fieldNames the field names to search
 * @param dependentVariable the name of the dependent variable to locate
 * @return the index of {@code dependentVariable} in {@code fieldNames}
 */
private static int findDependentVariableIndex(List<String> fieldNames, String dependentVariable) {
for (int i = 0; i < fieldNames.size(); i++) {
if (fieldNames.get(i).equals(dependentVariable)) {
return i;
}
int dependentVariableIndex = fieldNames.indexOf(dependentVariable);
if (dependentVariableIndex < 0) {
throw ExceptionsHelper.serverError("Could not find dependent variable [" + dependentVariable + "] in fields " + fieldNames);
}
throw ExceptionsHelper.serverError("Could not find dependent variable [" + dependentVariable + "] in fields " + fieldNames);
return dependentVariableIndex;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,16 @@ public StratifiedCrossValidationSplitter(List<String> fieldNames, String depende
this.dependentVariableIndex = findDependentVariableIndex(fieldNames, dependentVariable);
this.samplingRatio = trainingPercent / 100.0;
this.random = new Random(randomizeSeed);
this.classSamples = new HashMap<>(classCardinalities.size());
this.classSamples = new HashMap<>();
classCardinalities.entrySet().forEach(entry -> classSamples.put(entry.getKey(), new ClassSample(entry.getValue())));
}

/**
 * Returns the position of {@code dependentVariable} within {@code fieldNames}.
 * If the name is not in the list, the exception built by {@code ExceptionsHelper.serverError}
 * is thrown with a message naming the missing variable and listing the available fields.
 *
 * @param fieldNames the field names to search
 * @param dependentVariable the name of the dependent variable to locate
 * @return the index of {@code dependentVariable} in {@code fieldNames}
 */
private static int findDependentVariableIndex(List<String> fieldNames, String dependentVariable) {
for (int i = 0; i < fieldNames.size(); i++) {
if (fieldNames.get(i).equals(dependentVariable)) {
return i;
}
int dependentVariableIndex = fieldNames.indexOf(dependentVariable);
if (dependentVariableIndex < 0) {
throw ExceptionsHelper.serverError("Could not find dependent variable [" + dependentVariable + "] in fields " + fieldNames);
}
throw ExceptionsHelper.serverError("Could not find dependent variable [" + dependentVariable + "] in fields " + fieldNames);
return dependentVariableIndex;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public void setUpTests() {
classCardinalities.put("c", classC);
}

public void testConstructor_GivenMissingDependendVariable() {
public void testConstructor_GivenMissingDependentVariable() {
ElasticsearchException e = expectThrows(ElasticsearchException.class, () -> new StratifiedCrossValidationSplitter(
Collections.emptyList(), "foo", Collections.emptyMap(), 100.0, 0));
assertThat(e.getMessage(), equalTo("Could not find dependent variable [foo] in fields []"));
Expand Down Expand Up @@ -127,7 +127,7 @@ public void testProcess_GivenRowsWithDependentVariableValue_AndTrainingPercentIs
String[] processedRow = Arrays.copyOf(row, row.length);
splitter.process(processedRow, this::incrementTrainingDocsCount, this::incrementTestDocsCount);

// As all these rows have no dependent variable value, they're not for training and should be unaffected
// As training percent is 100 all rows should be unaffected
assertThat(Arrays.equals(processedRow, row), is(true));
}
assertThat(trainingDocsCount, equalTo(500L));
Expand Down