-
Notifications
You must be signed in to change notification settings - Fork 24.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ML] Data frame analytics data counts (#53998)
This commit instruments data frame analytics with stats for the data that are being analyzed. In particular, we count training docs, test docs, and skipped docs. In order to count docs with missing values as skipped docs for analyses that do not support missing values, this commit changes the extractor so that it ignores docs with missing values only when it collects the data summary, which is used to estimate memory usage.
- Loading branch information
1 parent
0a35f39
commit 39785eb
Showing
28 changed files
with
744 additions
and
123 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
119 changes: 119 additions & 0 deletions
119
...gh-level/src/main/java/org/elasticsearch/client/ml/dataframe/stats/common/DataCounts.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.client.ml.dataframe.stats.common; | ||
|
||
import org.elasticsearch.common.Nullable; | ||
import org.elasticsearch.common.ParseField; | ||
import org.elasticsearch.common.inject.internal.ToStringBuilder; | ||
import org.elasticsearch.common.xcontent.ConstructingObjectParser; | ||
import org.elasticsearch.common.xcontent.ToXContentObject; | ||
import org.elasticsearch.common.xcontent.XContentBuilder; | ||
|
||
import java.io.IOException; | ||
import java.util.Objects; | ||
|
||
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.optionalConstructorArg; | ||
|
||
public class DataCounts implements ToXContentObject { | ||
|
||
public static final String TYPE_VALUE = "analytics_data_counts"; | ||
|
||
public static final ParseField TRAINING_DOCS_COUNT = new ParseField("training_docs_count"); | ||
public static final ParseField TEST_DOCS_COUNT = new ParseField("test_docs_count"); | ||
public static final ParseField SKIPPED_DOCS_COUNT = new ParseField("skipped_docs_count"); | ||
|
||
public static final ConstructingObjectParser<DataCounts, Void> PARSER = new ConstructingObjectParser<>(TYPE_VALUE, true, | ||
a -> { | ||
Long trainingDocsCount = (Long) a[0]; | ||
Long testDocsCount = (Long) a[1]; | ||
Long skippedDocsCount = (Long) a[2]; | ||
return new DataCounts( | ||
getOrDefault(trainingDocsCount, 0L), | ||
getOrDefault(testDocsCount, 0L), | ||
getOrDefault(skippedDocsCount, 0L) | ||
); | ||
}); | ||
|
||
static { | ||
PARSER.declareLong(optionalConstructorArg(), TRAINING_DOCS_COUNT); | ||
PARSER.declareLong(optionalConstructorArg(), TEST_DOCS_COUNT); | ||
PARSER.declareLong(optionalConstructorArg(), SKIPPED_DOCS_COUNT); | ||
} | ||
|
||
private final long trainingDocsCount; | ||
private final long testDocsCount; | ||
private final long skippedDocsCount; | ||
|
||
public DataCounts(long trainingDocsCount, long testDocsCount, long skippedDocsCount) { | ||
this.trainingDocsCount = trainingDocsCount; | ||
this.testDocsCount = testDocsCount; | ||
this.skippedDocsCount = skippedDocsCount; | ||
} | ||
|
||
@Override | ||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { | ||
builder.startObject(); | ||
builder.field(TRAINING_DOCS_COUNT.getPreferredName(), trainingDocsCount); | ||
builder.field(TEST_DOCS_COUNT.getPreferredName(), testDocsCount); | ||
builder.field(SKIPPED_DOCS_COUNT.getPreferredName(), skippedDocsCount); | ||
builder.endObject(); | ||
return builder; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
DataCounts that = (DataCounts) o; | ||
return trainingDocsCount == that.trainingDocsCount | ||
&& testDocsCount == that.testDocsCount | ||
&& skippedDocsCount == that.skippedDocsCount; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return Objects.hash(trainingDocsCount, testDocsCount, skippedDocsCount); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return new ToStringBuilder(getClass()) | ||
.add(TRAINING_DOCS_COUNT.getPreferredName(), trainingDocsCount) | ||
.add(TEST_DOCS_COUNT.getPreferredName(), testDocsCount) | ||
.add(SKIPPED_DOCS_COUNT.getPreferredName(), skippedDocsCount) | ||
.toString(); | ||
} | ||
|
||
public long getTrainingDocsCount() { | ||
return trainingDocsCount; | ||
} | ||
|
||
public long getTestDocsCount() { | ||
return testDocsCount; | ||
} | ||
|
||
public long getSkippedDocsCount() { | ||
return skippedDocsCount; | ||
} | ||
|
||
private static <T> T getOrDefault(@Nullable T value, T defaultValue) { | ||
return value != null ? value : defaultValue; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
...vel/src/test/java/org/elasticsearch/client/ml/dataframe/stats/common/DataCountsTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.client.ml.dataframe.stats.common; | ||
|
||
import org.elasticsearch.common.xcontent.XContentParser; | ||
import org.elasticsearch.test.AbstractXContentTestCase; | ||
|
||
import java.io.IOException; | ||
|
||
public class DataCountsTests extends AbstractXContentTestCase<DataCounts> { | ||
|
||
@Override | ||
protected DataCounts createTestInstance() { | ||
return createRandom(); | ||
} | ||
|
||
public static DataCounts createRandom() { | ||
return new DataCounts( | ||
randomNonNegativeLong(), | ||
randomNonNegativeLong(), | ||
randomNonNegativeLong() | ||
); | ||
} | ||
|
||
@Override | ||
protected DataCounts doParseInstance(XContentParser parser) throws IOException { | ||
return DataCounts.PARSER.apply(parser, null); | ||
} | ||
|
||
@Override | ||
protected boolean supportsUnknownFields() { | ||
return true; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.