Skip to content

Commit

Permalink
Revert "Revert "Make SnapshotSubset() faster (pantsbuild#9779)" (pant…
Browse files Browse the repository at this point in the history
…sbuild#10143)"

This reverts commit fa093eb.

[ci skip-jvm-tests]
  • Loading branch information
cosmicexplorer committed Jun 24, 2020
1 parent fa093eb commit 81a07a3
Show file tree
Hide file tree
Showing 18 changed files with 1,385 additions and 536 deletions.
6 changes: 3 additions & 3 deletions src/python/pants/backend/python/rules/run_setup_py_test.py
Expand Up @@ -166,7 +166,7 @@ def test_generate_chroot(self) -> None:
"name": "foo",
"version": "1.2.3",
"package_dir": {"": "src"},
"packages": ["foo", "foo.qux"],
"packages": ["foo", "foo.qux", "foo.resources.js"],
"namespace_packages": ["foo"],
"package_data": {"foo": ["resources/js/code.js"]},
"install_requires": ["baz==1.1.1"],
Expand Down Expand Up @@ -293,7 +293,7 @@ def test_get_sources(self) -> None:
"foo/__init__.py",
"foo/resources/js/code.js",
],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux"],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux", "foo.resources.js"],
expected_namespace_packages=["foo.bar"],
expected_package_data={"foo": ("resources/js/code.js",)},
addrs=["src/python/foo/bar/baz:baz1", "src/python/foo/qux", "src/python/foo/resources"],
Expand All @@ -309,7 +309,7 @@ def test_get_sources(self) -> None:
"foo/__init__.py",
"foo/resources/js/code.js",
],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux"],
expected_packages=["foo", "foo.bar", "foo.bar.baz", "foo.qux", "foo.resources.js"],
expected_namespace_packages=["foo.bar"],
expected_package_data={"foo": ("resources/js/code.js",)},
addrs=[
Expand Down
2 changes: 2 additions & 0 deletions src/rust/engine/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 24 additions & 4 deletions src/rust/engine/fs/src/glob_matching.rs
Expand Up @@ -4,6 +4,7 @@
use std::collections::HashSet;
use std::ffi::OsStr;
use std::fmt::Display;
use std::iter::Iterator;
use std::path::{Component, Path, PathBuf};
use std::sync::Arc;

Expand All @@ -21,9 +22,9 @@ use crate::{

lazy_static! {
static ref PARENT_DIR: &'static str = "..";
static ref SINGLE_STAR_GLOB: Pattern = Pattern::new("*").unwrap();
pub static ref SINGLE_STAR_GLOB: Pattern = Pattern::new("*").unwrap();
static ref DOUBLE_STAR: &'static str = "**";
static ref DOUBLE_STAR_GLOB: Pattern = Pattern::new(*DOUBLE_STAR).unwrap();
pub static ref DOUBLE_STAR_GLOB: Pattern = Pattern::new(*DOUBLE_STAR).unwrap();
static ref MISSING_GLOB_SOURCE: GlobParsedSource = GlobParsedSource(String::from(""));
static ref PATTERN_MATCH_OPTIONS: MatchOptions = MatchOptions {
require_literal_separator: true,
Expand All @@ -32,7 +33,7 @@ lazy_static! {
}

#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub(crate) enum PathGlob {
pub enum PathGlob {
Wildcard {
canonical_dir: Dir,
symbolic_path: PathBuf,
Expand Down Expand Up @@ -238,7 +239,19 @@ impl PathGlob {
}
}

#[derive(Debug)]
///
/// This struct extracts out just the include and exclude globs from the `PreparedPathGlobs`
/// struct. It is a temporary measure to try to share some code between the glob matching
/// implementation in this file and in snapshot_ops.rs.
///
/// TODO(#9967): Remove this struct!
///
pub struct ExpandablePathGlobs {
  /// The flattened set of include globs (one entry per expanded `PathGlob`).
  pub include: Vec<PathGlob>,
  /// Gitignore-syntax exclude patterns applied against every candidate path.
  pub exclude: Arc<GitignoreStyleExcludes>,
}

#[derive(Debug, Clone)]
pub struct PreparedPathGlobs {
pub(crate) include: Vec<PathGlobIncludeEntry>,
pub(crate) exclude: Arc<GitignoreStyleExcludes>,
Expand All @@ -248,6 +261,13 @@ pub struct PreparedPathGlobs {
}

impl PreparedPathGlobs {
pub fn as_expandable_globs(&self) -> ExpandablePathGlobs {
ExpandablePathGlobs {
include: Iterator::flatten(self.include.iter().map(|pgie| pgie.globs.clone())).collect(),
exclude: self.exclude.clone(),
}
}

fn parse_patterns_from_include(
include: &[PathGlobIncludeEntry],
) -> Result<Vec<glob::Pattern>, String> {
Expand Down
11 changes: 7 additions & 4 deletions src/rust/engine/fs/src/lib.rs
Expand Up @@ -33,7 +33,10 @@ mod glob_matching_tests;
#[cfg(test)]
mod posixfs_tests;

pub use crate::glob_matching::{GlobMatching, PreparedPathGlobs};
pub use crate::glob_matching::{
ExpandablePathGlobs, GlobMatching, PathGlob, PreparedPathGlobs, DOUBLE_STAR_GLOB,
SINGLE_STAR_GLOB,
};

use std::cmp::min;
use std::io::{self, Read};
Expand Down Expand Up @@ -198,7 +201,7 @@ impl GitignoreStyleExcludes {
self.is_ignored_path(stat.path(), is_dir)
}

fn is_ignored_path(&self, path: &Path, is_dir: bool) -> bool {
pub fn is_ignored_path(&self, path: &Path, is_dir: bool) -> bool {
match self.gitignore.matched(path, is_dir) {
::ignore::Match::None | ::ignore::Match::Whitelist(_) => false,
::ignore::Match::Ignore(_) => true,
Expand All @@ -213,7 +216,7 @@ impl GitignoreStyleExcludes {
}
}

#[derive(Debug)]
#[derive(Debug, Clone)]
pub enum StrictGlobMatching {
// NB: the Error and Warn variants store a description of the origin of the PathGlob
// request so that we can make the error message more helpful to users when globs fail to match.
Expand Down Expand Up @@ -257,7 +260,7 @@ impl StrictGlobMatching {
}
}

#[derive(Debug)]
#[derive(Debug, Clone)]
pub enum GlobExpansionConjunction {
AllMatch,
AnyMatch,
Expand Down
2 changes: 2 additions & 0 deletions src/rust/engine/fs/store/Cargo.toml
Expand Up @@ -5,6 +5,7 @@ authors = ["Daniel Wagner-Hall <dwagnerhall@twitter.com>"]
edition = "2018"

[dependencies]
async-trait = "0.1"
bazel_protos = { path = "../../process_execution/bazel_protos" }
boxfuture = { path = "../../boxfuture" }
bytes = "0.4.5"
Expand All @@ -14,6 +15,7 @@ dirs = "1"
fs = { path = ".." }
futures01 = { package = "futures", version = "0.1" }
futures = { version = "0.3", features = ["compat"] }
glob = "0.2.11"
# TODO: This is 0.5.1 + https://github.com/tikv/grpc-rs/pull/457 + a workaround for https://github.com/rust-lang/cargo/issues/8258
grpcio = { git = "https://github.com/pantsbuild/grpc-rs.git", rev = "ed3afa3c24ddf1fdd86826e836f57c00757dfc00", default_features = false, features = ["protobuf-codec", "secure"] }
hashing = { path = "../../hashing" }
Expand Down
135 changes: 125 additions & 10 deletions src/rust/engine/fs/store/benches/store.rs
Expand Up @@ -29,23 +29,27 @@

use criterion::{criterion_group, criterion_main, Criterion};

use std::collections::HashSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::os::unix::ffi::OsStrExt;
use std::path::PathBuf;
use std::time::Duration;

use bazel_protos::remote_execution as remexec;
use bytes::Bytes;
use fs::{GlobExpansionConjunction, PreparedPathGlobs, StrictGlobMatching};
use futures::compat::Future01CompatExt;
use futures::future;
use hashing::Digest;
use hashing::{Digest, EMPTY_DIGEST};
use protobuf;
use task_executor::Executor;
use tempfile::TempDir;
use tokio::runtime::Runtime;

use store::{Snapshot, Store};
use store::{SnapshotOps, Store, SubsetParams};

pub fn criterion_benchmark(c: &mut Criterion) {
pub fn criterion_benchmark_materialize(c: &mut Criterion) {
// Create an executor, store containing the stuff to materialize, and a digest for the stuff.
// To avoid benchmarking the deleting of things, we create a parent temporary directory (which
// will be deleted at the end of the benchmark) and then skip deletion of the per-run directories.
Expand All @@ -62,16 +66,125 @@ pub fn criterion_benchmark(c: &mut Criterion) {
.measurement_time(Duration::from_secs(60))
.bench_function("materialize_directory", |b| {
b.iter(|| {
// NB: We take ownership of this child tempdir to avoid deleting things during the run.
let dest = TempDir::new_in(parent_dest_path).unwrap().into_path();
// NB: We forget this child tempdir to avoid deleting things during the run.
let new_temp = TempDir::new_in(parent_dest_path).unwrap();
let dest = new_temp.path().to_path_buf();
std::mem::forget(new_temp);
let _ = executor
.block_on(store.materialize_directory(dest, digest).compat())
.unwrap();
})
});
}

criterion_group!(benches, criterion_benchmark);
/// Benchmark `Store::subset` using a recursive wildcard (`**/*`) over a large snapshot,
/// which forces the subset operation to visit every entry.
pub fn criterion_benchmark_subset_wildcard(c: &mut Criterion) {
  let runtime = Runtime::new().unwrap();
  let executor = Executor::new(runtime.handle().clone());
  // NB: We use a much larger snapshot size compared to the materialize benchmark!
  let (store, _tempdir, digest) = large_snapshot(&executor, 1000);

  let mut group = c.benchmark_group("snapshot_subset");
  group.sample_size(10);
  group.measurement_time(Duration::from_secs(80));
  group.bench_function("wildcard", |b| {
    b.iter(|| {
      // Glob parsing is intentionally inside the timed loop so each iteration
      // measures the full subset request, matching the original benchmark.
      let globs = PreparedPathGlobs::create(
        vec!["**/*".to_string()],
        StrictGlobMatching::Ignore,
        GlobExpansionConjunction::AllMatch,
      )
      .unwrap();
      let get_subset = store.subset(digest, SubsetParams { globs });
      let _ = executor.block_on(get_subset).unwrap();
    })
  });
}

/// Benchmark `Store::merge` on two large snapshots that differ in half of their
/// top-level files: one snapshot has those files' contents zeroed out, the other
/// has them removed entirely. Exercises the duplicate-resolution path of merge.
pub fn criterion_benchmark_merge(c: &mut Criterion) {
  let rt = Runtime::new().unwrap();
  let executor = Executor::new(rt.handle().clone());
  let num_files: usize = 4000;
  let (store, _tempdir, digest) = large_snapshot(&executor, num_files);

  let (directory, _metadata) = executor
    .block_on(store.load_directory(digest))
    .unwrap()
    .unwrap();
  // Modify half of the files in the top-level directory by setting them to have the empty
  // fingerprint (zero content).
  let mut all_file_nodes = directory.get_files().to_vec();
  // NB: split_off leaves the first half in `all_file_nodes` and moves the second
  // half into `file_nodes_to_modify`.
  let mut file_nodes_to_modify = all_file_nodes.split_off(all_file_nodes.len() / 2);
  for file_node in file_nodes_to_modify.iter_mut() {
    let mut empty_bazel_digest = remexec::Digest::new();
    empty_bazel_digest.set_hash(EMPTY_DIGEST.0.to_hex());
    empty_bazel_digest.set_size_bytes(0);
    file_node.set_digest(empty_bazel_digest);
  }
  // Record which file names were modified so the "removed" snapshot below can
  // filter exactly those entries out.
  let modified_file_names: HashSet<String> = file_nodes_to_modify
    .iter()
    .map(|file_node| file_node.get_name().to_string())
    .collect();

  // Snapshot A: all original files, with the modified half replaced by
  // empty-digest versions.
  let mut bazel_modified_files_directory = remexec::Directory::new();
  bazel_modified_files_directory.set_files(protobuf::RepeatedField::from_vec(
    all_file_nodes
      .iter()
      .cloned()
      .chain(file_nodes_to_modify.into_iter())
      .collect(),
  ));
  bazel_modified_files_directory.set_directories(directory.directories.clone());

  let modified_digest = executor
    .block_on(store.record_directory(&bazel_modified_files_directory, true))
    .unwrap();

  // Snapshot B: only the unmodified half of the files (the modified names are
  // dropped entirely). Subdirectories are shared with snapshot A.
  let mut bazel_removed_files_directory = remexec::Directory::new();
  bazel_removed_files_directory.set_files(protobuf::RepeatedField::from_vec(
    all_file_nodes
      .into_iter()
      .filter(|file_node| !modified_file_names.contains(file_node.get_name()))
      .collect(),
  ));
  bazel_removed_files_directory.set_directories(directory.directories.clone());
  let removed_digest = executor
    .block_on(store.record_directory(&bazel_removed_files_directory, true))
    .unwrap();

  let mut cgroup = c.benchmark_group("snapshot_merge");

  cgroup
    .sample_size(10)
    .measurement_time(Duration::from_secs(80))
    .bench_function("snapshot_merge", |b| {
      b.iter(|| {
        // Merge the old and the new snapshot together, allowing any file to be duplicated.
        let old_first: Digest = executor
          .block_on(store.merge(vec![removed_digest, modified_digest]))
          .unwrap();

        // Test the performance of either ordering of snapshots.
        let new_first: Digest = executor
          .block_on(store.merge(vec![modified_digest, removed_digest]))
          .unwrap();

        // Merge should be commutative: both orderings must yield the same digest.
        assert_eq!(old_first, new_first);
      })
    });
}

// Register all three benchmark entry points with criterion and generate `main`.
criterion_group!(
  benches,
  criterion_benchmark_materialize,
  criterion_benchmark_subset_wildcard,
  criterion_benchmark_merge
);
criterion_main!(benches);

///
Expand Down Expand Up @@ -107,8 +220,10 @@ pub fn large_snapshot(executor: &Executor, max_files: usize) -> (Store, TempDir,
})
.collect::<String>();

// Drop empty or too-long candidates.
// NB: Split the line by whitespace, then accumulate a PathBuf using each word as a path
// component!
let path_buf = clean_line.split_whitespace().collect::<PathBuf>();
// Drop empty or too-long candidates.
let components_too_long = path_buf.components().any(|c| c.as_os_str().len() > 255);
if components_too_long || path_buf.as_os_str().is_empty() || path_buf.as_os_str().len() > 512
{
Expand All @@ -122,9 +237,10 @@ pub fn large_snapshot(executor: &Executor, max_files: usize) -> (Store, TempDir,
let storedir = TempDir::new().unwrap();
let store = Store::local_only(executor.clone(), storedir.path()).unwrap();

let store2 = store.clone();
let digests = henries_paths
.map(|mut path| {
let store = store.clone();
let store = store2.clone();
async move {
// We use the path as the content as well: would be interesting to make this tunable.
let content = Bytes::from(path.as_os_str().as_bytes());
Expand All @@ -141,10 +257,9 @@ pub fn large_snapshot(executor: &Executor, max_files: usize) -> (Store, TempDir,

let digest = executor
.block_on({
let store = store.clone();
async move {
let digests = future::try_join_all(digests).await?;
Snapshot::merge_directories(store, digests).await
store2.merge(digests).await
}
})
.unwrap();
Expand Down

0 comments on commit 81a07a3

Please sign in to comment.