From 33ea6c1472f84b7bce9c65606a92610fd1e6e6d8 Mon Sep 17 00:00:00 2001
From: Jixiang Jiang
Date: Sat, 5 Mar 2022 09:42:12 -0500
Subject: [PATCH 1/9] first implementation of connector proxy with airbyte

stop and cont airbyte
discover spec / validate / apply capture requests
airbyte pull
airbyte discover
various improvements
check runtime protocol
ready signal from stderr
makefile more deps
try_stream! for error processing
update Cargo.lock
Makefile change
improve airbyte source logic
ldconfig
XXtesting
move test to common stage
NsMerge tweaks
update Cargo.lock
use libc instead of nix
skip null field in SpecResult
update control plane snapshots
delayed process stop itself
setup go in stage 2
tweaks
using /var/tmp
send SpecRequest
synthetic checkpoint
convert_ to adapt_
debug - replacing sigcont with stdin
simplify delay waiting logic
remove a TODO
---
 .github/workflows/main.yml                    |  27 +-
 Cargo.lock                                    | 270 ++++++++++-
 Makefile                                      |   3 +-
 crates/connector_proxy/Cargo.toml             |   4 +
 crates/connector_proxy/src/apis.rs            |  46 +-
 .../connector_proxy/src/connector_runner.rs   | 163 +++++--
 crates/connector_proxy/src/errors.rs          |  26 ++
 .../airbyte_capture_interceptor.rs            |  32 --
 .../airbyte_source_interceptor.rs             | 431 ++++++++++++++++++
 .../src/interceptors/default_interceptors.rs  |   6 -
 .../connector_proxy/src/interceptors/mod.rs   |   3 +-
 .../network_proxy_capture_interceptor.rs      |  43 +-
 .../network_proxy_materialize_interceptor.rs  |  39 +-
 .../src/libs/airbyte_catalog.rs               | 239 ++++++++++
 crates/connector_proxy/src/libs/command.rs    | 136 +++++-
 .../connector_proxy/src/libs/image_config.rs  |  59 ---
 .../connector_proxy/src/libs/image_inspect.rs |  88 ++++
 crates/connector_proxy/src/libs/json.rs       |  28 ++
 crates/connector_proxy/src/libs/mod.rs        |   3 +-
 crates/connector_proxy/src/libs/protobuf.rs   |   7 +-
 crates/connector_proxy/src/libs/stream.rs     |  74 ++-
 crates/connector_proxy/src/main.rs            | 150 +++---
 go/capture/driver/airbyte/driver.go           | 318 +++----------
 go/connector/run.go                           |  56 ++-
 24 files changed, 1659 insertions(+), 592 deletions(-)
 delete mode 100644 crates/connector_proxy/src/interceptors/airbyte_capture_interceptor.rs
 create mode 100644 crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs
 delete mode 100644 crates/connector_proxy/src/interceptors/default_interceptors.rs
 create mode 100644 crates/connector_proxy/src/libs/airbyte_catalog.rs
 delete mode 100644 crates/connector_proxy/src/libs/image_config.rs
 create mode 100644 crates/connector_proxy/src/libs/image_inspect.rs

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ac1d364981..ff196ae4b2 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -61,7 +61,6 @@ jobs:
       - run: make install-tools
       - run: go mod download
       - run: make rust-test
-      - run: make go-test-ci
       - run: make rust-binaries

       - name: Ensure that generated files are unchanged.
@@ -144,6 +143,19 @@
          fetch-depth: 0
          submodules: true

+      - name: Install protobuf compiler (not already included in the CI runner)
+        run: sudo apt install -y libprotobuf-dev protobuf-compiler
+
+      # We require a minimum Go version of 1.17.
+ - uses: actions/setup-go@v2 + with: + go-version: "1.17.3" + + - name: Install rust toolchain + run: rustup show + - run: make extra-ci-runner-setup + - run: make print-versions + - name: Set up Google Cloud SDK uses: google-github-actions/setup-gcloud@v0 with: @@ -190,6 +202,19 @@ jobs: -o /home/runner/work/flow/flow/.build/package/bin/gazette \ -o /home/runner/work/flow/flow/.build/package/bin/sops + - name: make go-test-ci + run: | + make go-test-ci \ + -o /home/runner/work/flow/flow/.build/package/bin/etcd \ + -o /home/runner/work/flow/flow/.build/package/bin/flowctl \ + -o /home/runner/work/flow/flow/.build/package/bin/flowctl-go \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-connector-proxy \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-network-proxy \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-parser \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-schemalate \ + -o /home/runner/work/flow/flow/.build/package/bin/gazette \ + -o /home/runner/work/flow/flow/.build/package/bin/sops + - name: make end-to-end-test run: | make end-to-end-test \ diff --git a/Cargo.lock b/Cargo.lock index b7e50da9eb..8ec7d684ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -236,9 +236,9 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] name = "base64ct" -version = "1.0.1" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a32fd6af2b5827bce66c29053ba0e7c42b9dcab01835835058558c10851a46b" +checksum = "71acf5509fc522cce1b100ac0121c635129bfd4d91cdf036bcc9b9935f97ccf5" [[package]] name = "bcrypt-pbkdf" @@ -536,7 +536,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom", + "nom 7.1.1", ] [[package]] @@ -671,6 +671,8 @@ dependencies = [ "flow_cli_common", "futures-core", "futures-util", + "json-pointer", + "libc", "network-proxy", "prost", "protocol", @@ -680,10 +682,12 @@ dependencies = [ "structopt", "strum 0.24.0", "strum_macros 0.24.0", + "tempfile", "thiserror", "tokio", "tokio-util", "tracing", + "validator", ] [[package]] @@ -786,9 +790,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.4" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +checksum = "fdbfe11fe19ff083c48923cf179540e8cd0535903dc35e178a1fdeeb59aef51f" dependencies = [ "cfg-if", "crossbeam-utils", @@ -819,6 +823,16 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -881,6 +895,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "ctor" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f877be4f7c9f246b183111634f75baa039715e3f46ce860677d3b19a69fb229c" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "ctr" version = "0.8.0" @@ -992,7 +1016,25 @@ dependencies = [ "lazy_static", "quickcheck", "quickcheck_macros", - "serde", + "serde 1.0.136", + "serde_json", + "serde_yaml", + "thiserror", + "tinyvec", + "tracing", + "url", +] + +[[package]] +name = "doc" +version = "0.0.0" +source = 
"git+https://github.com/estuary/flow#4f2ca48fda98b608dd2dc2d920dc40ecc60150f5" +dependencies = [ + "fancy-regex", + "itertools", + "json 0.0.0 (git+https://github.com/estuary/flow)", + "lazy_static", + "serde 1.0.136", "serde_json", "serde_yaml", "thiserror", @@ -1681,10 +1723,9 @@ dependencies = [ ] [[package]] -name = "json-patch" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f995a3c8f2bc3dd52a18a583e90f9ec109c047fa1603a853e46bcda14d2e279d" +name = "json" +version = "0.0.0" +source = "git+https://github.com/estuary/flow#4f2ca48fda98b608dd2dc2d920dc40ecc60150f5" dependencies = [ "serde", "serde_json", @@ -1695,6 +1736,15 @@ dependencies = [ name = "labels" version = "0.0.0" +[[package]] +name = "json-pointer" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fe841b94e719a482213cee19dd04927cf412f26d8dc84c5a446c081e49c2997" +dependencies = [ + "serde_json", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -1709,9 +1759,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.121" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" +checksum = "ad5c14e80759d0939d013e6ca49930e59fc53dd8e5009132f76240c179380c09" [[package]] name = "libflate" @@ -1981,6 +2031,17 @@ dependencies = [ "url", ] +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "lexical-core", + "memchr", + "version_check", +] + [[package]] name = "nom" version = "7.1.1" @@ -2656,9 +2717,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.17" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632d02bff7f874a36f33ea8bb416cd484b90cc66c1194b1a1110d067a7013f58" +checksum = "b4af2ec4714533fcdf07e886f17025ace8b997b9ce51204ee69b6da831c3da57" dependencies = [ "proc-macro2", ] @@ -2813,9 +2874,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.3" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +checksum = "7776223e2696f1aa4c6b0170e83212f47296a00424305117d013dfe86fb0fe55" dependencies = [ "getrandom 0.2.6", "redox_syscall", @@ -2944,6 +3005,40 @@ dependencies = [ "semver", ] +[[package]] +name = "rustls" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +dependencies = [ + "base64", + "log", + "ring", + "sct 0.6.1", + "webpki 0.21.4", +] + +[[package]] +name = "rustls" +version = "0.20.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fbfeb8d0ddb84706bc597a5574ab8912817c52a397f819e5b614e2265206921" +dependencies = [ + "log", + "ring", + "sct 0.7.0", + "webpki 0.22.0", +] + +[[package]] +name = "rustls-pemfile" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee86d63972a7c661d1536fefe8c3c8407321c3df668891286de28abcd087360" +dependencies = [ + "base64", +] + [[package]] name = "rustversion" version = "1.0.6" @@ -3280,6 +3375,139 @@ dependencies = [ "yaml-merge-keys", ] +[[package]] +name = "spin" +version = "0.5.2" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "sqlformat" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4b7922be017ee70900be125523f38bdd644f4f06a1b16e8fa5a8ee8c34bffd4" +dependencies = [ + "itertools", + "nom 7.1.1", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc15591eb44ffb5816a4a70a7efd5dd87bfd3aa84c4c200401c4396140525826" +dependencies = [ + "sqlx-core", + "sqlx-macros", +] + +[[package]] +name = "sqlx-core" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "195183bf6ff8328bb82c0511a83faf60aacf75840103388851db61d7a9854ae3" +dependencies = [ + "ahash", + "atoi", + "base64", + "bitflags", + "byteorder", + "bytes", + "chrono", + "crc 2.1.0", + "crossbeam-queue", + "dirs 4.0.0", + "either", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-util", + "hashlink", + "hex", + "hmac 0.11.0", + "indexmap", + "itoa 1.0.1", + "libc", + "log", + "md-5", + "memchr", + "once_cell", + "paste 1.0.6", + "percent-encoding", + "rand 0.8.5", + "rustls 0.19.1", + "serde 1.0.136", + "serde_json", + "sha-1 0.9.8", + "sha2 0.9.9", + "smallvec", + "sqlformat", + "sqlx-rt", + "stringprep", + "thiserror", + "tokio-stream", + "url", + "webpki 0.21.4", + "webpki-roots 0.21.1", + "whoami", +] + +[[package]] +name = "sqlx-macros" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eee35713129561f5e55c554bba1c378e2a7e67f81257b7311183de98c50e6f94" +dependencies = [ + "dotenv", + "either", + "heck 0.3.3", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde 1.0.136", + "serde_json", + "sha2 0.9.9", + "sqlx-core", + "sqlx-rt", + "syn", + "url", +] + +[[package]] +name = "sqlx-rt" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b555e70fbbf84e269ec3858b7a6515bcfe7a166a7cc9c636dd6efd20431678b6" +dependencies = [ + "once_cell", + "tokio", + "tokio-rustls 0.22.0", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "stats_alloc" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a260c96bf26273969f360c2fc2e2c7732acc2ce49d939c7243c7230c2ad179d0" + +[[package]] +name = "stringprep" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "strsim" version = "0.8.0" @@ -3367,9 +3595,9 @@ checksum = "ab16ced94dbd8a46c82fd81e3ed9a8727dac2977ea869d217bcc4ea1f122e81f" [[package]] name = "syn" -version = "1.0.90" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "704df27628939572cd88d33f171cd6f896f4eaca85252c6e0a72d8d8287ee86f" +checksum = "ea297be220d52398dcc07ce15a209fce436d361735ac1db700cab3b6cdfb9f54" dependencies = [ "proc-macro2", "quote", @@ -4192,6 +4420,12 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.79" diff --git a/Makefile b/Makefile index d9925b3f52..a2390714a2 100644 --- a/Makefile +++ b/Makefile @@ -198,7 +198,6 @@ ${PKGDIR}/bin/flow-network-proxy: ${RUST_MUSL_BIN}/flow-network-proxy | ${PKGDIR ${PKGDIR}/bin/flow-connector-proxy: ${RUST_MUSL_BIN}/flow-connector-proxy | ${PKGDIR} cp ${RUST_MUSL_BIN}/flow-connector-proxy $@ - ########################################################################## # Make targets used by CI: @@ -245,7 +244,7 @@ go-test-fast: $(GO_BUILD_DEPS) | ${PKGDIR}/bin/etcd ${PKGDIR}/bin/sops ./go.sh test -p ${NPROC} --tags "${GO_BUILD_TAGS}" ./go/... .PHONY: go-test-ci -go-test-ci: $(GO_BUILD_DEPS) | ${PKGDIR}/bin/etcd ${PKGDIR}/bin/sops +go-test-ci: $(GO_BUILD_DEPS) | ${PKGDIR}/bin/etcd ${PKGDIR}/bin/sops ${PKGDIR}/bin/flow-connector-proxy ${PKGDIR}/bin/flowctl ${PKGDIR}/bin/flowctl-go PATH=${PKGDIR}/bin:$$PATH ;\ GORACE="halt_on_error=1" ;\ ./go.sh test -p ${NPROC} --tags "${GO_BUILD_TAGS}" --race --count=15 --failfast ./go/... diff --git a/crates/connector_proxy/Cargo.toml b/crates/connector_proxy/Cargo.toml index 1765af109f..8f93abdf0f 100644 --- a/crates/connector_proxy/Cargo.toml +++ b/crates/connector_proxy/Cargo.toml @@ -20,6 +20,8 @@ byteorder="*" clap = { version = "^3", features = ["derive"] } futures-core = "*" futures-util="*" +json-pointer="*" +libc="*" prost = "*" schemars = "*" serde = { version = "*", features = ["derive"]} @@ -27,7 +29,9 @@ serde_json = { version = "*", features = ["raw_value"]} structopt = "*" strum = "*" strum_macros = "*" +tempfile="*" thiserror = "*" tokio = { version = "1.15.0", features = ["full"] } tokio-util = { version = "*", features = ["io"] } tracing="*" +validator = { version = "*", features = ["derive"] } \ No newline at end of file diff --git a/crates/connector_proxy/src/apis.rs b/crates/connector_proxy/src/apis.rs index d3bf09b06c..08cc36cb81 100644 --- a/crates/connector_proxy/src/apis.rs +++ b/crates/connector_proxy/src/apis.rs @@ -1,9 +1,20 @@ -use crate::errors::Error; use bytes::Bytes; use clap::ArgEnum; use futures_core::stream::Stream; use std::pin::Pin; +// The protocol used by FlowRuntime to speak with connector-proxy. +// There are two ways to infer the protocol. +// 1. From the proxy command passed in from FlowRuntime to the connector proxy. +// 2. From the connector image labels and tags. +// The proxy raises an error if both are inconsistent. +#[derive(Debug, strum_macros::Display, ArgEnum, PartialEq, Clone)] +#[strum(serialize_all = "snake_case")] +pub enum FlowRuntimeProtocol { + Capture, + Materialize, +} + // Flow Capture operations defined in // https://github.com/estuary/flow/blob/master/go/protocols/capture/capture.proto #[derive(Debug, strum_macros::Display, ArgEnum, Clone)] @@ -29,39 +40,6 @@ pub enum FlowMaterializeOperation { Transactions, } -// To be used as a trait bound for interceptors. -pub trait FlowOperation {} -impl FlowOperation for FlowCaptureOperation {} -impl FlowOperation for FlowMaterializeOperation {} - // An interceptor modifies the request/response streams between Flow runtime and the connector. // InterceptorStream defines the type of input and output streams handled by interceptors. 
 pub type InterceptorStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync>>;
-
-// The generic param "T" below is bounded by FlowOperation.
-// A converter is a function that contains the specific stream-handling logic of an interceptor.
-type ConverterFn<T> = Box<dyn Fn(&T, InterceptorStream) -> Result<InterceptorStream, Error>>;
-// An interceptor is characterized by a pair of converters, corresponding to the handling logic of request and response streams, respectively.
-pub type RequestResponseConverterPair<T> = (ConverterFn<T>, ConverterFn<T>);
-pub trait Interceptor<T: FlowOperation> {
-    fn get_converters() -> RequestResponseConverterPair<T> {
-        (
-            Box::new(|_op, stream| Ok(stream)),
-            Box::new(|_op, stream| Ok(stream)),
-        )
-    }
-}
-
-// Two converter pairs can be composed together to form a new converter pair.
-pub fn compose<T: FlowOperation>(
-    a: RequestResponseConverterPair<T>,
-    b: RequestResponseConverterPair<T>,
-) -> RequestResponseConverterPair<T> {
-    let (req_a, resp_a) = a;
-    let (req_b, resp_b) = b;
-    (
-        Box::new(move |op, stream| (req_b)(op, (req_a)(op, stream)?)),
-        // Response conversions are applied in the reverse order of the request conversions.
-        Box::new(move |op, stream| (resp_a)(op, (resp_b)(op, stream)?)),
-    )
-}
diff --git a/crates/connector_proxy/src/connector_runner.rs b/crates/connector_proxy/src/connector_runner.rs
index fbe1a9d61c..28dc1b4255 100644
--- a/crates/connector_proxy/src/connector_runner.rs
+++ b/crates/connector_proxy/src/connector_runner.rs
@@ -1,48 +1,139 @@
-use crate::apis::RequestResponseConverterPair;
+use crate::apis::{FlowCaptureOperation, FlowMaterializeOperation, InterceptorStream};
 use crate::errors::Error;
-use crate::libs::command::{check_exit_status, invoke_connector};
+use crate::interceptors::{
+    airbyte_source_interceptor::AirbyteSourceInterceptor,
+    network_proxy_capture_interceptor::NetworkProxyCaptureInterceptor,
+    network_proxy_materialize_interceptor::NetworkProxyMaterializeInterceptor,
+};
+use crate::libs::command::{
+    check_exit_status, invoke_connector_delayed, invoke_connector_direct, parse_child,
+};
 use tokio::io::copy;
+use tokio::process::{ChildStderr, ChildStdin, ChildStdout};
 use tokio_util::io::{ReaderStream, StreamReader};

-pub async fn run_connector<T: FlowOperation>(
-    operation: T,
+pub async fn run_flow_capture_connector(
+    op: &FlowCaptureOperation,
     entrypoint: Vec<String>,
-    converter_pair: RequestResponseConverterPair<T>,
 ) -> Result<(), Error> {
-    // prepare entrypoint and args.
+    let (entrypoint, mut args) = parse_entrypoint(&entrypoint)?;
+    args.push(op.to_string());
+
+    let (mut child, child_stdin, child_stdout, child_stderr) =
+        parse_child(invoke_connector_direct(entrypoint, args)?)?;
+
+    let adapted_request_stream =
+        NetworkProxyCaptureInterceptor::adapt_request_stream(op, request_stream())?;
+
+    let adapted_response_stream =
+        NetworkProxyCaptureInterceptor::adapt_response_stream(op, response_stream(child_stdout))?;
+
+    streaming_all(
+        child_stdin,
+        child_stderr,
+        adapted_request_stream,
+        adapted_response_stream,
+    )
+    .await?;
+
+    check_exit_status("flow capture connector:", child.wait().await)
+}
+
+pub async fn run_flow_materialize_connector(
+    op: &FlowMaterializeOperation,
+    entrypoint: Vec<String>,
+) -> Result<(), Error> {
+    let (entrypoint, mut args) = parse_entrypoint(&entrypoint)?;
+    args.push(op.to_string());
+
+    let (mut child, child_stdin, child_stdout, child_stderr) =
+        parse_child(invoke_connector_direct(entrypoint, args)?)?;
+
+    let adapted_request_stream =
+        NetworkProxyMaterializeInterceptor::adapt_request_stream(op, request_stream())?;
+
+    let adapted_response_stream = NetworkProxyMaterializeInterceptor::adapt_response_stream(
+        op,
+        response_stream(child_stdout),
+    )?;
+
+    streaming_all(
+        child_stdin,
+        child_stderr,
+        adapted_request_stream,
+        adapted_response_stream,
+    )
+    .await?;
+
+    check_exit_status("flow materialize connector:", child.wait().await)
+}
+
+pub async fn run_airbyte_source_connector(
+    op: &FlowCaptureOperation,
+    entrypoint: Vec<String>,
+) -> Result<(), Error> {
+    let mut airbyte_interceptor = AirbyteSourceInterceptor::new();
+
+    let (entrypoint, args) = parse_entrypoint(&entrypoint)?;
+    let args = airbyte_interceptor.adapt_command_args(op, args)?;
+
+    let (mut child, child_stdin, child_stdout, child_stderr) =
+        parse_child(invoke_connector_delayed(entrypoint, args).await?)?;
+
+    let adapted_request_stream = airbyte_interceptor.adapt_request_stream(
+        op,
+        NetworkProxyCaptureInterceptor::adapt_request_stream(op, request_stream())?,
+    )?;
+
+    let adapted_response_stream = NetworkProxyCaptureInterceptor::adapt_response_stream(
+        op,
+        airbyte_interceptor.adapt_response_stream(op, response_stream(child_stdout))?,
+    )?;
+
+    streaming_all(
+        child_stdin,
+        child_stderr,
+        adapted_request_stream,
+        adapted_response_stream,
+    )
+    .await?;
+
+    check_exit_status("airbyte source connector:", child.wait().await)
+}
+
+fn parse_entrypoint(entrypoint: &Vec<String>) -> Result<(String, Vec<String>), Error> {
     if entrypoint.len() == 0 {
         return Err(Error::EmptyEntrypointError);
     }
-    let mut args = Vec::new();
-    args.extend_from_slice(&entrypoint[1..]);
-    args.push(operation.to_string());
-
-    let entrypoint = entrypoint[0].clone();
-
-    // invoke the connector and convert the request/response streams.
-    let mut child = invoke_connector(entrypoint, &args)?;
-
-    let (request_converter, response_converter) = converter_pair;
-    // Perform conversions on requests and responses and start bi-directional copying.
-    let mut request_source = StreamReader::new((request_converter)(
-        &operation,
-        Box::pin(ReaderStream::new(tokio::io::stdin())),
-    )?);
-    let mut request_destination = child.stdin.take().ok_or(Error::MissingIOPipe)?;
-
-    let response_stream_out = child.stdout.take().ok_or(Error::MissingIOPipe)?;
-    let mut response_source = StreamReader::new((response_converter)(
-        &operation,
-        Box::pin(ReaderStream::new(response_stream_out)),
-    )?);
-    let mut response_destination = tokio::io::stdout();
-
-    let (a, b) = tokio::join!(
-        copy(&mut request_source, &mut request_destination),
-        copy(&mut response_source, &mut response_destination)
+
+    return Ok((entrypoint[0].clone(), entrypoint[1..].to_vec()));
+}
+
+fn request_stream() -> InterceptorStream {
+    Box::pin(ReaderStream::new(tokio::io::stdin()))
+}
+
+fn response_stream(child_stdout: ChildStdout) -> InterceptorStream {
+    Box::pin(ReaderStream::new(child_stdout))
+}
+
+async fn streaming_all(
+    mut request_stream_writer: ChildStdin,
+    mut error_reader: ChildStderr,
+    request_stream: InterceptorStream,
+    response_stream: InterceptorStream,
+) -> Result<(), Error> {
+    let mut request_stream_reader = StreamReader::new(request_stream);
+    let mut response_stream_reader = StreamReader::new(response_stream);
+    let mut response_stream_writer = tokio::io::stdout();
+    let mut error_writer = tokio::io::stderr();
+
+    let (a, b, c) = tokio::join!(
+        copy(&mut request_stream_reader, &mut request_stream_writer),
+        copy(&mut response_stream_reader, &mut response_stream_writer),
+        copy(&mut error_reader, &mut error_writer),
     );
-    a?;
-    b?;
-    check_exit_status(child.wait().await)
+    tracing::info!("Done streaming, transferred bytes: {} {} {}", a?, b?, c?);
+    Ok(())
 }
diff --git a/crates/connector_proxy/src/errors.rs b/crates/connector_proxy/src/errors.rs
index 47eda729a9..35e5d938a3 100644
--- a/crates/connector_proxy/src/errors.rs
+++ b/crates/connector_proxy/src/errors.rs
@@ -1,5 +1,8 @@
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
+    #[error("failed to start the bouncer process.")]
+    BouncerProcessStartError,
+
     #[error("channel timeout in receiving messages after 5 seconds.")]
     ChannelTimeoutError,

@@ -18,6 +21,12 @@ pub enum Error {
     #[error("missing process io pipes.")]
     MissingIOPipe,

+    #[error("mismatching runtime protocol")]
+    MismatchingRuntimeProtocol,
+
+    #[error("No ready signal is received. {0}")]
+    NotReady(&'static str),
+
     #[error("invalid endpoint json config.")]
     InvalidEndpointConfig,

@@ -36,11 +45,28 @@ pub enum Error {
     #[error(transparent)]
     MessageEncodeError(#[from] prost::EncodeError),

+    #[error("Missing required image inspect file. Specify it via --image-inspect-json-path in command line.")]
+    MissingImageInspectFile,
+
     #[error(transparent)]
     NetworkProxyError(#[from] network_proxy::errors::Error),

+    #[error(transparent)]
+    TempfilePersistError(#[from] tempfile::PersistError),
+
     #[error("Tokio task execution error.")]
     TokioTaskExecutionError(#[from] tokio::task::JoinError),
+
+    #[error("The operation of '{0}' is not expected for the given protocol.")]
+    UnexpectedOperation(String),
+}
+
+pub fn raise_custom_error(message: &str) -> Result<(), std::io::Error> {
+    Err(create_custom_error(message))
+}
+
+pub fn create_custom_error(message: &str) -> std::io::Error {
+    std::io::Error::new(std::io::ErrorKind::Other, message)
 }

 pub trait Must<T> {
diff --git a/crates/connector_proxy/src/interceptors/airbyte_capture_interceptor.rs b/crates/connector_proxy/src/interceptors/airbyte_capture_interceptor.rs
deleted file mode 100644
index 296d7953e2..0000000000
--- a/crates/connector_proxy/src/interceptors/airbyte_capture_interceptor.rs
+++ /dev/null
@@ -1,32 +0,0 @@
-use crate::apis::{
-    FlowCaptureOperation, Interceptor, InterceptorStream, RequestResponseConverterPair,
-};
-use crate::errors::Error;
-
-// A placeholder for real logic of airbyte connectors. Details might change during real implementations.
-pub struct AirbyteCaptureInterceptor {}
-
-impl AirbyteCaptureInterceptor {
-    fn convert_request(
-        _operation: &FlowCaptureOperation,
-        _in_stream: InterceptorStream,
-    ) -> Result<InterceptorStream, Error> {
-        panic!("TBD AirbyteCaptureInterceptor")
-    }
-
-    fn convert_response(
-        _operation: &FlowCaptureOperation,
-        _in_stream: InterceptorStream,
-    ) -> Result<InterceptorStream, Error> {
-        panic!("TBD AirbyteCaptureInterceptor")
-    }
-}
-
-impl Interceptor<FlowCaptureOperation> for AirbyteCaptureInterceptor {
-    fn get_converters() -> RequestResponseConverterPair<FlowCaptureOperation> {
-        (
-            Box::new(Self::convert_request),
-            Box::new(Self::convert_response),
-        )
-    }
-}
diff --git a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs
new file mode 100644
index 0000000000..ba61bd3509
--- /dev/null
+++ b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs
@@ -0,0 +1,431 @@
+use crate::apis::{FlowCaptureOperation, InterceptorStream};
+
+use crate::errors::{create_custom_error, raise_custom_error, Error};
+use crate::libs::airbyte_catalog::{
+    self, ConfiguredCatalog, ConfiguredStream, DestinationSyncMode, Range, ResourceSpec, Status,
+    SyncMode,
+};
+use crate::libs::command::READY;
+use crate::libs::json::{create_root_schema, tokenize_jsonpointer};
+use crate::libs::protobuf::{decode_message, encode_message};
+use crate::libs::stream::stream_all_airbyte_messages;
+
+use async_stream::try_stream;
+use bytes::Bytes;
+use protocol::capture::{
+    discover_response, validate_response, DiscoverRequest, DiscoverResponse, Documents,
+    PullRequest, PullResponse, SpecRequest, SpecResponse, ValidateRequest, ValidateResponse,
+};
+use protocol::flow::{DriverCheckpoint, Slice};
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+use validator::Validate;
+
+use futures_util::StreamExt;
+use json_pointer::JsonPointer;
+use serde_json::value::RawValue;
+use std::fs::File;
+use std::io::Write;
+use tempfile::{Builder, TempDir};
+use tokio_util::io::StreamReader;
+
+const CONFIG_FILE_NAME: &str = "config.json";
+const CATALOG_FILE_NAME: &str = "catalog.json";
+const STATE_FILE_NAME: &str = "state.json";
+
+pub struct AirbyteSourceInterceptor {
+    validate_request: Arc<Mutex<Option<ValidateRequest>>>,
+    stream_to_binding: Arc<Mutex<HashMap<String, usize>>>,
+    tmp_dir: TempDir,
+}
+
+impl AirbyteSourceInterceptor {
+    pub fn new() -> Self {
+        AirbyteSourceInterceptor {
+            validate_request: Arc::new(Mutex::new(None)),
+            stream_to_binding: Arc::new(Mutex::new(HashMap::new())),
+            tmp_dir: Builder::new()
+                .prefix("airbyte-source-")
+                .tempdir_in("/var/tmp")
+                .expect("failed to create temp dir."),
+        }
+    }
+
+    fn adapt_spec_request_stream(&mut self, in_stream: InterceptorStream) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut reader = StreamReader::new(in_stream);
+            decode_message::<SpecRequest>(&mut reader).await?.ok_or(create_custom_error("missing spec request."))?;
+
+            yield Bytes::from(READY);
+        })
+    }
+
+    fn adapt_spec_response_stream(&mut self, in_stream: InterceptorStream) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream));
+            loop {
+                let message = match airbyte_message_stream.next().await {
+                    None => break,
+                    Some(message) => message?
+                };
+                if let Some(spec) = message.spec {
+                    let mut resp = SpecResponse::default();
+                    resp.endpoint_spec_schema_json = spec.connection_specification.to_string();
+                    resp.resource_spec_schema_json = serde_json::to_string_pretty(&create_root_schema::<ResourceSpec>())?;
+                    if let Some(url) = spec.documentation_url {
+                        resp.documentation_url = url;
+                    }
+                    yield encode_message(&resp)?;
+                } else if let Some(mlog) = message.log {
+                    mlog.log();
+                } else {
+                    raise_custom_error("unexpected spec response.")?;
+                }
+            }
+        })
+    }
+
+    fn adapt_discover_request(
+        &mut self,
+        config_file_path: String,
+        in_stream: InterceptorStream,
+    ) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut reader = StreamReader::new(in_stream);
+            let request = decode_message::<DiscoverRequest>(&mut reader).await?.ok_or(create_custom_error("missing discover request."))?;
+
+            File::create(config_file_path)?.write_all(request.endpoint_spec_json.as_bytes())?;
+
+            yield Bytes::from(READY);
+        })
+    }
+
+    fn adapt_discover_response_stream(
+        &mut self,
+        in_stream: InterceptorStream,
+    ) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream));
+            loop {
+                let message = match airbyte_message_stream.next().await {
+                    None => break,
+                    Some(message) => message?
+                };
+
+                if let Some(catalog) = message.catalog {
+                    let mut resp = DiscoverResponse::default();
+                    for stream in catalog.streams {
+                        let mode = if stream.supported_sync_modes.contains(&SyncMode::Incremental) {SyncMode::Incremental} else {SyncMode::FullRefresh};
+                        let resource_spec = ResourceSpec {
+                            stream: stream.name.clone(),
+                            namespace: stream.namespace,
+                            sync_mode: mode
+                        };
+
+                        let key_ptrs = match stream.source_defined_primary_key {
+                            None => Vec::new(),
+                            Some(keys) => keys.iter().map(|k| JsonPointer::new(k).to_string()).collect()
+                        };
+                        resp.bindings.push(discover_response::Binding{
+                            recommended_name: stream.name.clone(),
+                            resource_spec_json: serde_json::to_string(&resource_spec)?,
+                            key_ptrs: key_ptrs,
+                            document_schema_json: stream.json_schema.to_string(),
+                        })
+                    }
+
+                    yield encode_message(&resp)?;
+                } else if let Some(mlog) = message.log {
+                    mlog.log();
+                } else {
+                    raise_custom_error("unexpected discover response.")?;
+                }
+            }
+        })
+    }
+
+    fn adapt_validate_request_stream(
+        &mut self,
+        config_file_path: String,
+        validate_request: Arc<Mutex<Option<ValidateRequest>>>,
+        in_stream: InterceptorStream,
+    ) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut reader = StreamReader::new(in_stream);
+            let request = decode_message::<ValidateRequest>(&mut reader).await?.ok_or(create_custom_error("missing validate request"))?;
+            *validate_request.lock().await = Some(request.clone());
+
+            File::create(config_file_path)?.write_all(request.endpoint_spec_json.as_bytes())?;
+
+            yield Bytes::from(READY);
+        })
+    }
+
+    fn adapt_validate_response_stream(
+        &mut self,
+        validate_request: Arc<Mutex<Option<ValidateRequest>>>,
+        in_stream: InterceptorStream,
+    ) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream));
+            loop {
+                let message = match airbyte_message_stream.next().await {
+                    None => break,
+                    Some(message) => message?
+                };
+
+                if let Some(connection_status) = message.connection_status {
+                    if connection_status.status != Status::Succeeded {
+                        raise_custom_error(&format!("validation failed {:?}", connection_status))?;
+                    }
+
+                    let req = validate_request.lock().await;
+                    let req = req.as_ref().ok_or(create_custom_error("missing validate request."))?;
+                    let mut resp = ValidateResponse::default();
+                    for binding in &req.bindings {
+                        let resource: ResourceSpec = serde_json::from_str(&binding.resource_spec_json)?;
+                        resp.bindings.push(validate_response::Binding {resource_path: vec![resource.stream]});
+                    }
+                    drop(req);
+                    yield encode_message(&resp)?;
+                } else if let Some(mlog) = message.log {
+                    mlog.log();
+                } else {
+                    raise_custom_error("unexpected validate response.")?;
+                }
+            }
+        })
+    }
+
+    fn adapt_pull_request_stream(
+        &mut self,
+        config_file_path: String,
+        catalog_file_path: String,
+        state_file_path: String,
+        stream_to_binding: Arc<Mutex<HashMap<String, usize>>>,
+        in_stream: InterceptorStream,
+    ) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut reader = StreamReader::new(in_stream);
+            let mut request = decode_message::<PullRequest>(&mut reader).await?.ok_or(create_custom_error("missing pull request"))?;
+            if let Some(ref mut o) = request.open {
+                File::create(state_file_path)?.write_all(&o.driver_checkpoint_json)?;
+
+                if let Some(ref mut c) = o.capture {
+                    File::create(config_file_path)?.write_all(&c.endpoint_spec_json.as_bytes())?;
+
+                    let mut catalog = ConfiguredCatalog {
+                        streams: Vec::new(),
+                        tail: o.tail,
+                        range: Range { begin: o.key_begin, end: o.key_end }
+                    };
+
+                    let mut stream_to_binding = stream_to_binding.lock().await;
+
+                    for (i, binding) in c.bindings.iter().enumerate() {
+                        let resource: ResourceSpec = serde_json::from_str(&binding.resource_spec_json)?;
+                        stream_to_binding.insert(resource.stream.clone(), i);
+
+                        let mut projections = HashMap::new();
+                        if let Some(ref collection) = binding.collection {
+                            for p in &collection.projections {
+                                projections.insert(p.field.clone(), p.ptr.clone());
+                            }
+
+                            let primary_key: Vec<Vec<String>> = collection.key_ptrs.iter().map(|ptr| tokenize_jsonpointer(ptr)).collect();
+                            catalog.streams.push(ConfiguredStream{
+                                sync_mode: resource.sync_mode.clone(),
+                                destination_sync_mode: DestinationSyncMode::Append,
+                                cursor_field: None,
+                                primary_key: Some(primary_key),
+                                stream: airbyte_catalog::Stream{
+                                    name: resource.stream,
+                                    namespace: resource.namespace,
+                                    json_schema: RawValue::from_string(collection.schema_json.clone())?,
+                                    supported_sync_modes: vec![resource.sync_mode.clone()],
+                                    default_cursor_field: None,
+                                    source_defined_cursor: None,
+                                    source_defined_primary_key: None,
+                                },
+                                projections: projections,
+                            });
+                        }
+                    }
+
+                    if let Err(e) = catalog.validate() {
+                        raise_custom_error(&format!("invalid config_catalog: {:?}", e))?
+                    }
+
+                    serde_json::to_writer(File::create(catalog_file_path)?, &catalog)?
+                }
+
+                // release the lock.
+                drop(stream_to_binding);
+
+                yield Bytes::from(READY);
+            }
+        })
+    }
+
+    fn adapt_pull_response_stream(
+        &mut self,
+        stream_to_binding: Arc<Mutex<HashMap<String, usize>>>,
+        in_stream: InterceptorStream,
+    ) -> InterceptorStream {
+        Box::pin(try_stream! {
+            let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream));
+            // transaction_pending is true if the connector writes output messages and exits _without_ writing
+            // a final state checkpoint.
+            let mut transaction_pending = false;
+
+            loop {
+                let message = match airbyte_message_stream.next().await {
+                    None => break,
+                    Some(message) => message?
+                };
+
+                let mut resp = PullResponse::default();
+                if let Some(state) = message.state {
+                    resp.checkpoint = Some(DriverCheckpoint{
+                        driver_checkpoint_json: state.data.get().as_bytes().to_vec(),
+                        rfc7396_merge_patch: match state.merge {
+                            Some(m) => m,
+                            None => false,
+                        },
+                    });
+
+                    yield encode_message(&resp)?;
+                    transaction_pending = false;
+                } else if let Some(record) = message.record {
+                    let stream_to_binding = stream_to_binding.lock().await;
+                    match stream_to_binding.get(&record.stream) {
+                        None => {
+                            raise_custom_error(&format!("connector record with unknown stream {}", record.stream))?;
+                        }
+                        Some(binding) => {
+                            let arena = record.data.get().as_bytes().to_vec();
+                            let arena_len: u32 = arena.len() as u32;
+                            resp.documents = Some(Documents {
+                                binding: *binding as u32,
+                                arena: arena,
+                                docs_json: vec![Slice{begin: 0, end: arena_len}]
+                            })
+                        }
+                    }
+                    drop(stream_to_binding);
+                    yield encode_message(&resp)?;
+                    transaction_pending = true;
+                } else if let Some(mlog) = message.log {
+                    mlog.log();
+                } else {
+                    raise_custom_error("unexpected pull response.")?;
+                }
+            }
+
+            if transaction_pending {
+                // We generate a synthetic commit now, and the empty checkpoint means the assumed behavior
+                // of the next invocation will be "full refresh".
+                let mut resp = PullResponse::default();
+                resp.checkpoint = Some(DriverCheckpoint{
+                    driver_checkpoint_json: Vec::new(),
+                    rfc7396_merge_patch: false
+                });
+                yield encode_message(&resp)?;
+            }
+        })
+    }
+
+    fn input_file_path(&mut self, file_name: &str) -> String {
+        self.tmp_dir
+            .path()
+            .join(file_name)
+            .to_str()
+            .expect("failed to construct config file name.")
+            .into()
+    }
+}
+
+impl AirbyteSourceInterceptor {
+    pub fn adapt_command_args(
+        &mut self,
+        op: &FlowCaptureOperation,
+        args: Vec<String>,
+    ) -> Result<Vec<String>, Error> {
+        let config_file_path = self.input_file_path(CONFIG_FILE_NAME);
+        let catalog_file_path = self.input_file_path(CATALOG_FILE_NAME);
+        let state_file_path = self.input_file_path(STATE_FILE_NAME);
+
+        let airbyte_args = match op {
+            FlowCaptureOperation::Spec => vec!["spec"],
+            FlowCaptureOperation::Discover => vec!["discover", "--config", &config_file_path],
+            FlowCaptureOperation::Validate => vec!["check", "--config", &config_file_path],
+            FlowCaptureOperation::Pull => {
+                vec![
+                    "read",
+                    "--config",
+                    &config_file_path,
+                    "--catalog",
+                    &catalog_file_path,
+                    "--state",
+                    &state_file_path,
+                ]
+            }
+
+            _ => return Err(Error::UnexpectedOperation(op.to_string())),
+        };
+
+        let airbyte_args: Vec<String> = airbyte_args.into_iter().map(Into::into).collect();
+        Ok([airbyte_args, args].concat())
+    }
+
+    pub fn adapt_request_stream(
+        &mut self,
+        op: &FlowCaptureOperation,
+        in_stream: InterceptorStream,
+    ) -> Result<InterceptorStream, Error> {
+        let config_file_path = self.input_file_path(CONFIG_FILE_NAME);
+        let catalog_file_path = self.input_file_path(CATALOG_FILE_NAME);
+        let state_file_path = self.input_file_path(STATE_FILE_NAME);
+
+        match op {
+            FlowCaptureOperation::Spec => Ok(self.adapt_spec_request_stream(in_stream)),
+            FlowCaptureOperation::Discover => {
+                Ok(self.adapt_discover_request(config_file_path, in_stream))
+            }
+            FlowCaptureOperation::Validate => Ok(self.adapt_validate_request_stream(
+                config_file_path,
+                Arc::clone(&self.validate_request),
+                in_stream,
+            )),
+            FlowCaptureOperation::Pull => Ok(self.adapt_pull_request_stream(
+                config_file_path,
+                catalog_file_path,
+                state_file_path,
+                Arc::clone(&self.stream_to_binding),
+                in_stream,
+            )),
+
+            _ => Err(Error::UnexpectedOperation(op.to_string())),
+        }
+    }
+
+    pub fn adapt_response_stream(
+        &mut self,
+        op: &FlowCaptureOperation,
+        in_stream: InterceptorStream,
+    ) -> Result<InterceptorStream, Error> {
+        match op {
+            FlowCaptureOperation::Spec => Ok(self.adapt_spec_response_stream(in_stream)),
+            FlowCaptureOperation::Discover => Ok(self.adapt_discover_response_stream(in_stream)),
+            FlowCaptureOperation::Validate => {
+                Ok(self
+                    .adapt_validate_response_stream(Arc::clone(&self.validate_request), in_stream))
+            }
+            FlowCaptureOperation::Pull => {
+                Ok(self.adapt_pull_response_stream(Arc::clone(&self.stream_to_binding), in_stream))
+            }
+            _ => Err(Error::UnexpectedOperation(op.to_string())),
+        }
+    }
+}
diff --git a/crates/connector_proxy/src/interceptors/default_interceptors.rs b/crates/connector_proxy/src/interceptors/default_interceptors.rs
deleted file mode 100644
index 979c4dc64c..0000000000
--- a/crates/connector_proxy/src/interceptors/default_interceptors.rs
+++ /dev/null
@@ -1,6 +0,0 @@
-use crate::apis::{FlowCaptureOperation, FlowMaterializeOperation, Interceptor};
-pub struct DefaultFlowCaptureInterceptor {}
-impl Interceptor<FlowCaptureOperation> for DefaultFlowCaptureInterceptor {}
-
-pub struct DefaultFlowMaterializeInterceptor {}
-impl Interceptor<FlowMaterializeOperation> for DefaultFlowMaterializeInterceptor {}
diff --git a/crates/connector_proxy/src/interceptors/mod.rs b/crates/connector_proxy/src/interceptors/mod.rs
b/crates/connector_proxy/src/interceptors/mod.rs index 6323473bad..37b44c07f0 100644 --- a/crates/connector_proxy/src/interceptors/mod.rs +++ b/crates/connector_proxy/src/interceptors/mod.rs @@ -1,4 +1,3 @@ -pub mod airbyte_capture_interceptor; -pub mod default_interceptors; +pub mod airbyte_source_interceptor; pub mod network_proxy_capture_interceptor; pub mod network_proxy_materialize_interceptor; diff --git a/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs b/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs index 8792435bd8..6b98cf6d07 100644 --- a/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs @@ -1,6 +1,4 @@ -use crate::apis::{ - FlowCaptureOperation, Interceptor, InterceptorStream, RequestResponseConverterPair, -}; +use crate::apis::{FlowCaptureOperation, InterceptorStream}; use crate::errors::{Error, Must}; use crate::libs::network_proxy::NetworkProxy; use crate::libs::protobuf::{decode_message, encode_message}; @@ -18,7 +16,7 @@ use tokio_util::io::StreamReader; pub struct NetworkProxyCaptureInterceptor {} impl NetworkProxyCaptureInterceptor { - fn convert_discover_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_discover_request_stream(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -29,7 +27,7 @@ impl NetworkProxyCaptureInterceptor { }) } - fn convert_validate_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_validate_request_stream(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -40,7 +38,7 @@ impl NetworkProxyCaptureInterceptor { }) } - fn convert_apply_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_apply_request(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -54,7 +52,7 @@ impl NetworkProxyCaptureInterceptor { }) } - fn convert_pull_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_pull_request_stream(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! 
{ let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -74,27 +72,29 @@ impl NetworkProxyCaptureInterceptor { } }) } +} - fn convert_request( - operation: &FlowCaptureOperation, +impl NetworkProxyCaptureInterceptor { + pub fn adapt_request_stream( + op: &FlowCaptureOperation, in_stream: InterceptorStream, ) -> Result { - Ok(match operation { - FlowCaptureOperation::Discover => Self::convert_discover_request(in_stream), - FlowCaptureOperation::Validate => Self::convert_validate_request(in_stream), + Ok(match op { + FlowCaptureOperation::Discover => Self::adapt_discover_request_stream(in_stream), + FlowCaptureOperation::Validate => Self::adapt_validate_request_stream(in_stream), FlowCaptureOperation::ApplyUpsert | FlowCaptureOperation::ApplyDelete => { - Self::convert_apply_request(in_stream) + Self::adapt_apply_request(in_stream) } - FlowCaptureOperation::Pull => Self::convert_pull_request(in_stream), + FlowCaptureOperation::Pull => Self::adapt_pull_request_stream(in_stream), _ => in_stream, }) } - fn convert_response( - operation: &FlowCaptureOperation, + pub fn adapt_response_stream( + op: &FlowCaptureOperation, in_stream: InterceptorStream, ) -> Result { - Ok(match operation { + Ok(match op { FlowCaptureOperation::Spec => Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut response = decode_message::(&mut reader).await.or_bail().expect("No expected response received."); @@ -107,12 +107,3 @@ impl NetworkProxyCaptureInterceptor { }) } } - -impl Interceptor for NetworkProxyCaptureInterceptor { - fn get_converters() -> RequestResponseConverterPair { - ( - Box::new(Self::convert_request), - Box::new(Self::convert_response), - ) - } -} diff --git a/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs b/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs index c916dc4409..d85c05ecbc 100644 --- a/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs @@ -1,6 +1,4 @@ -use crate::apis::{ - FlowMaterializeOperation, Interceptor, InterceptorStream, RequestResponseConverterPair, -}; +use crate::apis::{FlowMaterializeOperation, InterceptorStream}; use crate::errors::{Error, Must}; use crate::libs::network_proxy::NetworkProxy; use crate::libs::protobuf::{decode_message, encode_message}; @@ -17,7 +15,7 @@ use tokio_util::io::StreamReader; pub struct NetworkProxyMaterializeInterceptor {} impl NetworkProxyMaterializeInterceptor { - fn convert_spec_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_spec_request(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -30,7 +28,7 @@ impl NetworkProxyMaterializeInterceptor { }) } - fn convert_apply_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_apply_request(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! 
{ let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -44,7 +42,7 @@ impl NetworkProxyMaterializeInterceptor { }) } - fn convert_transactions_request(in_stream: InterceptorStream) -> InterceptorStream { + fn adapt_transactions_request(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); @@ -64,26 +62,28 @@ impl NetworkProxyMaterializeInterceptor { } }) } +} - fn convert_request( - operation: &FlowMaterializeOperation, +impl NetworkProxyMaterializeInterceptor { + pub fn adapt_request_stream( + op: &FlowMaterializeOperation, in_stream: InterceptorStream, ) -> Result { - Ok(match operation { - FlowMaterializeOperation::Validate => Self::convert_spec_request(in_stream), + Ok(match op { + FlowMaterializeOperation::Validate => Self::adapt_spec_request(in_stream), FlowMaterializeOperation::ApplyUpsert | FlowMaterializeOperation::ApplyDelete => { - Self::convert_apply_request(in_stream) + Self::adapt_apply_request(in_stream) } - FlowMaterializeOperation::Transactions => Self::convert_transactions_request(in_stream), + FlowMaterializeOperation::Transactions => Self::adapt_transactions_request(in_stream), _ => in_stream, }) } - fn convert_response( - operation: &FlowMaterializeOperation, + pub fn adapt_response_stream( + op: &FlowMaterializeOperation, in_stream: InterceptorStream, ) -> Result { - Ok(match operation { + Ok(match op { FlowMaterializeOperation::Spec => Box::pin(stream! { let mut reader = StreamReader::new(in_stream); let mut response = decode_message::(&mut reader).await.or_bail().expect("expected response is not received."); @@ -96,12 +96,3 @@ impl NetworkProxyMaterializeInterceptor { }) } } - -impl Interceptor for NetworkProxyMaterializeInterceptor { - fn get_converters() -> RequestResponseConverterPair { - ( - Box::new(Self::convert_request), - Box::new(Self::convert_response), - ) - } -} diff --git a/crates/connector_proxy/src/libs/airbyte_catalog.rs b/crates/connector_proxy/src/libs/airbyte_catalog.rs new file mode 100644 index 0000000000..dc54c110ca --- /dev/null +++ b/crates/connector_proxy/src/libs/airbyte_catalog.rs @@ -0,0 +1,239 @@ +use std::collections::HashMap; + +use schemars::JsonSchema; +use serde::ser::{SerializeStruct, Serializer}; +use serde::{Deserialize, Serialize}; +use serde_json::value::RawValue; +use validator::{Validate, ValidationError}; + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, JsonSchema)] +#[serde(rename_all = "snake_case")] +pub enum SyncMode { + Incremental, + FullRefresh, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Validate)] +#[serde(rename_all = "snake_case")] +pub struct Stream { + pub name: String, + pub json_schema: Box, + #[validate(length(min = 1))] + pub supported_sync_modes: Vec, + pub source_defined_cursor: Option, + pub default_cursor_field: Option>, + pub source_defined_primary_key: Option>>, + pub namespace: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum DestinationSyncMode { + Append, + Overwrite, + AppendDedup, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Validate)] +#[serde(rename_all = "snake_case")] +#[validate(schema(function = "Self::validate_configured_stream"))] +pub struct ConfiguredStream { + #[validate] + pub stream: Stream, + pub sync_mode: SyncMode, + 
pub destination_sync_mode: DestinationSyncMode, + pub cursor_field: Option>, + pub primary_key: Option>>, + + #[serde(alias = "estuary.dev/projections")] + pub projections: HashMap, +} +impl ConfiguredStream { + fn validate_configured_stream(&self) -> Result<(), ValidationError> { + if self.stream.supported_sync_modes.contains(&self.sync_mode) { + Ok(()) + } else { + Err(ValidationError::new( + "sync_mode is not in the supported list.", + )) + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, Validate)] +pub struct Catalog { + #[serde(rename = "streams")] + #[validate] + pub streams: Vec, +} + +#[derive(Debug, Deserialize, Clone, Validate)] +#[validate(schema(function = "Self::validate_range"))] +pub struct Range { + pub begin: u32, + pub end: u32, +} + +impl Range { + fn validate_range(&self) -> Result<(), ValidationError> { + if self.begin <= self.end { + Ok(()) + } else { + Err(ValidationError::new("expected Begin <= End")) + } + } +} + +impl Serialize for Range { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_struct("Range", 2)?; + state.serialize_field("begin", &format!("{:x}", self.begin))?; + state.serialize_field("end", &format!("{:x}", self.end))?; + state.end() + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, Validate)] +#[serde(rename_all = "snake_case")] +pub struct ConfiguredCatalog { + #[serde(rename = "streams")] + #[validate(length(min = 1))] + #[validate] + pub streams: Vec, + + #[serde(alias = "estuary.dev/tail")] + pub tail: bool, + + #[serde(alias = "estuary.dev/range")] + #[validate] + pub range: Range, +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum Status { + Succeeded, + Failed, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub struct ConnectionStatus { + pub status: Status, + pub message: String, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub struct Record { + pub stream: String, + pub data: Box, + pub emitted_at: Option, + pub namespace: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum LogLevel { + Trace, + Debug, + Info, + Warn, + Error, + Fatal, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "camelCase")] +pub struct Log { + pub level: LogLevel, + pub message: String, +} +impl Log { + pub fn log(&self) { + match self.level { + LogLevel::Trace => tracing::trace!(?self.message), + LogLevel::Debug => tracing::debug!(?self.message), + LogLevel::Info => tracing::info!(?self.message), + LogLevel::Warn => tracing::warn!(?self.message), + LogLevel::Error | LogLevel::Fatal => tracing::error!(?self.message), + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub struct State { + // Data is the actual state associated with the ingestion. This must be a JSON _Object_ in order + // to comply with the airbyte specification. + pub data: Box, + + // Merge indicates that Data is an RFC 7396 JSON Merge Patch, and should + // be be reduced into the previous state accordingly. 
+ #[serde(alias = "estuary.dev/merge")] + pub merge: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "camelCase")] +pub struct Spec { + pub documentation_url: Option, + pub changelog_url: Option, + pub connection_specification: Box, + pub supports_incremental: bool, + + // SupportedDestinationSyncModes is ignored by Flow + pub supported_destination_sync_modes: Option>, + // SupportsNormalization is not currently used or supported by Flow or estuary-developed + // connectors + pub supports_normalization: Option, + // SupportsDBT is not currently used or supported by Flow or estuary-developed connectors + #[serde(rename = "supportsDBT")] + pub supports_dbt: Option, + + // AuthSpecification is not currently used or supported by Flow or estuary-developed + // connectors, and it is deprecated in the airbyte spec. + pub auth_specification: Option>, + // AdvancedAuth is not currently used or supported by Flow or estuary-developed + // connectors. + pub advanced_auth: Option>, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "SCREAMING_SNAKE_CASE")] +pub enum MessageType { + Record, + State, + Log, + Spec, + ConnectionStatus, + Catalog, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Validate)] +#[serde(rename_all = "camelCase")] +pub struct Message { + #[serde(rename = "type")] + pub message_type: MessageType, + + pub log: Option, + pub state: Option, + pub record: Option, + pub connection_status: Option, + pub spec: Option, + #[validate] + pub catalog: Option, +} + +#[derive(Debug, Serialize, Deserialize, JsonSchema, Clone)] +#[serde(rename_all = "camelCase")] +// ResourceSpec is the configuration for Airbyte source streams. +pub struct ResourceSpec { + pub stream: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub namespace: Option, + pub sync_mode: SyncMode, +} diff --git a/crates/connector_proxy/src/libs/command.rs b/crates/connector_proxy/src/libs/command.rs index 60844e10df..7258f7b85d 100644 --- a/crates/connector_proxy/src/libs/command.rs +++ b/crates/connector_proxy/src/libs/command.rs @@ -1,21 +1,28 @@ use crate::errors::Error; +use serde::{Deserialize, Serialize}; use std::process::{ExitStatus, Stdio}; -use tokio::process::{Child, Command}; +use tempfile::NamedTempFile; +use tokio::io::{AsyncRead, AsyncReadExt}; +use tokio::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; +use tokio::time::timeout; -// Start the proxied connector as a process. -pub fn invoke_connector(entrypoint: String, args: &[String]) -> Result { - Command::new(entrypoint) - .stdin(Stdio::piped()) - .stdout(Stdio::piped()) - .stderr(Stdio::inherit()) - .args(args) - .spawn() - .map_err(|e| e.into()) +pub const READY: &[u8] = "READY".as_bytes(); + +// Start the connector directly. +pub fn invoke_connector_direct(entrypoint: String, args: Vec) -> Result { + invoke_connector( + Stdio::piped(), + Stdio::piped(), + Stdio::piped(), + &entrypoint, + &args, + ) } -// Replace this function after `exit_status_error` is stable. https://github.com/rust-lang/rust/issues/84908 -pub fn check_exit_status(result: std::io::Result) -> Result<(), Error> { +// Check the connector execution exit status. +// TODO: replace this function after `exit_status_error` is stable. 
+pub fn check_exit_status(message: &str, result: std::io::Result<ExitStatus>) -> Result<(), Error> {
     match result {
         Ok(status) => {
             if status.success() {
@@ -23,15 +30,110 @@ pub fn check_exit_status(result: std::io::Result<ExitStatus>) -> Result<(), Error> {
             } else {
                 match status.code() {
                     Some(code) => Err(Error::CommandExecutionError(format!(
-                        "failed with code {}.",
-                        code
+                        "{} failed with code {}.",
+                        message, code
+                    ))),
+                    None => Err(Error::CommandExecutionError(format!(
+                        "{} process terminated by signal",
+                        message
                     ))),
-                    None => Err(Error::CommandExecutionError(
-                        "process terminated by signal".to_string(),
-                    )),
                 }
             }
         }
         Err(e) => Err(e.into()),
     }
 }
+
+// For storing the entrypoint and args to start a delayed connector.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub struct CommandConfig {
+    pub entrypoint: String,
+    pub args: Vec<String>,
+}
+// Instead of starting the connector directly, `invoke_connector_delayed` starts a bouncer process first,
+// which starts the real connector only after reading a "READY" string from its stdin. The caller of
+// `invoke_connector_delayed` is therefore responsible for sending "READY" to the stdin of the returned
+// Child process before sending anything else.
+pub async fn invoke_connector_delayed(
+    entrypoint: String,
+    args: Vec<String>,
+) -> Result<Child, Error> {
+    tracing::info!("invoke delayed connector {}, {:?}", entrypoint, args);
+
+    // Saves the configs to start the connector.
+    let command_config = CommandConfig { entrypoint, args };
+    let config_file = NamedTempFile::new()?;
+    serde_json::to_writer(&config_file, &command_config)?;
+    let (_, config_file_path) = config_file.keep()?;
+    let config_file_path = config_file_path
+        .to_str()
+        .expect("config file path conversion failed.");
+
+    // Prepares and starts the bouncer process.
+    let bouncer_process_entrypoint = std::env::current_exe()?;
+    let bouncer_process_entrypoint = bouncer_process_entrypoint
+        .to_str()
+        .expect("unexpected binary path");
+
+    invoke_connector(
+        Stdio::piped(),
+        Stdio::piped(),
+        Stdio::piped(),
+        bouncer_process_entrypoint,
+        &vec!["delayed-execute".to_string(), config_file_path.to_string()],
+    )
+}
+
+pub async fn read_ready<R: AsyncRead + std::marker::Unpin>(reader: &mut R) -> Result<(), Error> {
+    let mut ready_buf: Vec<u8> = vec![0; READY.len()];
+    match timeout(
+        std::time::Duration::from_secs(1),
+        reader.read_exact(&mut ready_buf),
+    )
+    .await
+    {
+        Ok(_) => {
+            if &ready_buf == READY {
+                Ok(())
+            } else {
+                Err(Error::NotReady("received unexpected bytes."))
+            }
+        }
+        Err(_) => Err(Error::NotReady(
+            "timeout: reading from delayed-connector process wrapper.",
+        )),
+    }
+}
+
+// A more flexible API for starting the connector.
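+// For example (an illustrative sketch, not in the original source), a caller that wants
+// the connector's stderr to pass straight through might run:
+//   let child = invoke_connector(Stdio::piped(), Stdio::piped(), Stdio::inherit(),
+//       "/connector/entrypoint", &args)?;
+// where "/connector/entrypoint" is a hypothetical path.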
+pub fn invoke_connector(
+    stdin: Stdio,
+    stdout: Stdio,
+    stderr: Stdio,
+    entrypoint: &str,
+    args: &[String],
+) -> Result<Child, Error> {
+    tracing::info!("invoke connector {}, {:?}", entrypoint, args);
+
+    Command::new(entrypoint)
+        .stdin(stdin)
+        .stdout(stdout)
+        .stderr(stderr)
+        .args(args)
+        .spawn()
+        .map_err(|e| e.into())
+}
+
+pub fn parse_child(
+    mut child: Child,
+) -> Result<(Child, ChildStdin, ChildStdout, ChildStderr), Error> {
+    let stdout = child.stdout.take().ok_or(Error::MissingIOPipe)?;
+    let stdin = child.stdin.take().ok_or(Error::MissingIOPipe)?;
+    let stderr = child.stderr.take().ok_or(Error::MissingIOPipe)?;
+
+    Ok((child, stdin, stdout, stderr))
+}
diff --git a/crates/connector_proxy/src/libs/image_config.rs b/crates/connector_proxy/src/libs/image_config.rs
deleted file mode 100644
index 6d6ea26de9..0000000000
--- a/crates/connector_proxy/src/libs/image_config.rs
+++ /dev/null
@@ -1,59 +0,0 @@
-use crate::errors::{Error, Must};
-use clap::ArgEnum;
-use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::BufReader;
-
-// The key of the docker image label that indicates the connector protocol.
-const CONNECTOR_PROTOCOL_KEY: &str = "CONNECTOR_PROTOCOL";
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-#[serde(rename_all = "PascalCase")]
-pub struct ImageConfig {
-    pub entrypoint: Vec<String>,
-    pub labels: Option<HashMap<String, String>>,
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone)]
-#[serde(rename_all = "PascalCase")]
-pub struct ImageInspect {
-    pub config: ImageConfig,
-}
-
-impl ImageConfig {
-    pub fn parse_from_json_file(path: String) -> Result<Self, Error> {
-        let reader = BufReader::new(File::open(path)?);
-        let image_inspects: Vec<ImageInspect> = serde_json::from_reader(reader)?;
-        match image_inspects.len() {
-            1 => Ok(image_inspects[0].config.clone()),
-            _ => Err(Error::InvalidImageInspectFile),
-        }
-    }
-
-    pub fn get_entrypoint(&self, default: Vec<String>) -> Vec<String> {
-        match self.entrypoint.len() {
-            0 => {
-                tracing::warn!(
-                    "No entry point is specified in the image, using default: {:?}",
-                    default
-                );
-                default
-            }
-            _ => self.entrypoint.clone(),
-        }
-    }
-
-    pub fn get_connector_protocol<T: ArgEnum + std::fmt::Debug>(&self, default: T) -> T {
-        if let Some(ref labels) = self.labels {
-            if let Some(value) = labels.get(CONNECTOR_PROTOCOL_KEY) {
-                return T::from_str(&value, false).or_bail();
-            }
-        }
-        tracing::warn!(
-            "No connector protocol is specified in the image, using default: {:?}",
-            default
-        );
-        default
-    }
-}
diff --git a/crates/connector_proxy/src/libs/image_inspect.rs b/crates/connector_proxy/src/libs/image_inspect.rs
new file mode 100644
index 0000000000..aa2d9642ee
--- /dev/null
+++ b/crates/connector_proxy/src/libs/image_inspect.rs
@@ -0,0 +1,88 @@
+use crate::apis::FlowRuntimeProtocol;
+use crate::errors::{Error, Must};
+use clap::ArgEnum;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::BufReader;
+
+// The keys of the docker image labels that indicate the connector protocols.
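+// For example (illustrative, not in the original source), an image built with
+//   LABEL FLOW_RUNTIME_PROTOCOL=materialize
+// would be routed to the materialize protocol without relying on repo-tag prefixes.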
+const FLOW_RUNTIME_PROTOCOL_KEY: &str = "FLOW_RUNTIME_PROTOCOL";
+const CONNECTOR_PROTOCOL_KEY: &str = "CONNECTOR_PROTOCOL";
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "PascalCase")]
+pub struct ImageConfig {
+    pub entrypoint: Vec<String>,
+    pub labels: Option<HashMap<String, String>>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+#[serde(rename_all = "PascalCase")]
+pub struct ImageInspect {
+    pub config: ImageConfig,
+    pub repo_tags: Option<Vec<String>>,
+}
+
+impl ImageInspect {
+    pub fn parse_from_json_file(path: Option<String>) -> Result<Self, Error> {
+        match path {
+            None => {
+                return Err(Error::MissingImageInspectFile);
+            }
+            Some(p) => {
+                let reader = BufReader::new(File::open(p)?);
+                let image_inspects: Vec<ImageInspect> = serde_json::from_reader(reader)?;
+                match image_inspects.len() {
+                    1 => Ok(image_inspects[0].clone()),
+                    _ => Err(Error::InvalidImageInspectFile),
+                }
+            }
+        }
+    }
+
+    pub fn get_entrypoint(&self, default: Vec<String>) -> Vec<String> {
+        match self.config.entrypoint.len() {
+            0 => {
+                tracing::warn!(
+                    "No entry point is specified in the image, using default: {:?}",
+                    default
+                );
+                default
+            }
+            _ => self.config.entrypoint.clone(),
+        }
+    }
+
+    pub fn infer_runtime_protocol(&self) -> FlowRuntimeProtocol {
+        if let Some(ref labels) = self.config.labels {
+            if let Some(value) = labels.get(FLOW_RUNTIME_PROTOCOL_KEY) {
+                return FlowRuntimeProtocol::from_str(&value, false).or_bail();
+            }
+        }
+
+        if let Some(repo_tags) = &self.repo_tags {
+            for tag in repo_tags {
+                if tag.starts_with("ghcr.io/estuary/materialize-") {
+                    return FlowRuntimeProtocol::Materialize;
+                }
+            }
+        }
+
+        return FlowRuntimeProtocol::Capture;
+    }
+
+    pub fn get_connector_protocol<T: ArgEnum + std::fmt::Debug>(&self, default: T) -> T {
+        if let Some(ref labels) = self.config.labels {
+            if let Some(value) = labels.get(CONNECTOR_PROTOCOL_KEY) {
+                return T::from_str(&value, false).or_bail();
+            }
+        }
+        tracing::warn!(
+            "No connector protocol is specified in the image, using default: {:?}",
+            default
+        );
+        default
+    }
+}
diff --git a/crates/connector_proxy/src/libs/json.rs b/crates/connector_proxy/src/libs/json.rs
index e09064459b..dbe8f42f6d 100644
--- a/crates/connector_proxy/src/libs/json.rs
+++ b/crates/connector_proxy/src/libs/json.rs
@@ -1,3 +1,4 @@
+use doc::ptr::{Pointer, Token};
 use schemars::{schema::RootSchema, JsonSchema};
 use serde_json::Value;
 
@@ -19,3 +20,30 @@ pub fn remove_subobject(mut v: Value, key: &str) -> (Option<Value>, Value) {
 
     (sub_object, v)
 }
+
+pub fn tokenize_jsonpointer(ptr: &str) -> Vec<String> {
+    Pointer::from_str(&ptr)
+        .iter()
+        .map(|t| match t {
+            // Keep the index and next index for now. Could adjust based on use cases.
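+            // e.g. the pointer "/p1/p2/56/p3/-" tokenizes to
+            // ["p1", "p2", "56", "p3", "-"], as exercised by the test below.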
+            Token::Index(ind) => ind.to_string(),
+            Token::Property(prop) => prop.to_string(),
+            Token::NextIndex => "-".to_string(),
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_tokenize_jsonpointer() {
+        let expected: Vec<String> = vec!["p1", "p2", "56", "p3", "-"]
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+
+        assert!(expected == tokenize_jsonpointer("/p1/p2/56/p3/-"));
+    }
+}
diff --git a/crates/connector_proxy/src/libs/mod.rs b/crates/connector_proxy/src/libs/mod.rs
index 80519d2488..c7ff1dedb9 100644
--- a/crates/connector_proxy/src/libs/mod.rs
+++ b/crates/connector_proxy/src/libs/mod.rs
@@ -1,5 +1,6 @@
+pub mod airbyte_catalog;
 pub mod command;
-pub mod image_config;
+pub mod image_inspect;
 pub mod json;
 pub mod network_proxy;
 pub mod protobuf;
diff --git a/crates/connector_proxy/src/libs/protobuf.rs b/crates/connector_proxy/src/libs/protobuf.rs
index e93c0cfcfd..1f246d3aaf 100644
--- a/crates/connector_proxy/src/libs/protobuf.rs
+++ b/crates/connector_proxy/src/libs/protobuf.rs
@@ -1,5 +1,3 @@
-use crate::errors::Error;
-
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use prost::Message;
@@ -10,15 +8,14 @@ pub async fn decode_message<
     T: Message + std::default::Default,
     R: AsyncRead + std::marker::Unpin,
 >(
     reader: &mut R,
-) -> Result<Option<T>, Error> {
-    // Deserialize the proto message.
+) -> Result<Option<T>, std::io::Error> {
     let mut length_buf: [u8; 4] = [0; 4];
 
     match reader.read_exact(&mut length_buf).await {
         Err(e) => match e.kind() {
             // By the current communication protocol, UnexpectedEof indicates the ending of the stream.
             std::io::ErrorKind::UnexpectedEof => return Ok(None),
-            _ => return Err(e.into()),
+            _ => return Err(e),
         },
         Ok(_) => {}
     }
diff --git a/crates/connector_proxy/src/libs/stream.rs b/crates/connector_proxy/src/libs/stream.rs
index c1e31748d8..33d1e441e4 100644
--- a/crates/connector_proxy/src/libs/stream.rs
+++ b/crates/connector_proxy/src/libs/stream.rs
@@ -1,12 +1,19 @@
-use async_stream::stream;
-use bytes::{Bytes, BytesMut};
+use crate::apis::InterceptorStream;
+use crate::libs::airbyte_catalog::Message;
+
+use crate::errors::raise_custom_error;
+use async_stream::try_stream;
+use bytes::{Buf, Bytes, BytesMut};
 use futures_core::Stream;
+use futures_util::StreamExt;
+use serde_json::{Deserializer, Value};
 use tokio::io::{AsyncRead, AsyncReadExt};
+use validator::Validate;
 
 pub fn stream_all_bytes<R: AsyncRead + std::marker::Unpin>(
     mut reader: R,
 ) -> impl Stream<Item = Result<Bytes, std::io::Error>> {
-    stream! {
+    try_stream! {
         loop {
             // consistent with the default capacity of ReaderStream.
             // https://github.com/tokio-rs/tokio/blob/master/tokio-util/src/io/reader_stream.rs#L8
@@ -14,12 +21,69 @@ pub fn stream_all_bytes<R: AsyncRead + std::marker::Unpin>(
             match reader.read_buf(&mut buf).await {
                 Ok(0) => break,
                 Ok(_) => {
-                    yield Ok(buf.into());
+                    yield buf.into();
                 }
                 Err(e) => {
-                    panic!("error during streaming {:?}.", e);
+                    raise_custom_error(&format!("error during streaming {:?}.", e))?;
                 }
             }
         }
     }
 }
+
+pub fn stream_all_airbyte_messages(
+    mut in_stream: InterceptorStream,
+) -> impl Stream<Item = Result<Message, std::io::Error>> {
+    try_stream! {
+        let mut buf = BytesMut::new();
+
+        while let Some(bytes) = in_stream.next().await {
+            match bytes {
+                Ok(b) => {
+                    buf.extend_from_slice(b.chunk());
+                }
+                Err(e) => {
+                    raise_custom_error(&format!("error in reading next in_stream: {:?}", e))?;
+                }
+            }
+
+            let chunk = buf.chunk();
+            let deserializer = Deserializer::from_slice(&chunk);
+
+            // Deserialize to Value first, instead of Message, to avoid missing 'is_eof' signals in error.
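+            // (Illustrative note: if the buffer currently ends mid-document, e.g. with the
+            // partial bytes {"type":"RECO, deserialization fails with is_eof() == true and
+            // the loop below waits for more input rather than reporting corruption.)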
+ let mut value_stream = deserializer.into_iter::(); + while let Some(value) = value_stream.next() { + match value { + Ok(v) => { + let message: Message = serde_json::from_value(v).unwrap(); + if let Err(e) = message.validate() { + raise_custom_error(&format!( + "error in validating message: {:?}, {:?}", + e, std::str::from_utf8(&chunk[value_stream.byte_offset()..])))?; + } + tracing::debug!("read message:: {:?}", &message); + yield message; + } + Err(e) => { + if e.is_eof() { + break; + } + + raise_custom_error(&format!( + "error in decoding message: {:?}, {:?}", + e, std::str::from_utf8(&chunk[value_stream.byte_offset()..])))?; + } + } + } + + let byte_offset = value_stream.byte_offset(); + drop(buf.split_to(byte_offset)); + } + + if buf.len() > 0 { + raise_custom_error("unconsumed content in stream found.")?; + } + + tracing::info!("done reading all in_stream."); + } +} diff --git a/crates/connector_proxy/src/main.rs b/crates/connector_proxy/src/main.rs index 7defe1507c..d157168b2e 100644 --- a/crates/connector_proxy/src/main.rs +++ b/crates/connector_proxy/src/main.rs @@ -3,24 +3,28 @@ pub mod connector_runner; pub mod errors; pub mod interceptors; pub mod libs; +use std::fs::File; +use std::io::BufReader; use clap::{ArgEnum, Parser, Subcommand}; -use tokio::signal::unix::{signal, SignalKind}; +use tokio::{ + io::AsyncReadExt, + signal::unix::{signal, SignalKind}, +}; -use apis::{compose, FlowCaptureOperation, FlowMaterializeOperation, Interceptor}; +use apis::{FlowCaptureOperation, FlowMaterializeOperation, FlowRuntimeProtocol}; use flow_cli_common::{init_logging, LogArgs}; -use connector_runner::run_connector; +use connector_runner::{ + run_airbyte_source_connector, run_flow_capture_connector, run_flow_materialize_connector, +}; use errors::Error; -use libs::image_config::ImageConfig; - -use interceptors::{ - airbyte_capture_interceptor::AirbyteCaptureInterceptor, - default_interceptors::{DefaultFlowCaptureInterceptor, DefaultFlowMaterializeInterceptor}, - network_proxy_capture_interceptor::NetworkProxyCaptureInterceptor, - network_proxy_materialize_interceptor::NetworkProxyMaterializeInterceptor, +use libs::{ + command::{check_exit_status, invoke_connector, read_ready, CommandConfig}, + image_inspect::ImageInspect, }; +use std::process::Stdio; #[derive(Debug, ArgEnum, Clone)] pub enum CaptureConnectorProtocol { @@ -47,12 +51,19 @@ struct ProxyFlowMaterialize { operation: FlowMaterializeOperation, } +#[derive(Debug, clap::Parser)] +struct DelayedExecutionConfig { + config_file_path: String, +} + #[derive(Debug, Subcommand)] enum ProxyCommand { /// proxies the Flow runtime Capture Protocol to the connector. ProxyFlowCapture(ProxyFlowCapture), /// proxies the Flow runtime Materialize Protocol to the connector. ProxyFlowMaterialize(ProxyFlowMaterialize), + /// internal command used by the connector proxy itself to delay execution until signaled. + DelayedExecute(DelayedExecutionConfig), } #[derive(Parser, Debug)] @@ -61,7 +72,7 @@ pub struct Args { /// The path (in the container) to the JSON file that contains the inspection results from the connector image. /// Normally produced via command "docker inspect ". #[clap(short, long)] - image_inspect_json_path: String, + image_inspect_json_path: Option, /// The type of proxy service to provide. #[clap(subcommand)] @@ -91,18 +102,15 @@ async fn main() -> std::io::Result<()> { } = Args::parse(); init_logging(&log_args); - // respond to os signals. 
- tokio::task::spawn(async move { signal_handler().await }); - let result = async_main(image_inspect_json_path, proxy_command).await; if let Err(err) = result.as_ref() { - tracing::error!(error = ?err, "connector proxy execution failed."); + tracing::error!("connector proxy execution failed. {:?}", err); std::process::exit(1); } Ok(()) } -async fn signal_handler() { +async fn sigterm_handler() { let mut signal_stream = signal(SignalKind::terminate()).expect("failed creating signal."); signal_stream @@ -114,54 +122,94 @@ async fn signal_handler() { } async fn async_main( - image_inspect_json_path: String, + image_inspect_json_path: Option, proxy_command: ProxyCommand, ) -> Result<(), Error> { - let image_config = ImageConfig::parse_from_json_file(image_inspect_json_path)?; - - // TODO(jixiang): add a check to make sure the proxy_command passed in from commandline is consistent with the protocol inferred from image. match proxy_command { - ProxyCommand::ProxyFlowCapture(c) => proxy_flow_capture(c, image_config).await, - ProxyCommand::ProxyFlowMaterialize(m) => proxy_flow_materialize(m, image_config).await, + ProxyCommand::ProxyFlowCapture(c) => proxy_flow_capture(c, image_inspect_json_path).await, + ProxyCommand::ProxyFlowMaterialize(m) => { + proxy_flow_materialize(m, image_inspect_json_path).await + } + ProxyCommand::DelayedExecute(ba) => delayed_execute(ba.config_file_path).await, } } -async fn proxy_flow_capture(c: ProxyFlowCapture, image_config: ImageConfig) -> Result<(), Error> { - let mut converter_pair = match image_config +async fn proxy_flow_capture( + c: ProxyFlowCapture, + image_inspect_json_path: Option, +) -> Result<(), Error> { + let image_inspect = ImageInspect::parse_from_json_file(image_inspect_json_path)?; + if image_inspect.infer_runtime_protocol() != FlowRuntimeProtocol::Capture { + return Err(Error::MismatchingRuntimeProtocol); + } + + let entrypoint = image_inspect.get_entrypoint(vec![DEFAULT_CONNECTOR_ENTRYPOINT.to_string()]); + + match image_inspect .get_connector_protocol::(CaptureConnectorProtocol::Airbyte) { - CaptureConnectorProtocol::FlowCapture => DefaultFlowCaptureInterceptor::get_converters(), - CaptureConnectorProtocol::Airbyte => AirbyteCaptureInterceptor::get_converters(), - }; - - converter_pair = compose( - converter_pair, - NetworkProxyCaptureInterceptor::get_converters(), - ); - - run_connector::( - c.operation, - image_config.get_entrypoint(vec![DEFAULT_CONNECTOR_ENTRYPOINT.to_string()]), - converter_pair, - ) - .await + CaptureConnectorProtocol::FlowCapture => { + run_flow_capture_connector(&c.operation, entrypoint).await + } + CaptureConnectorProtocol::Airbyte => { + run_airbyte_source_connector(&c.operation, entrypoint).await + } + } } async fn proxy_flow_materialize( m: ProxyFlowMaterialize, - image_config: ImageConfig, + image_inspect_json_path: Option, ) -> Result<(), Error> { - // There is only one type of connector protocol for flow materialize. - let mut converter_pair = DefaultFlowMaterializeInterceptor::get_converters(); - converter_pair = compose( - converter_pair, - NetworkProxyMaterializeInterceptor::get_converters(), - ); - - run_connector::( - m.operation, - image_config.get_entrypoint(vec![DEFAULT_CONNECTOR_ENTRYPOINT.to_string()]), - converter_pair, + // Respond to OS sigterm signal. 
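+    // (Illustrative note, not in the original patch: the handler is spawned rather than
+    // awaited, so materialize proxying continues while sigterm_handler() waits for the
+    // signal in the background.)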
+ tokio::task::spawn(async move { sigterm_handler().await }); + + let image_inspect = ImageInspect::parse_from_json_file(image_inspect_json_path)?; + if image_inspect.infer_runtime_protocol() != FlowRuntimeProtocol::Materialize { + return Err(Error::MismatchingRuntimeProtocol); + } + + run_flow_materialize_connector( + &m.operation, + image_inspect.get_entrypoint(vec![DEFAULT_CONNECTOR_ENTRYPOINT.to_string()]), ) .await } + +async fn delayed_execute(command_config_path: String) -> Result<(), Error> { + // Wait for the "READY" signal from the parent process before starting the connector. + read_ready(&mut tokio::io::stdin()).await?; + + tracing::info!("delayed process execution continue..."); + + let reader = BufReader::new(File::open(command_config_path)?); + let command_config: CommandConfig = serde_json::from_reader(reader)?; + + let mut child = invoke_connector( + Stdio::inherit(), + Stdio::inherit(), + Stdio::piped(), + &command_config.entrypoint, + &command_config.args, + )?; + + match check_exit_status("delayed process", child.wait().await) { + Err(e) => { + let mut buf = Vec::new(); + child + .stderr + .take() + .ok_or(Error::MissingIOPipe)? + .read_to_end(&mut buf) + .await?; + + tracing::error!( + "connector failed. command_config: {:?}. stderr from connector: {}", + &command_config, + std::str::from_utf8(&buf).expect("error when decoding stderr") + ); + Err(e) + } + _ => Ok(()), + } +} diff --git a/go/capture/driver/airbyte/driver.go b/go/capture/driver/airbyte/driver.go index cd7b005480..dbac669b1d 100644 --- a/go/capture/driver/airbyte/driver.go +++ b/go/capture/driver/airbyte/driver.go @@ -2,18 +2,19 @@ package airbyte import ( "context" + "encoding/binary" "encoding/json" "fmt" "io" "strings" - "github.com/alecthomas/jsonschema" "github.com/estuary/flow/go/connector" "github.com/estuary/flow/go/flow/ops" "github.com/estuary/flow/go/protocols/airbyte" pc "github.com/estuary/flow/go/protocols/capture" pf "github.com/estuary/flow/go/protocols/flow" - "github.com/go-openapi/jsonpointer" + protoio "github.com/gogo/protobuf/io" + "github.com/gogo/protobuf/proto" "github.com/sirupsen/logrus" ) @@ -92,50 +93,39 @@ func (d driver) Spec(ctx context.Context, req *pc.SpecRequest) (*pc.SpecResponse "operation": "spec", }) - var spec *airbyte.Spec - var err = connector.Run(ctx, source.Image, connector.Capture, d.networkName, + var decrypted, err = connector.DecryptConfig(ctx, source.Config) + if err != nil { + return nil, err + } + defer connector.ZeroBytes(decrypted) // connector.Run will also ZeroBytes(). + req.EndpointSpecJson = decrypted + + var resp *pc.SpecResponse + err = connector.Run(ctx, source.Image, connector.Capture, d.networkName, []string{"spec"}, // No configuration is passed to the connector. nil, // No stdin is sent to the connector. - func(w io.Writer) error { return nil }, + func(w io.Writer) error { + defer connector.ZeroBytes(decrypted) + return protoio.NewUint32DelimitedWriter(w, binary.LittleEndian). + WriteMsg(req) + }, // Expect to decode Airbyte messages, and a ConnectorSpecification specifically. 
- connector.NewJSONOutput( - func() interface{} { return new(airbyte.Message) }, - func(i interface{}) error { - if rec := i.(*airbyte.Message); rec.Log != nil { - logger.Log(airbyteToLogrusLevel(rec.Log.Level), nil, rec.Log.Message) - } else if rec.Spec != nil { - spec = rec.Spec - } else { - return fmt.Errorf("unexpected connector message: %v", rec) + connector.NewProtoOutput( + func() proto.Message { return new(pc.SpecResponse) }, + func(m proto.Message) error { + if resp != nil { + return fmt.Errorf("read more than one SpecResponse") } + resp = m.(*pc.SpecResponse) return nil }, - onStdoutDecodeError(logger), ), logger, ) + return resp, err - // Expect connector spit out a successful ConnectorSpecification. - if err == nil && spec == nil { - err = fmt.Errorf("connector didn't produce a Specification") - } - if err != nil { - return nil, err - } - - var reflector = jsonschema.Reflector{ExpandedStruct: true} - resourceSchema, err := reflector.Reflect(new(ResourceSpec)).MarshalJSON() - if err != nil { - return nil, fmt.Errorf("generating resource schema: %w", err) - } - - return &pc.SpecResponse{ - EndpointSpecSchemaJson: spec.ConnectionSpecification, - ResourceSpecSchemaJson: json.RawMessage(resourceSchema), - DocumentationUrl: spec.DocumentationURL, - }, nil } // Discover delegates to the `discover` command of the identified Airbyte image. @@ -156,80 +146,39 @@ func (d driver) Discover(ctx context.Context, req *pc.DiscoverRequest) (*pc.Disc return nil, err } defer connector.ZeroBytes(decrypted) // connector.Run will also ZeroBytes(). + req.EndpointSpecJson = decrypted - var catalog *airbyte.Catalog + var resp *pc.DiscoverResponse err = connector.Run(ctx, source.Image, connector.Capture, d.networkName, []string{ "discover", - "--config", - "/tmp/config.json", }, - // Write configuration JSON to connector input. - map[string]json.RawMessage{"config.json": decrypted}, - // No stdin is sent to the connector. - func(w io.Writer) error { return nil }, - // Expect to decode Airbyte messages, and a ConnectionStatus specifically. - connector.NewJSONOutput( - func() interface{} { return new(airbyte.Message) }, - func(i interface{}) error { - if rec := i.(*airbyte.Message); rec.Log != nil { - logger.Log(airbyteToLogrusLevel(rec.Log.Level), nil, rec.Log.Message) - } else if rec.Catalog != nil { - catalog = rec.Catalog - } else { - return fmt.Errorf("unexpected connector message: %v", rec) + nil, + func(w io.Writer) error { + defer connector.ZeroBytes(decrypted) + return protoio.NewUint32DelimitedWriter(w, binary.LittleEndian). + WriteMsg(req) + }, + connector.NewProtoOutput( + func() proto.Message { return new(pc.DiscoverResponse) }, + func(m proto.Message) error { + if resp != nil { + return fmt.Errorf("read more than one DiscoverResponse") } + resp = m.(*pc.DiscoverResponse) return nil }, - onStdoutDecodeError(logger), ), logger, ) // Expect connector spit out a successful ConnectionStatus. - if err == nil && catalog == nil { + if err == nil && resp == nil { err = fmt.Errorf("connector didn't produce a Catalog") - } - if err != nil { + } else if err != nil { return nil, err } - var resp = new(pc.DiscoverResponse) - for _, stream := range catalog.Streams { - // Use incremental mode if available. 
- var mode = airbyte.SyncModeFullRefresh - for _, m := range stream.SupportedSyncModes { - if m == airbyte.SyncModeIncremental { - mode = m - } - } - - var resourceSpec, err = json.Marshal(ResourceSpec{ - Stream: stream.Name, - Namespace: stream.Namespace, - SyncMode: mode, - }) - if err != nil { - return nil, fmt.Errorf("encoding resource spec: %w", err) - } - - // Encode array of hierarchical properties as a JSON-pointer. - var keyPtrs []string - for _, tokens := range stream.SourceDefinedPrimaryKey { - for i := range tokens { - tokens[i] = jsonpointer.Escape(tokens[i]) - } - keyPtrs = append(keyPtrs, "/"+strings.Join(tokens, "/")) - } - - resp.Bindings = append(resp.Bindings, &pc.DiscoverResponse_Binding{ - RecommendedName: stream.Name, - ResourceSpecJson: json.RawMessage(resourceSpec), - DocumentSchemaJson: stream.JSONSchema, - KeyPtrs: keyPtrs, - }) - } - return resp, nil } @@ -252,57 +201,39 @@ func (d driver) Validate(ctx context.Context, req *pc.ValidateRequest) (*pc.Vali ops.LogSourceField: source.Image, "operation": "validate", }) + req.EndpointSpecJson = decrypted - var status *airbyte.ConnectionStatus + var resp *pc.ValidateResponse err = connector.Run(ctx, source.Image, connector.Capture, d.networkName, []string{ - "check", - "--config", - "/tmp/config.json", + "validate", }, - // Write configuration JSON to connector input. - map[string]json.RawMessage{"config.json": decrypted}, - // No stdin is sent to the connector. - func(w io.Writer) error { return nil }, - // Expect to decode Airbyte messages, and a ConnectionStatus specifically. - connector.NewJSONOutput( - func() interface{} { return new(airbyte.Message) }, - func(i interface{}) error { - if rec := i.(*airbyte.Message); rec.Log != nil { - logger.Log(airbyteToLogrusLevel(rec.Log.Level), nil, rec.Log.Message) - } else if rec.ConnectionStatus != nil { - status = rec.ConnectionStatus - } else { - return fmt.Errorf("unexpected connector message: %v", rec) + nil, + func(w io.Writer) error { + defer connector.ZeroBytes(decrypted) + return protoio.NewUint32DelimitedWriter(w, binary.LittleEndian). + WriteMsg(req) + }, + connector.NewProtoOutput( + func() proto.Message { return new(pc.ValidateResponse) }, + func(m proto.Message) error { + if resp != nil { + return fmt.Errorf("read more than one ValidateResponse") } + resp = m.(*pc.ValidateResponse) return nil }, - onStdoutDecodeError(logger), ), logger, ) - // Expect connector spit out a successful ConnectionStatus. - if err == nil && status == nil { - err = fmt.Errorf("connector didn't produce a ConnectionStatus") - } else if err == nil && status.Status != airbyte.StatusSucceeded { - err = fmt.Errorf("%s: %s", status.Status, status.Message) + if err == nil && resp == nil { + err = fmt.Errorf("connector didn't produce a response") } if err != nil { return nil, err } - // Parse stream bindings and send back their resource paths. 
- var resp = new(pc.ValidateResponse) - for _, binding := range req.Bindings { - var stream = new(ResourceSpec) - if err := pf.UnmarshalStrict(binding.ResourceSpecJson, stream); err != nil { - return nil, fmt.Errorf("parsing stream configuration: %w", err) - } - resp.Bindings = append(resp.Bindings, &pc.ValidateResponse_Binding{ - ResourcePath: []string{stream.Stream}, - }) - } return resp, nil } @@ -332,102 +263,36 @@ func (d driver) Pull(stream pc.Driver_PullServer) error { return fmt.Errorf("parsing connector configuration: %w", err) } - var open = req.Open - var streamToBinding = make(map[string]int) var logger = ops.NewLoggerWithFields(d.logger, logrus.Fields{ ops.LogSourceField: source.Image, "operation": "read", }) - // Build configured Airbyte catalog. - var catalog = airbyte.ConfiguredCatalog{ - Streams: nil, - Tail: open.Tail, - Range: airbyte.Range{ - Begin: open.KeyBegin, - End: open.KeyEnd, - }, - } - for i, binding := range open.Capture.Bindings { - var resource = new(ResourceSpec) - if err := pf.UnmarshalStrict(binding.ResourceSpecJson, resource); err != nil { - return fmt.Errorf("parsing stream configuration: %w", err) - } - - var projections = make(map[string]string) - for _, p := range binding.Collection.Projections { - projections[p.Field] = p.Ptr - } - - var primaryKey = make([][]string, 0, len(binding.Collection.KeyPtrs)) - for _, key := range binding.Collection.KeyPtrs { - if ptr, err := jsonpointer.New(key); err != nil { - return fmt.Errorf("parsing json pointer: %w", err) - } else { - primaryKey = append(primaryKey, ptr.DecodedTokens()) - } - } - - catalog.Streams = append(catalog.Streams, - airbyte.ConfiguredStream{ - SyncMode: resource.SyncMode, - DestinationSyncMode: airbyte.DestinationSyncModeAppend, - PrimaryKey: primaryKey, - Stream: airbyte.Stream{ - Name: resource.Stream, - Namespace: resource.Namespace, - JSONSchema: binding.Collection.SchemaJson, - SupportedSyncModes: []airbyte.SyncMode{resource.SyncMode}, - }, - Projections: projections, - }) - streamToBinding[resource.Stream] = i - } - - catalogJSON, err := json.Marshal(&catalog) - if err != nil { - return fmt.Errorf("encoding catalog: %w", err) - } - logger.Log(logrus.DebugLevel, logrus.Fields{ - "catalog": &catalog, - }, "using configured catalog") - decrypted, err := connector.DecryptConfig(stream.Context(), source.Config) if err != nil { return err } defer connector.ZeroBytes(decrypted) // RunConnector will also ZeroBytes(). - var invokeArgs = []string{ - "read", - "--config", - "/tmp/config.json", - "--catalog", - "/tmp/catalog.json", - } - var invokeFiles = map[string]json.RawMessage{ - "config.json": decrypted, - "catalog.json": catalogJSON, - } - - if len(open.DriverCheckpointJson) != 0 { - invokeArgs = append(invokeArgs, "--state", "/tmp/state.json") - // Copy because RunConnector will ZeroBytes() once sent and, - // as noted in driver{}, we don't own this memory. - invokeFiles["state.json"] = append([]byte(nil), open.DriverCheckpointJson...) - } + req.Open.Capture.EndpointSpecJson = decrypted if err := stream.Send(&pc.PullResponse{Opened: &pc.PullResponse_Opened{}}); err != nil { return fmt.Errorf("sending Opened: %w", err) } - var resp *pc.PullResponse - // Invoke the connector for reading. 
- if err := connector.Run(stream.Context(), source.Image, connector.Capture, d.networkName, - invokeArgs, - invokeFiles, + return connector.Run(stream.Context(), source.Image, connector.Capture, d.networkName, + []string{"pull"}, + nil, func(w io.Writer) error { + defer connector.ZeroBytes(decrypted) + var enc = protoio.NewUint32DelimitedWriter(w, binary.LittleEndian) + var err = enc.WriteMsg(req) + + if err != nil { + return fmt.Errorf("proxying Open: %w", err) + } + for { var req, err = stream.Recv() if err == io.EOF { @@ -443,49 +308,14 @@ func (d driver) Pull(stream pc.Driver_PullServer) error { } } }, - // Expect to decode Airbyte messages. - connector.NewJSONOutput( - func() interface{} { return new(airbyte.Message) }, - func(i interface{}) error { - if rec := i.(*airbyte.Message); rec.Log != nil { - logger.Log(airbyteToLogrusLevel(rec.Log.Level), nil, rec.Log.Message) - } else if rec.State != nil { - return pc.WritePullCheckpoint(stream, &resp, - &pf.DriverCheckpoint{ - DriverCheckpointJson: rec.State.Data, - Rfc7396MergePatch: rec.State.Merge, - }) - } else if rec.Record != nil { - if b, ok := streamToBinding[rec.Record.Stream]; ok { - return pc.StagePullDocuments(stream, &resp, b, rec.Record.Data) - } - return fmt.Errorf("connector record with unknown stream %q", rec.Record.Stream) - } else { - return fmt.Errorf("unexpected connector message: %v", rec) - } - return nil + connector.NewProtoOutput( + func() proto.Message { return new(pc.PullResponse) }, + func(m proto.Message) error { + return stream.Send(m.(*pc.PullResponse)) }, - onStdoutDecodeError(logger), ), logger, - ); err != nil { - return err - } - - if resp == nil { - return nil // Connector flushed prior to exiting. All done. - } - - // Write a final commit, followed by EOF. - // This happens only when a connector writes output and exits _without_ - // writing a final state checkpoint. We generate a synthetic commit now, - // and the nil checkpoint means the assumed behavior of the next invocation - // will be "full refresh". - return pc.WritePullCheckpoint(stream, &resp, - &pf.DriverCheckpoint{ - DriverCheckpointJson: nil, - Rfc7396MergePatch: false, - }) + ) } // onStdoutDecodeError returns a function that is invoked whenever there's an error parsing a line diff --git a/go/connector/run.go b/go/connector/run.go index a783176207..6b5926a1a2 100644 --- a/go/connector/run.go +++ b/go/connector/run.go @@ -105,41 +105,39 @@ func Run( } defer os.RemoveAll(tempdir) - if protocol == Materialize { - if connectorProxyPath, err := prepareFlowConnectorProxyBinary(tempdir); err != nil { - return fmt.Errorf("prepare flow connector proxy binary: %w", err) - } else { - imageArgs = append(imageArgs, - "--entrypoint", connectorProxyPath, - "--mount", fmt.Sprintf("type=bind,source=%[1]s,target=%[1]s", connectorProxyPath), - ) - } + if connectorProxyPath, err := prepareFlowConnectorProxyBinary(tempdir); err != nil { + return fmt.Errorf("prepare flow connector proxy binary: %w", err) + } else { + imageArgs = append(imageArgs, + "--entrypoint", connectorProxyPath, + "--mount", fmt.Sprintf("type=bind,source=%[1]s,target=%[1]s", connectorProxyPath), + ) + } - if err := pullRemoteImage(ctx, image, logger); err != nil { - // This might be a local image. Log an error and keep going. - // If the image does not exist locally, the inspectImage will return an error and terminate the workflow. 
- logger.Log(logrus.InfoLevel, logrus.Fields{ - "error": err, - }, "pull remote image does not succeed.") - } + if err := pullRemoteImage(ctx, image, logger); err != nil { + // This might be a local image. Log an error and keep going. + // If the image does not exist locally, the inspectImage will return an error and terminate the workflow. + logger.Log(logrus.InfoLevel, logrus.Fields{ + "error": err, + }, "pull remote image does not succeed.") + } - if inspectOutput, err := inspectImage(ctx, image); err != nil { - return fmt.Errorf("inspect image: %w", err) - } else { - if jsonFiles == nil { - jsonFiles = map[string]json.RawMessage{imageInspectJsonFileName: inspectOutput} + if inspectOutput, err := inspectImage(ctx, image); err != nil { + return fmt.Errorf("inspect image: %w", err) + } else { + if jsonFiles == nil { + jsonFiles = map[string]json.RawMessage{imageInspectJsonFileName: inspectOutput} - } else { - jsonFiles[imageInspectJsonFileName] = inspectOutput - } + } else { + jsonFiles[imageInspectJsonFileName] = inspectOutput } - - args = append([]string{ - fmt.Sprintf("--image-inspect-json-path=/tmp/%s", imageInspectJsonFileName), - protocol.proxyCommand(), - }, args...) } + args = append([]string{ + fmt.Sprintf("--image-inspect-json-path=/tmp/%s", imageInspectJsonFileName), + protocol.proxyCommand(), + }, args...) + for name, data := range jsonFiles { var hostPath = filepath.Join(tempdir, name) var containerPath = filepath.Join("/tmp", name) From 83d5d6071c70a5cf77c4735af81be837e4788218 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Tue, 29 Mar 2022 16:31:02 +0000 Subject: [PATCH 2/9] TODO for infer_runtime_protocol to use labels instead of static prefix --- crates/connector_proxy/src/libs/image_inspect.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/connector_proxy/src/libs/image_inspect.rs b/crates/connector_proxy/src/libs/image_inspect.rs index aa2d9642ee..0004dbbad2 100644 --- a/crates/connector_proxy/src/libs/image_inspect.rs +++ b/crates/connector_proxy/src/libs/image_inspect.rs @@ -62,6 +62,8 @@ impl ImageInspect { } } + // TODO: change this to allow arbitrary docker images to be recognized + // as a materialization if let Some(repo_tags) = &self.repo_tags { for tag in repo_tags { if tag.starts_with("ghcr.io/estuary/materialize-") { From 7e97455c6c9818b8adf5a8da8b559cf775df3a13 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Wed, 30 Mar 2022 15:16:14 +0000 Subject: [PATCH 3/9] fix bug with airbyte invoke args and catalog spec --- .../src/interceptors/airbyte_source_interceptor.rs | 2 +- crates/connector_proxy/src/libs/airbyte_catalog.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs index ba61bd3509..0d816f99fc 100644 --- a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs @@ -376,7 +376,7 @@ impl AirbyteSourceInterceptor { }; let airbyte_args: Vec = airbyte_args.into_iter().map(Into::into).collect(); - Ok([airbyte_args, args].concat()) + Ok([args, airbyte_args].concat()) } pub fn adapt_request_stream( diff --git a/crates/connector_proxy/src/libs/airbyte_catalog.rs b/crates/connector_proxy/src/libs/airbyte_catalog.rs index dc54c110ca..a0be6bc01d 100644 --- a/crates/connector_proxy/src/libs/airbyte_catalog.rs +++ b/crates/connector_proxy/src/libs/airbyte_catalog.rs @@ -183,7 +183,7 @@ pub struct Spec { 
pub documentation_url: Option, pub changelog_url: Option, pub connection_specification: Box, - pub supports_incremental: bool, + pub supports_incremental: Option, // SupportedDestinationSyncModes is ignored by Flow pub supported_destination_sync_modes: Option>, From 73157edfe8517ab3e6257dae683dd66ffaa6f746 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Wed, 30 Mar 2022 20:12:09 +0000 Subject: [PATCH 4/9] fix airbyte specification --- .../src/interceptors/airbyte_source_interceptor.rs | 4 ++-- crates/connector_proxy/src/libs/airbyte_catalog.rs | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs index 0d816f99fc..7f5c5d4960 100644 --- a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs @@ -116,7 +116,7 @@ impl AirbyteSourceInterceptor { if let Some(catalog) = message.catalog { let mut resp = DiscoverResponse::default(); for stream in catalog.streams { - let mode = if stream.supported_sync_modes.contains(&SyncMode::Incremental) {SyncMode::Incremental} else {SyncMode::FullRefresh}; + let mode = if stream.supported_sync_modes.map(|modes| modes.contains(&SyncMode::Incremental)).unwrap_or(false) {SyncMode::Incremental} else {SyncMode::FullRefresh}; let resource_spec = ResourceSpec { stream: stream.name.clone(), namespace: stream.namespace, @@ -243,7 +243,7 @@ impl AirbyteSourceInterceptor { name: resource.stream, namespace: resource.namespace, json_schema: RawValue::from_string(collection.schema_json.clone())?, - supported_sync_modes: vec![resource.sync_mode.clone()], + supported_sync_modes: Some(vec![resource.sync_mode.clone()]), default_cursor_field: None, source_defined_cursor: None, source_defined_primary_key: None, diff --git a/crates/connector_proxy/src/libs/airbyte_catalog.rs b/crates/connector_proxy/src/libs/airbyte_catalog.rs index a0be6bc01d..3aff540afe 100644 --- a/crates/connector_proxy/src/libs/airbyte_catalog.rs +++ b/crates/connector_proxy/src/libs/airbyte_catalog.rs @@ -19,7 +19,7 @@ pub struct Stream { pub name: String, pub json_schema: Box, #[validate(length(min = 1))] - pub supported_sync_modes: Vec, + pub supported_sync_modes: Option>, pub source_defined_cursor: Option, pub default_cursor_field: Option>, pub source_defined_primary_key: Option>>, @@ -50,7 +50,13 @@ pub struct ConfiguredStream { } impl ConfiguredStream { fn validate_configured_stream(&self) -> Result<(), ValidationError> { - if self.stream.supported_sync_modes.contains(&self.sync_mode) { + if self + .stream + .supported_sync_modes + .as_ref() + .map(|modes| modes.contains(&self.sync_mode)) + .unwrap_or(false) + { Ok(()) } else { Err(ValidationError::new( @@ -123,7 +129,7 @@ pub enum Status { #[serde(rename_all = "snake_case")] pub struct ConnectionStatus { pub status: Status, - pub message: String, + pub message: Option, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -131,7 +137,7 @@ pub struct ConnectionStatus { pub struct Record { pub stream: String, pub data: Box, - pub emitted_at: Option, + pub emitted_at: i64, pub namespace: Option, } From e054526870b3a42a2cfe737a4f72bf7e49b89e53 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Thu, 31 Mar 2022 07:54:23 +0000 Subject: [PATCH 5/9] add a bunch of TODOs --- .../src/interceptors/airbyte_source_interceptor.rs | 2 ++ crates/connector_proxy/src/libs/airbyte_catalog.rs | 2 ++ 
crates/connector_proxy/src/libs/network_proxy.rs | 2 +- crates/connector_proxy/src/libs/stream.rs | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs index 7f5c5d4960..ab15ace019 100644 --- a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs @@ -125,6 +125,8 @@ impl AirbyteSourceInterceptor { let key_ptrs = match stream.source_defined_primary_key { None => Vec::new(), + // TODO: use doc::Pointer, and if necessary implement creation of new json pointers + // in that module. What about the existing tokenize_jsonpointer function? Some(keys) => keys.iter().map(|k| JsonPointer::new(k).to_string()).collect() }; resp.bindings.push(discover_response::Binding{ diff --git a/crates/connector_proxy/src/libs/airbyte_catalog.rs b/crates/connector_proxy/src/libs/airbyte_catalog.rs index 3aff540afe..91f6a0c19e 100644 --- a/crates/connector_proxy/src/libs/airbyte_catalog.rs +++ b/crates/connector_proxy/src/libs/airbyte_catalog.rs @@ -18,6 +18,8 @@ pub enum SyncMode { pub struct Stream { pub name: String, pub json_schema: Box, + // supported_sync_modes is planned to be made required soon + // see https://is.gd/RqAhTO #[validate(length(min = 1))] pub supported_sync_modes: Option>, pub source_defined_cursor: Option, diff --git a/crates/connector_proxy/src/libs/network_proxy.rs b/crates/connector_proxy/src/libs/network_proxy.rs index bdbc6d1050..4be21cfb74 100644 --- a/crates/connector_proxy/src/libs/network_proxy.rs +++ b/crates/connector_proxy/src/libs/network_proxy.rs @@ -76,7 +76,7 @@ impl NetworkProxy { let (mut tx, rx) = oneshot::channel(); tokio::spawn(Self::start_network_proxy(network_proxy_config, rx)); - // TODO(jixiang): Refact the network-proxy and remove the timeout logic here after all connectors are converted to work with connector-proxy. + // TODO: Refact the network-proxy and remove the timeout logic here after all connectors are converted to work with connector-proxy. // Block for at most 5 seconds for network proxy to be prepared. if let Err(_) = timeout(std::time::Duration::from_secs(5), tx.closed()).await { diff --git a/crates/connector_proxy/src/libs/stream.rs b/crates/connector_proxy/src/libs/stream.rs index 33d1e441e4..e62789e530 100644 --- a/crates/connector_proxy/src/libs/stream.rs +++ b/crates/connector_proxy/src/libs/stream.rs @@ -13,6 +13,8 @@ use validator::Validate; pub fn stream_all_bytes( mut reader: R, ) -> impl Stream> { + // TODO: can we replace these macros with futures crate StreamExt or TryStreamExt methods? + // e.g. futures::stream::unfold() might be useful. try_stream! { loop { // consistent with the default capacity of ReaderStream. From d34cb05741b1b876b896626e9ff31a4cabeedadc Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Fri, 1 Apr 2022 07:20:04 +0000 Subject: [PATCH 6/9] use try_join! 
to surface errors from connector --- crates/connector_proxy/src/connector_runner.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/connector_proxy/src/connector_runner.rs b/crates/connector_proxy/src/connector_runner.rs index 28dc1b4255..4a2d3fc824 100644 --- a/crates/connector_proxy/src/connector_runner.rs +++ b/crates/connector_proxy/src/connector_runner.rs @@ -128,12 +128,12 @@ async fn streaming_all( let mut response_stream_writer = tokio::io::stdout(); let mut error_writer = tokio::io::stderr(); - let (a, b, c) = tokio::join!( + let (a, b, c) = tokio::try_join!( copy(&mut request_stream_reader, &mut request_stream_writer), copy(&mut response_stream_reader, &mut response_stream_writer), copy(&mut error_reader, &mut error_writer), - ); + )?; - tracing::info!("Done streaming, transferred bytes: {} {} {}", a?, b?, c?); + tracing::info!("Done streaming, transferred bytes: {} {} {}", a, b, c); Ok(()) } From 47432ec7d0bd9035ea6043d9cff3c4792ce11751 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Fri, 8 Apr 2022 15:44:04 +0100 Subject: [PATCH 7/9] Connector Proxy stream refactor (#435) * wip moving away from try_stream! * refactor simple streams into stream::once * more stream refactoring * more refactorings * simplify stream usage with helper functions * more comments * two more small streams --- .vscode/settings.json | 1 + Cargo.lock | 251 +---------- Makefile | 2 +- crates/connector_proxy/Cargo.toml | 2 +- crates/connector_proxy/src/apis.rs | 10 +- .../connector_proxy/src/connector_runner.rs | 7 +- crates/connector_proxy/src/errors.rs | 6 +- .../airbyte_source_interceptor.rs | 420 +++++++++--------- .../network_proxy_capture_interceptor.rs | 114 +++-- .../network_proxy_materialize_interceptor.rs | 103 +++-- .../src/libs/airbyte_catalog.rs | 10 +- crates/connector_proxy/src/libs/stream.rs | 289 +++++++++--- crates/connector_proxy/src/main.rs | 2 +- crates/network-proxy/src/main.rs | 1 - crates/network-proxy/src/sshforwarding.rs | 2 + 15 files changed, 598 insertions(+), 622 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index e871e49d00..7b3b3c8b84 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -21,6 +21,7 @@ "files.trimTrailingWhitespace": true, "editor.formatOnSave": true, "cSpell.words": [ + "airbyte", "Firebolt", "schemalate" ], diff --git a/Cargo.lock b/Cargo.lock index 8ec7d684ec..316a29598b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -236,9 +236,9 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] name = "base64ct" -version = "1.4.1" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71acf5509fc522cce1b100ac0121c635129bfd4d91cdf036bcc9b9935f97ccf5" +checksum = "8a32fd6af2b5827bce66c29053ba0e7c42b9dcab01835835058558c10851a46b" [[package]] name = "bcrypt-pbkdf" @@ -536,7 +536,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom 7.1.1", + "nom", ] [[package]] @@ -662,13 +662,13 @@ dependencies = [ name = "connector_proxy" version = "0.0.0" dependencies = [ - "async-stream", "async-trait", "byteorder", "bytes", "clap 3.1.8", "doc", "flow_cli_common", + "futures", "futures-core", "futures-util", "json-pointer", @@ -823,16 +823,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "crossbeam-queue" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" -dependencies = [ - "cfg-if", - "crossbeam-utils", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -895,16 +885,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "ctor" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f877be4f7c9f246b183111634f75baa039715e3f46ce860677d3b19a69fb229c" -dependencies = [ - "quote", - "syn", -] - [[package]] name = "ctr" version = "0.8.0" @@ -1016,25 +996,7 @@ dependencies = [ "lazy_static", "quickcheck", "quickcheck_macros", - "serde 1.0.136", - "serde_json", - "serde_yaml", - "thiserror", - "tinyvec", - "tracing", - "url", -] - -[[package]] -name = "doc" -version = "0.0.0" -source = "git+https://github.com/estuary/flow#4f2ca48fda98b608dd2dc2d920dc40ecc60150f5" -dependencies = [ - "fancy-regex", - "itertools", - "json 0.0.0 (git+https://github.com/estuary/flow)", - "lazy_static", - "serde 1.0.136", + "serde", "serde_json", "serde_yaml", "thiserror", @@ -1723,19 +1685,16 @@ dependencies = [ ] [[package]] -name = "json" -version = "0.0.0" -source = "git+https://github.com/estuary/flow#4f2ca48fda98b608dd2dc2d920dc40ecc60150f5" +name = "json-patch" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f995a3c8f2bc3dd52a18a583e90f9ec109c047fa1603a853e46bcda14d2e279d" dependencies = [ "serde", "serde_json", "treediff", ] -[[package]] -name = "labels" -version = "0.0.0" - [[package]] name = "json-pointer" version = "0.3.4" @@ -1745,6 +1704,10 @@ dependencies = [ "serde_json", ] +[[package]] +name = "labels" +version = "0.0.0" + [[package]] name = "lazy_static" version = "1.4.0" @@ -1759,9 +1722,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.120" +version = "0.2.122" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad5c14e80759d0939d013e6ca49930e59fc53dd8e5009132f76240c179380c09" +checksum = "ec647867e2bf0772e28c8bcde4f0d19a9216916e890543b5a03ed8ef27b8f259" [[package]] name = "libflate" @@ -2031,17 +1994,6 @@ dependencies = [ "url", ] -[[package]] -name = "nom" -version = "5.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" -dependencies = [ - "lexical-core", - "memchr", - "version_check", -] - [[package]] name = "nom" version = "7.1.1" @@ -3005,40 +2957,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rustls" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64", - "log", - "ring", - "sct 0.6.1", - "webpki 0.21.4", -] - -[[package]] -name = "rustls" -version = "0.20.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbfeb8d0ddb84706bc597a5574ab8912817c52a397f819e5b614e2265206921" -dependencies = [ - "log", - "ring", - "sct 0.7.0", - "webpki 0.22.0", -] - -[[package]] -name = "rustls-pemfile" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ee86d63972a7c661d1536fefe8c3c8407321c3df668891286de28abcd087360" -dependencies = [ - "base64", -] - [[package]] name = "rustversion" version = "1.0.6" @@ -3375,139 +3293,6 @@ dependencies = [ "yaml-merge-keys", ] -[[package]] -name = "spin" -version = "0.5.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - -[[package]] -name = "sqlformat" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4b7922be017ee70900be125523f38bdd644f4f06a1b16e8fa5a8ee8c34bffd4" -dependencies = [ - "itertools", - "nom 7.1.1", - "unicode_categories", -] - -[[package]] -name = "sqlx" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc15591eb44ffb5816a4a70a7efd5dd87bfd3aa84c4c200401c4396140525826" -dependencies = [ - "sqlx-core", - "sqlx-macros", -] - -[[package]] -name = "sqlx-core" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "195183bf6ff8328bb82c0511a83faf60aacf75840103388851db61d7a9854ae3" -dependencies = [ - "ahash", - "atoi", - "base64", - "bitflags", - "byteorder", - "bytes", - "chrono", - "crc 2.1.0", - "crossbeam-queue", - "dirs 4.0.0", - "either", - "futures-channel", - "futures-core", - "futures-intrusive", - "futures-util", - "hashlink", - "hex", - "hmac 0.11.0", - "indexmap", - "itoa 1.0.1", - "libc", - "log", - "md-5", - "memchr", - "once_cell", - "paste 1.0.6", - "percent-encoding", - "rand 0.8.5", - "rustls 0.19.1", - "serde 1.0.136", - "serde_json", - "sha-1 0.9.8", - "sha2 0.9.9", - "smallvec", - "sqlformat", - "sqlx-rt", - "stringprep", - "thiserror", - "tokio-stream", - "url", - "webpki 0.21.4", - "webpki-roots 0.21.1", - "whoami", -] - -[[package]] -name = "sqlx-macros" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee35713129561f5e55c554bba1c378e2a7e67f81257b7311183de98c50e6f94" -dependencies = [ - "dotenv", - "either", - "heck 0.3.3", - "hex", - "once_cell", - "proc-macro2", - "quote", - "serde 1.0.136", - "serde_json", - "sha2 0.9.9", - "sqlx-core", - "sqlx-rt", - "syn", - "url", -] - -[[package]] -name = "sqlx-rt" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b555e70fbbf84e269ec3858b7a6515bcfe7a166a7cc9c636dd6efd20431678b6" -dependencies = [ - "once_cell", - "tokio", - "tokio-rustls 0.22.0", -] - -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "stats_alloc" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a260c96bf26273969f360c2fc2e2c7732acc2ce49d939c7243c7230c2ad179d0" - -[[package]] -name = "stringprep" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee348cb74b87454fff4b551cbf727025810a004f88aeacae7f85b87f4e9a1c1" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - [[package]] name = "strsim" version = "0.8.0" @@ -4420,12 +4205,6 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - [[package]] name = "wasm-bindgen" version = "0.2.79" diff --git a/Makefile b/Makefile index a2390714a2..4d38e90362 100644 --- a/Makefile +++ b/Makefile @@ -281,4 +281,4 @@ docker-push: # It is 
invoked only for builds on the master branch. .PHONY: docker-push-dev docker-push-dev: - docker push ghcr.io/estuary/flow:dev + docker push ghcr.io/estuary/flow:dev \ No newline at end of file diff --git a/crates/connector_proxy/Cargo.toml b/crates/connector_proxy/Cargo.toml index 8f93abdf0f..72fab3ce59 100644 --- a/crates/connector_proxy/Cargo.toml +++ b/crates/connector_proxy/Cargo.toml @@ -14,12 +14,12 @@ network-proxy = { path = "../network-proxy", version = "0.1.0" } protocol = { path = "../protocol", version = "0.0.0" } async-trait="*" -async-stream="*" bytes = "*" byteorder="*" clap = { version = "^3", features = ["derive"] } futures-core = "*" futures-util="*" +futures="*" json-pointer="*" libc="*" prost = "*" diff --git a/crates/connector_proxy/src/apis.rs b/crates/connector_proxy/src/apis.rs index 08cc36cb81..7c6c781bbd 100644 --- a/crates/connector_proxy/src/apis.rs +++ b/crates/connector_proxy/src/apis.rs @@ -1,6 +1,6 @@ use bytes::Bytes; use clap::ArgEnum; -use futures_core::stream::Stream; +use futures::TryStream; use std::pin::Pin; // The protocol used by FlowRuntime to speak with connector-proxy. @@ -42,4 +42,10 @@ pub enum FlowMaterializeOperation { // An interceptor modifies the request/response streams between Flow runtime and the connector. // InterceptorStream defines the type of input and output streams handled by interceptors. -pub type InterceptorStream = Pin> + Send + Sync>>; +pub type InterceptorStream = Pin< + Box< + dyn TryStream> + + Send + + Sync, + >, +>; diff --git a/crates/connector_proxy/src/connector_runner.rs b/crates/connector_proxy/src/connector_runner.rs index 4a2d3fc824..4b97d36b79 100644 --- a/crates/connector_proxy/src/connector_runner.rs +++ b/crates/connector_proxy/src/connector_runner.rs @@ -134,6 +134,11 @@ async fn streaming_all( copy(&mut error_reader, &mut error_writer), )?; - tracing::info!("Done streaming, transferred bytes: {} {} {}", a, b, c); + tracing::info!( + req_stream = a, + resp_stream = b, + stderr = c, + "Done streaming" + ); Ok(()) } diff --git a/crates/connector_proxy/src/errors.rs b/crates/connector_proxy/src/errors.rs index 35e5d938a3..7473694484 100644 --- a/crates/connector_proxy/src/errors.rs +++ b/crates/connector_proxy/src/errors.rs @@ -60,8 +60,11 @@ pub enum Error { #[error("The operation of '{0}' is not expected for the given protocol.")] UnexpectedOperation(String), } +/*> +> +>>*/ -pub fn raise_custom_error(message: &str) -> Result<(), std::io::Error> { +pub fn raise_err(message: &str) -> Result { Err(create_custom_error(message)) } @@ -69,6 +72,7 @@ pub fn create_custom_error(message: &str) -> std::io::Error { std::io::Error::new(std::io::ErrorKind::Other, message) } +// TODO: refactor to remove or_bail usages pub trait Must { fn or_bail(self) -> T; } diff --git a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs index ab15ace019..283380fc6d 100644 --- a/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/airbyte_source_interceptor.rs @@ -1,16 +1,15 @@ use crate::apis::{FlowCaptureOperation, InterceptorStream}; -use crate::errors::{create_custom_error, raise_custom_error, Error}; +use crate::errors::{create_custom_error, raise_err, Error}; use crate::libs::airbyte_catalog::{ self, ConfiguredCatalog, ConfiguredStream, DestinationSyncMode, Range, ResourceSpec, Status, SyncMode, }; use crate::libs::command::READY; use crate::libs::json::{create_root_schema, 
tokenize_jsonpointer}; -use crate::libs::protobuf::{decode_message, encode_message}; -use crate::libs::stream::stream_all_airbyte_messages; +use crate::libs::protobuf::encode_message; +use crate::libs::stream::{get_airbyte_response, get_decoded_message, stream_airbyte_responses}; -use async_stream::try_stream; use bytes::Bytes; use protocol::capture::{ discover_response, validate_response, DiscoverRequest, DiscoverResponse, Documents, @@ -22,13 +21,12 @@ use std::sync::Arc; use tokio::sync::Mutex; use validator::Validate; -use futures_util::StreamExt; +use futures::{stream, StreamExt, TryStreamExt}; use json_pointer::JsonPointer; use serde_json::value::RawValue; use std::fs::File; use std::io::Write; use tempfile::{Builder, TempDir}; -use tokio_util::io::StreamReader; const CONFIG_FILE_NAME: &str = "config.json"; const CATALOG_FILE_NAME: &str = "catalog.json"; @@ -53,37 +51,26 @@ impl AirbyteSourceInterceptor { } fn adapt_spec_request_stream(&mut self, in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(try_stream! { - let mut reader = StreamReader::new(in_stream); - decode_message::(&mut reader).await?.ok_or(create_custom_error("missing spec request."))?; - - yield Bytes::from(READY); - }) + Box::pin(stream::once(async { + get_decoded_message::(in_stream).await?; + Ok(Bytes::from(READY)) + })) } fn adapt_spec_response_stream(&mut self, in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(try_stream! { - let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream)); - loop { - let message = match airbyte_message_stream.next().await { - None => break, - Some(message) => message? - }; - if let Some(spec) = message.spec { - let mut resp = SpecResponse::default(); - resp.endpoint_spec_schema_json = spec.connection_specification.to_string(); - resp.resource_spec_schema_json = serde_json::to_string_pretty(&create_root_schema::())?; - if let Some(url) = spec.documentation_url { - resp.documentation_url = url; - } - yield encode_message(&resp)?; - } else if let Some(mlog) = message.log { - mlog.log(); - } else { - raise_custom_error("unexpected spec response.")?; - } + Box::pin(stream::once(async { + let message = get_airbyte_response(in_stream, |m| m.spec.is_some()).await?; + let spec = message.spec.unwrap(); + + let mut resp = SpecResponse::default(); + resp.endpoint_spec_schema_json = spec.connection_specification.to_string(); + resp.resource_spec_schema_json = + serde_json::to_string_pretty(&create_root_schema::())?; + if let Some(url) = spec.documentation_url { + resp.documentation_url = url; } - }) + encode_message(&resp) + })) } fn adapt_discover_request( @@ -91,60 +78,59 @@ impl AirbyteSourceInterceptor { config_file_path: String, in_stream: InterceptorStream, ) -> InterceptorStream { - Box::pin(try_stream! { - let mut reader = StreamReader::new(in_stream); - let request = decode_message::(&mut reader).await?.ok_or(create_custom_error("missing discover request."))?; + Box::pin(stream::once(async { + let request = get_decoded_message::(in_stream).await?; File::create(config_file_path)?.write_all(request.endpoint_spec_json.as_bytes())?; - yield Bytes::from(READY); - }) + Ok(Bytes::from(READY)) + })) } fn adapt_discover_response_stream( &mut self, in_stream: InterceptorStream, ) -> InterceptorStream { - Box::pin(try_stream! { - let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream)); - loop { - let message = match airbyte_message_stream.next().await { - None => break, - Some(message) => message? 
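The `stream::once` shape introduced above is the recurring pattern of this refactor: each one-shot request or response adaptation becomes a single async block whose `Result` is the stream's only element, and the pinned box satisfies the new `TryStream`-based `InterceptorStream` alias. A minimal, self-contained sketch of the idiom, using a simplified stand-in for the alias (toy types and a made-up READY payload, not the module's actual code):

```rust
use bytes::Bytes;
use futures::stream::{self, Stream, StreamExt};
use std::pin::Pin;

// Simplified stand-in for the InterceptorStream alias in apis.rs.
type ByteStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send>>;

// One-shot adaptation: do the async work once, emit a single READY marker.
fn adapt_ready() -> ByteStream {
    Box::pin(stream::once(async {
        // ...decode and persist the incoming request here...
        Ok(Bytes::from_static(b"READY\n"))
    }))
}

#[tokio::main]
async fn main() {
    let mut s = adapt_ready();
    while let Some(item) = s.next().await {
        println!("{:?}", item.unwrap());
    }
}
```

Because the element type is a `Result`, a `?` failure inside the async block simply becomes the stream's single error item, which is what lets these adapters drop the `try_stream!` macro.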
+ Box::pin(stream::once(async { + let message = get_airbyte_response(in_stream, |m| m.catalog.is_some()).await?; + let catalog = message.catalog.unwrap(); + + let mut resp = DiscoverResponse::default(); + for stream in catalog.streams { + let has_incremental = stream + .supported_sync_modes + .map(|modes| modes.contains(&SyncMode::Incremental)) + .unwrap_or(false); + let mode = if has_incremental { + SyncMode::Incremental + } else { + SyncMode::FullRefresh + }; + let resource_spec = ResourceSpec { + stream: stream.name.clone(), + namespace: stream.namespace, + sync_mode: mode, }; - if let Some(catalog) = message.catalog { - let mut resp = DiscoverResponse::default(); - for stream in catalog.streams { - let mode = if stream.supported_sync_modes.map(|modes| modes.contains(&SyncMode::Incremental)).unwrap_or(false) {SyncMode::Incremental} else {SyncMode::FullRefresh}; - let resource_spec = ResourceSpec { - stream: stream.name.clone(), - namespace: stream.namespace, - sync_mode: mode - }; - - let key_ptrs = match stream.source_defined_primary_key { - None => Vec::new(), - // TODO: use doc::Pointer, and if necessary implement creation of new json pointers - // in that module. What about the existing tokenize_jsonpointer function? - Some(keys) => keys.iter().map(|k| JsonPointer::new(k).to_string()).collect() - }; - resp.bindings.push(discover_response::Binding{ - recommended_name: stream.name.clone(), - resource_spec_json: serde_json::to_string(&resource_spec)?, - key_ptrs: key_ptrs, - document_schema_json: stream.json_schema.to_string(), - }) - } + let key_ptrs = match stream.source_defined_primary_key { + None => Vec::new(), + // TODO: use doc::Pointer, and if necessary implement creation of new json pointers + // in that module. What about the existing tokenize_jsonpointer function? + Some(keys) => keys + .iter() + .map(|k| JsonPointer::new(k).to_string()) + .collect(), + }; + resp.bindings.push(discover_response::Binding { + recommended_name: stream.name.clone(), + resource_spec_json: serde_json::to_string(&resource_spec)?, + key_ptrs: key_ptrs, + document_schema_json: stream.json_schema.to_string(), + }) + } - yield encode_message(&resp)?; - } else if let Some(mlog) = message.log { - mlog.log(); - } else { - raise_custom_error("unexpected discover response.")?; - } - } - }) + encode_message(&resp) + })) } fn adapt_validate_request_stream( @@ -153,15 +139,14 @@ impl AirbyteSourceInterceptor { validate_request: Arc>>, in_stream: InterceptorStream, ) -> InterceptorStream { - Box::pin(try_stream! { - let mut reader = StreamReader::new(in_stream); - let request = decode_message::(&mut reader).await?.ok_or(create_custom_error("missing validate request"))?; + Box::pin(stream::once(async move { + let request = get_decoded_message::(in_stream).await?; *validate_request.lock().await = Some(request.clone()); File::create(config_file_path)?.write_all(request.endpoint_spec_json.as_bytes())?; - yield Bytes::from(READY); - }) + Ok(Bytes::from(READY)) + })) } fn adapt_validate_response_stream( @@ -169,35 +154,30 @@ impl AirbyteSourceInterceptor { validate_request: Arc>>, in_stream: InterceptorStream, ) -> InterceptorStream { - Box::pin(try_stream! { - let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream)); - loop { - let message = match airbyte_message_stream.next().await { - None => break, - Some(message) => message? 
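A note on the key conversion in the discover mapping above: Airbyte reports `source_defined_primary_key` as arrays of path components, while Flow's `key_ptrs` are RFC 6901 JSON pointers, hence the `JsonPointer::new(k).to_string()` conversion. A small illustration of what that conversion yields (the sample key is invented; this assumes the same `json-pointer` crate named in Cargo.toml):

```rust
use json_pointer::JsonPointer;

fn main() {
    // An Airbyte-style key: one primary key made of two path components.
    let key = vec!["user".to_string(), "id".to_string()];

    // JsonPointer::new takes the components; Display renders the RFC 6901
    // form, escaping '/' and '~' inside component names as needed.
    let ptr = JsonPointer::new(key);
    assert_eq!(ptr.to_string(), "/user/id");
}
```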
- }; + Box::pin(stream::once(async move { + let message = + get_airbyte_response(in_stream, |m| m.connection_status.is_some()).await?; - if let Some(connection_status) = message.connection_status { - if connection_status.status != Status::Succeeded { - raise_custom_error(&format!("validation failed {:?}", connection_status))?; - } + let connection_status = message.connection_status.unwrap(); - let req = validate_request.lock().await; - let req = req.as_ref().ok_or(create_custom_error("missing validate request."))?; - let mut resp = ValidateResponse::default(); - for binding in &req.bindings { - let resource: ResourceSpec = serde_json::from_str(&binding.resource_spec_json)?; - resp.bindings.push(validate_response::Binding {resource_path: vec![resource.stream]}); - } - drop(req); - yield encode_message(&resp)?; - } else if let Some(mlog) = message.log { - mlog.log(); - } else { - raise_custom_error("unexpected validate response.")?; - } - } - }) + if connection_status.status != Status::Succeeded { + return raise_err(&format!("validation failed {:?}", connection_status)); + } + + let req = validate_request.lock().await; + let req = req + .as_ref() + .ok_or(create_custom_error("missing validate request."))?; + let mut resp = ValidateResponse::default(); + for binding in &req.bindings { + let resource: ResourceSpec = serde_json::from_str(&binding.resource_spec_json)?; + resp.bindings.push(validate_response::Binding { + resource_path: vec![resource.stream], + }); + } + + encode_message(&resp) + })) } fn adapt_pull_request_stream( @@ -208,66 +188,83 @@ impl AirbyteSourceInterceptor { stream_to_binding: Arc>>, in_stream: InterceptorStream, ) -> InterceptorStream { - Box::pin(try_stream! { - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await?.ok_or(create_custom_error("missing pull request"))?; - if let Some(ref mut o) = request.open { - File::create(state_file_path)?.write_all(&o.driver_checkpoint_json)?; - - if let Some(ref mut c) = o.capture { - File::create(config_file_path)?.write_all(&c.endpoint_spec_json.as_bytes())?; - - let mut catalog = ConfiguredCatalog { - streams: Vec::new(), - tail: o.tail, - range: Range { begin: o.key_begin, end: o.key_end } - }; - - let mut stream_to_binding = stream_to_binding.lock().await; - - for (i, binding) in c.bindings.iter().enumerate() { - let resource: ResourceSpec = serde_json::from_str(&binding.resource_spec_json)?; - stream_to_binding.insert(resource.stream.clone(), i); - - let mut projections = HashMap::new(); - if let Some(ref collection) = binding.collection { - for p in &collection.projections { - projections.insert(p.field.clone(), p.ptr.clone()); + Box::pin( + stream::once(async move { + let mut request = get_decoded_message::(in_stream).await?; + if let Some(ref mut o) = request.open { + File::create(state_file_path)?.write_all(&o.driver_checkpoint_json)?; + + if let Some(ref mut c) = o.capture { + File::create(config_file_path)? 
+ .write_all(&c.endpoint_spec_json.as_bytes())?; + + let mut catalog = ConfiguredCatalog { + streams: Vec::new(), + tail: o.tail, + range: Range { + begin: o.key_begin, + end: o.key_end, + }, + }; + + let mut stream_to_binding = stream_to_binding.lock().await; + + for (i, binding) in c.bindings.iter().enumerate() { + let resource: ResourceSpec = + serde_json::from_str(&binding.resource_spec_json)?; + stream_to_binding.insert(resource.stream.clone(), i); + + let mut projections = HashMap::new(); + if let Some(ref collection) = binding.collection { + for p in &collection.projections { + projections.insert(p.field.clone(), p.ptr.clone()); + } + + let primary_key: Vec> = collection + .key_ptrs + .iter() + .map(|ptr| tokenize_jsonpointer(ptr)) + .collect(); + catalog.streams.push(ConfiguredStream { + sync_mode: resource.sync_mode.clone(), + destination_sync_mode: DestinationSyncMode::Append, + cursor_field: None, + primary_key: Some(primary_key), + stream: airbyte_catalog::Stream { + name: resource.stream, + namespace: resource.namespace, + json_schema: RawValue::from_string( + collection.schema_json.clone(), + )?, + supported_sync_modes: Some(vec![resource + .sync_mode + .clone()]), + default_cursor_field: None, + source_defined_cursor: None, + source_defined_primary_key: None, + }, + projections: projections, + }); } + } - let primary_key: Vec> = collection.key_ptrs.iter().map(|ptr| tokenize_jsonpointer(ptr)).collect(); - catalog.streams.push(ConfiguredStream{ - sync_mode: resource.sync_mode.clone(), - destination_sync_mode: DestinationSyncMode::Append, - cursor_field: None, - primary_key: Some(primary_key), - stream: airbyte_catalog::Stream{ - name: resource.stream, - namespace: resource.namespace, - json_schema: RawValue::from_string(collection.schema_json.clone())?, - supported_sync_modes: Some(vec![resource.sync_mode.clone()]), - default_cursor_field: None, - source_defined_cursor: None, - source_defined_primary_key: None, - }, - projections: projections, - }); + if let Err(e) = catalog.validate() { + raise_err(&format!("invalid config_catalog: {:?}", e))? } - } - if let Err(e) = catalog.validate() { - raise_custom_error(&format!("invalid config_catalog: {:?}", e))? + serde_json::to_writer(File::create(catalog_file_path)?, &catalog)? } - serde_json::to_writer(File::create(catalog_file_path)?, &catalog)? - } - - // release the lock. - drop(stream_to_binding); + // release the lock. + drop(stream_to_binding); - yield Bytes::from(READY); - } - }) + Ok(Some(Bytes::from(READY))) + } else { + Ok(None) + } + }) + .try_filter_map(|item| futures::future::ready(Ok(item))), + ) } fn adapt_pull_response_stream( @@ -275,67 +272,68 @@ impl AirbyteSourceInterceptor { stream_to_binding: Arc>>, in_stream: InterceptorStream, ) -> InterceptorStream { - Box::pin(try_stream! { - let mut airbyte_message_stream = Box::pin(stream_all_airbyte_messages(in_stream)); - // transaction_pending is true if the connector writes output messages and exits _without_ writing - // a final state checkpoint. - let mut transaction_pending = false; - - loop { - let message = match airbyte_message_stream.next().await { - None => break, - Some(message) => message? 
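One structural detail of the rewritten pull-request adapter above: `stream::once` always yields exactly one item, so the async block returns `Ok(Some(bytes))` when there is a READY to send and `Ok(None)` otherwise, and the trailing `try_filter_map` with an identity future drops the `None`. A standalone sketch of that zero-or-one idiom (toy values, not the interceptor code):

```rust
use futures::stream::{self, TryStreamExt};

#[tokio::main]
async fn main() {
    let emit = false; // flip to true and the stream yields one element

    let s = stream::once(async move {
        if emit {
            Ok::<_, std::io::Error>(Some("READY"))
        } else {
            Ok(None)
        }
    })
    // try_filter_map unwraps the Option: Some(v) is emitted, None is dropped.
    .try_filter_map(|item| futures::future::ready(Ok(item)));

    let items: Vec<&str> = s.try_collect().await.unwrap();
    assert!(items.is_empty());
}
```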
+ let airbyte_message_stream = Box::pin(stream_airbyte_responses(in_stream)); + + Box::pin(stream::try_unfold( + (false, stream_to_binding, airbyte_message_stream), + |(transaction_pending, stb, mut stream)| async move { + let message = match stream.next().await { + Some(m) => m?, + None => { + // transaction_pending is true if the connector writes output messages and exits _without_ writing + // a final state checkpoint. + if transaction_pending { + // We generate a synthetic commit now, and the empty checkpoint means the assumed behavior + // of the next invocation will be "full refresh". + let mut resp = PullResponse::default(); + resp.checkpoint = Some(DriverCheckpoint { + driver_checkpoint_json: Vec::new(), + rfc7396_merge_patch: false, + }); + return Ok(Some((encode_message(&resp)?, (false, stb, stream)))); + } else { + return Ok(None); + } + } }; let mut resp = PullResponse::default(); if let Some(state) = message.state { - resp.checkpoint = Some(DriverCheckpoint{ - driver_checkpoint_json: state.data.get().as_bytes().to_vec(), - rfc7396_merge_patch: match state.merge { - Some(m) => m, - None => false, - }, + resp.checkpoint = Some(DriverCheckpoint { + driver_checkpoint_json: state.data.get().as_bytes().to_vec(), + rfc7396_merge_patch: match state.merge { + Some(m) => m, + None => false, + }, }); - yield encode_message(&resp)?; - transaction_pending = false; + Ok(Some((encode_message(&resp)?, (false, stb, stream)))) } else if let Some(record) = message.record { - let stream_to_binding = stream_to_binding.lock().await; - match stream_to_binding.get(&record.stream) { - None => { - raise_custom_error(&format!("connector record with unknown stream {}", record.stream))?; - } - Some(binding) => { - let arena = record.data.get().as_bytes().to_vec(); - let arena_len: u32 = arena.len() as u32; - resp.documents = Some(Documents { - binding: *binding as u32, - arena: arena, - docs_json: vec![Slice{begin: 0, end: arena_len}] - }) - } - } + let stream_to_binding = stb.lock().await; + let binding = + stream_to_binding + .get(&record.stream) + .ok_or(create_custom_error(&format!( + "connector record with unknown stream {}", + record.stream + )))?; + let arena = record.data.get().as_bytes().to_vec(); + let arena_len: u32 = arena.len() as u32; + resp.documents = Some(Documents { + binding: *binding as u32, + arena: arena, + docs_json: vec![Slice { + begin: 0, + end: arena_len, + }], + }); drop(stream_to_binding); - yield encode_message(&resp)?; - transaction_pending = true; - } else if let Some(mlog) = message.log { - mlog.log(); + Ok(Some((encode_message(&resp)?, (true, stb, stream)))) } else { - raise_custom_error("unexpected pull response.")?; + raise_err("unexpected pull response.") } - } - - if transaction_pending { - // We generate a synthetic commit now, and the empty checkpoint means the assumed behavior - // of the next invocation will be "full refresh". 
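The `stream::try_unfold` call above replaces the old `try_stream!` loop by threading `(transaction_pending, stream_to_binding, inner stream)` through every step as explicit state. A minimal illustration of the combinator on its own (a toy counter, nothing connector-specific):

```rust
use futures::stream::{self, StreamExt};

#[tokio::main]
async fn main() {
    // State is threaded through explicitly; Ok(Some((item, next_state)))
    // yields an item and continues, Ok(None) ends the stream.
    let s = stream::try_unfold(0u32, |count| async move {
        if count < 3 {
            Ok::<_, std::io::Error>(Some((count * 2, count + 1)))
        } else {
            Ok(None)
        }
    });

    let items: Vec<u32> = s.map(Result::unwrap).collect().await;
    assert_eq!(items, vec![0, 2, 4]);
}
```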
- let mut resp = PullResponse::default(); - resp.checkpoint = Some(DriverCheckpoint{ - driver_checkpoint_json: Vec::new(), - rfc7396_merge_patch: false - }); - yield encode_message(&resp)?; - } - }) + }, + )) } fn input_file_path(&mut self, file_name: &str) -> String { diff --git a/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs b/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs index 6b98cf6d07..74a533ecf1 100644 --- a/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs @@ -2,14 +2,12 @@ use crate::apis::{FlowCaptureOperation, InterceptorStream}; use crate::errors::{Error, Must}; use crate::libs::network_proxy::NetworkProxy; use crate::libs::protobuf::{decode_message, encode_message}; -use crate::libs::stream::stream_all_bytes; +use crate::libs::stream::{get_decoded_message, stream_all_bytes}; +use futures::{future, stream, StreamExt, TryStreamExt}; use protocol::capture::{ ApplyRequest, DiscoverRequest, PullRequest, SpecResponse, ValidateRequest, }; -use async_stream::stream; -use futures_util::pin_mut; -use futures_util::StreamExt; use serde_json::value::RawValue; use tokio_util::io::StreamReader; @@ -17,60 +15,79 @@ pub struct NetworkProxyCaptureInterceptor {} impl NetworkProxyCaptureInterceptor { fn adapt_discover_request_stream(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! { - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); + Box::pin(stream::once(async { + let mut request = get_decoded_message::(in_stream).await?; + request.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( RawValue::from_string(request.endpoint_spec_json)?, - ).await.or_bail().to_string(); - yield encode_message(&request); - }) + ) + .await + .or_bail() + .to_string(); + + encode_message(&request) + })) } fn adapt_validate_request_stream(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! { - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); + Box::pin(stream::once(async { + let mut request = get_decoded_message::(in_stream).await?; + request.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( RawValue::from_string(request.endpoint_spec_json)?, - ).await.or_bail().to_string(); - yield encode_message(&request); - }) + ) + .await + .or_bail() + .to_string(); + + encode_message(&request) + })) } fn adapt_apply_request(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! 
{ - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); + Box::pin(stream::once(async { + let mut request = get_decoded_message::(in_stream).await?; + if let Some(ref mut c) = request.capture { - c.endpoint_spec_json = - NetworkProxy::consume_network_proxy_config( - RawValue::from_string(c.endpoint_spec_json.clone())?, - ).await.or_bail().to_string(); + c.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + RawValue::from_string(c.endpoint_spec_json.clone())?, + ) + .await + .or_bail() + .to_string(); } - yield encode_message(&request); - }) + + encode_message(&request) + })) } fn adapt_pull_request_stream(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! { - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); - if let Some(ref mut o) = request.open { - if let Some(ref mut c) = o.capture { - c.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( - RawValue::from_string(c.endpoint_spec_json.clone())?, - ).await.or_bail().to_string(); + Box::pin( + stream::once(async { + let mut reader = StreamReader::new(in_stream); + let mut request = decode_message::(&mut reader) + .await + .or_bail() + .expect("expected request is not received."); + if let Some(ref mut o) = request.open { + if let Some(ref mut c) = o.capture { + c.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + RawValue::from_string(c.endpoint_spec_json.clone())?, + ) + .await + .or_bail() + .to_string(); + } } - } - yield encode_message(&request); - // deliver the rest messages in the stream. - let s = stream_all_bytes(reader); - pin_mut!(s); - while let Some(value) = s.next().await { - yield value; - } - }) + + let first = stream::once(future::ready(encode_message(&request))); + let rest = stream_all_bytes(reader); + + // We need to set explicit error type, see https://github.com/rust-lang/rust/issues/63502 + Ok::<_, std::io::Error>(first.chain(rest)) + }) + .try_flatten(), + ) } } @@ -95,14 +112,15 @@ impl NetworkProxyCaptureInterceptor { in_stream: InterceptorStream, ) -> Result { Ok(match op { - FlowCaptureOperation::Spec => Box::pin(stream! 
{ - let mut reader = StreamReader::new(in_stream); - let mut response = decode_message::(&mut reader).await.or_bail().expect("No expected response received."); + FlowCaptureOperation::Spec => Box::pin(stream::once(async move { + let mut response = get_decoded_message::(in_stream).await?; response.endpoint_spec_schema_json = NetworkProxy::extend_endpoint_schema( RawValue::from_string(response.endpoint_spec_schema_json)?, - ).or_bail().to_string(); - yield encode_message(&response); - }), + ) + .or_bail() + .to_string(); + encode_message(&response) + })), _ => in_stream, }) } diff --git a/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs b/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs index d85c05ecbc..9bcb484576 100644 --- a/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs @@ -2,13 +2,11 @@ use crate::apis::{FlowMaterializeOperation, InterceptorStream}; use crate::errors::{Error, Must}; use crate::libs::network_proxy::NetworkProxy; use crate::libs::protobuf::{decode_message, encode_message}; -use crate::libs::stream::stream_all_bytes; +use crate::libs::stream::{get_decoded_message, stream_all_bytes}; +use futures::{future, stream, StreamExt, TryStreamExt}; use protocol::materialize::{ApplyRequest, SpecResponse, TransactionRequest, ValidateRequest}; -use async_stream::stream; -use futures_util::pin_mut; -use futures_util::StreamExt; use serde_json::value::RawValue; use tokio_util::io::StreamReader; @@ -16,51 +14,62 @@ pub struct NetworkProxyMaterializeInterceptor {} impl NetworkProxyMaterializeInterceptor { fn adapt_spec_request(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! { - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); - request.endpoint_spec_json = - NetworkProxy::consume_network_proxy_config(RawValue::from_string(request.endpoint_spec_json)?) - .await - .expect("failed to start network proxy") - .to_string(); - yield encode_message(&request); - }) + Box::pin(stream::once(async { + let mut request = get_decoded_message::(in_stream).await?; + + request.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + RawValue::from_string(request.endpoint_spec_json)?, + ) + .await + .expect("failed to start network proxy") + .to_string(); + encode_message(&request) + })) } fn adapt_apply_request(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! { - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); + Box::pin(stream::once(async { + let mut request = get_decoded_message::(in_stream).await?; + if let Some(ref mut m) = request.materialization { - m.endpoint_spec_json = - NetworkProxy::consume_network_proxy_config( - RawValue::from_string(m.endpoint_spec_json.clone())?, - ).await.or_bail().to_string(); + m.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + RawValue::from_string(m.endpoint_spec_json.clone())?, + ) + .await + .or_bail() + .to_string(); } - yield encode_message(&request); - }) + + encode_message(&request) + })) } fn adapt_transactions_request(in_stream: InterceptorStream) -> InterceptorStream { - Box::pin(stream! 
{ - let mut reader = StreamReader::new(in_stream); - let mut request = decode_message::(&mut reader).await.or_bail().expect("expected request is not received."); - if let Some(ref mut o) = request.open { - if let Some(ref mut m) = o.materialization { - m.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( - RawValue::from_string(m.endpoint_spec_json.clone())?, - ).await.or_bail().to_string(); + Box::pin( + stream::once(async { + let mut reader = StreamReader::new(in_stream); + let mut request = decode_message::(&mut reader) + .await + .or_bail() + .expect("expected request is not received."); + if let Some(ref mut o) = request.open { + if let Some(ref mut m) = o.materialization { + m.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + RawValue::from_string(m.endpoint_spec_json.clone())?, + ) + .await + .or_bail() + .to_string(); + } } - } - yield encode_message(&request); - // deliver the remaining messages in the stream. - let s = stream_all_bytes(reader); - pin_mut!(s); - while let Some(bytes) = s.next().await { - yield bytes; - } - }) + let first = stream::once(future::ready(encode_message(&request))); + let rest = stream_all_bytes(reader); + + // We need to set explicit error type, see https://github.com/rust-lang/rust/issues/63502 + Ok::<_, std::io::Error>(first.chain(rest)) + }) + .try_flatten(), + ) } } @@ -84,14 +93,16 @@ impl NetworkProxyMaterializeInterceptor { in_stream: InterceptorStream, ) -> Result { Ok(match op { - FlowMaterializeOperation::Spec => Box::pin(stream! { - let mut reader = StreamReader::new(in_stream); - let mut response = decode_message::(&mut reader).await.or_bail().expect("expected response is not received."); + FlowMaterializeOperation::Spec => Box::pin(stream::once(async { + let mut response = get_decoded_message::(in_stream).await?; + response.endpoint_spec_schema_json = NetworkProxy::extend_endpoint_schema( RawValue::from_string(response.endpoint_spec_schema_json)?, - ).or_bail().to_string(); - yield encode_message(&response); - }), + ) + .or_bail() + .to_string(); + encode_message(&response) + })), _ => in_stream, }) } diff --git a/crates/connector_proxy/src/libs/airbyte_catalog.rs b/crates/connector_proxy/src/libs/airbyte_catalog.rs index 91f6a0c19e..657cf7c044 100644 --- a/crates/connector_proxy/src/libs/airbyte_catalog.rs +++ b/crates/connector_proxy/src/libs/airbyte_catalog.rs @@ -75,7 +75,7 @@ pub struct Catalog { pub streams: Vec, } -#[derive(Debug, Deserialize, Clone, Validate)] +#[derive(Debug, Deserialize, Clone, Validate, PartialEq)] #[validate(schema(function = "Self::validate_range"))] pub struct Range { pub begin: u32, @@ -127,7 +127,7 @@ pub enum Status { Failed, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub struct ConnectionStatus { pub status: Status, @@ -143,7 +143,7 @@ pub struct Record { pub namespace: Option, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "SCREAMING_SNAKE_CASE")] pub enum LogLevel { Trace, @@ -154,7 +154,7 @@ pub enum LogLevel { Fatal, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "camelCase")] pub struct Log { pub level: LogLevel, @@ -210,7 +210,7 @@ pub struct Spec { pub advanced_auth: Option>, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] 
 #[serde(rename_all = "SCREAMING_SNAKE_CASE")]
 pub enum MessageType {
     Record,
diff --git a/crates/connector_proxy/src/libs/stream.rs b/crates/connector_proxy/src/libs/stream.rs
index e62789e530..a38c553395 100644
--- a/crates/connector_proxy/src/libs/stream.rs
+++ b/crates/connector_proxy/src/libs/stream.rs
@@ -1,91 +1,244 @@
-use crate::apis::InterceptorStream;
 use crate::libs::airbyte_catalog::Message;
+use crate::{apis::InterceptorStream, errors::create_custom_error};
 
-use crate::errors::raise_custom_error;
-use async_stream::try_stream;
+use crate::errors::raise_err;
 use bytes::{Buf, Bytes, BytesMut};
-use futures_core::Stream;
-use futures_util::StreamExt;
+use futures::{stream, StreamExt, TryStream, TryStreamExt};
 use serde_json::{Deserializer, Value};
 use tokio::io::{AsyncRead, AsyncReadExt};
+use tokio_util::io::StreamReader;
 use validator::Validate;
 
+use super::protobuf::decode_message;
+
 pub fn stream_all_bytes<R: AsyncRead + Unpin>(
-    mut reader: R,
-) -> impl Stream<Item = Result<Bytes, std::io::Error>> {
-    // TODO: can we replace these macros with futures crate StreamExt or TryStreamExt methods?
-    // e.g. futures::stream::unfold() might be useful.
-    try_stream! {
-        loop {
-            // consistent with the default capacity of ReaderStream.
-            // https://github.com/tokio-rs/tokio/blob/master/tokio-util/src/io/reader_stream.rs#L8
-            let mut buf = BytesMut::with_capacity(4096);
-            match reader.read_buf(&mut buf).await {
-                Ok(0) => break,
-                Ok(_) => {
-                    yield buf.into();
-                }
-                Err(e) => {
-                    raise_custom_error(&format!("error during streaming {:?}.", e))?;
-                }
-            }
+    reader: R,
+) -> impl TryStream<Item = std::io::Result<Bytes>, Error = std::io::Error, Ok = bytes::Bytes> {
+    stream::try_unfold(reader, |mut r| async {
+        // consistent with the default capacity of ReaderStream.
+        // https://github.com/tokio-rs/tokio/blob/master/tokio-util/src/io/reader_stream.rs#L8
+        let mut buf = BytesMut::with_capacity(4096);
+        match r.read_buf(&mut buf).await {
+            Ok(0) => Ok(None),
+            Ok(_) => Ok(Some((Bytes::from(buf), r))),
+            Err(e) => raise_err(&format!("error during streaming {:?}.", e)),
         }
-    }
+    })
 }
 
-pub fn stream_all_airbyte_messages(
-    mut in_stream: InterceptorStream,
-) -> impl Stream<Item = Result<Message, std::io::Error>> {
-    try_stream! {
+/// Given a stream of bytes, try to deserialize them into Airbyte Messages.
+/// This can be used when reading responses from the Airbyte connector, and will
+/// handle validation of messages as well as handling of AirbyteLogMessages.
+/// Will ignore* messages that cannot be parsed to an AirbyteMessage.
+/// * See https://docs.airbyte.com/understanding-airbyte/airbyte-specification#the-airbyte-protocol
+pub fn stream_airbyte_responses(
+    in_stream: InterceptorStream,
+) -> impl TryStream<Item = std::io::Result<Message>, Ok = Message, Error = std::io::Error> {
+    stream::once(async {
         let mut buf = BytesMut::new();
+        let items = in_stream
+            .map(move |bytes| {
+                let b = bytes?;
+                buf.extend_from_slice(b.chunk());
+                let chunk = buf.chunk();
+
+                // Deserialize to Value first, instead of Message, to avoid missing 'is_eof' signals in error.
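The hunk continues below with `serde_json`'s incremental `StreamDeserializer`: parse as many complete values as the buffer currently holds, stop on an EOF error, and use `byte_offset()` to drop only the consumed prefix so a document split across reads completes on the next chunk. A self-contained sketch of that buffering approach (toy input, not the interceptor code):

```rust
use bytes::{Buf, BytesMut};
use serde_json::{Deserializer, Value};

fn main() {
    // One JSON document split across two reads, plus one complete document.
    let chunks = [r#"{"a":1}{"b""#, r#":2}"#];
    let mut buf = BytesMut::new();

    for chunk in chunks {
        buf.extend_from_slice(chunk.as_bytes());
        let mut iter = Deserializer::from_slice(buf.chunk()).into_iter::<Value>();

        for value in iter.by_ref() {
            match value {
                Ok(v) => println!("parsed: {}", v),
                // EOF mid-document: keep the tail buffered for the next chunk.
                Err(e) if e.is_eof() => break,
                Err(e) => panic!("bad JSON: {}", e),
            }
        }

        // Drop only the bytes the deserializer actually consumed.
        let consumed = iter.byte_offset();
        drop(buf.split_to(consumed));
    }
}
```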
+ let deserializer = Deserializer::from_slice(chunk); + let mut value_stream = deserializer.into_iter::(); + + // Turn Values into Messages and validate them + let values: Vec> = value_stream + .by_ref() + .map_while(|value| match value { + Ok(v) => Some(Ok(v)), + Err(e) => { + // we must stop as soon as we hit EOF to avoid + // progressing value_stream.byte_offset() so that we can + // safely drop the buffer up to byte_offset() and pick up the leftovers + // when working with the next bytes + if e.is_eof() { + return None; + } - while let Some(bytes) = in_stream.next().await { - match bytes { - Ok(b) => { - buf.extend_from_slice(b.chunk()); - } - Err(e) => { - raise_custom_error(&format!("error in reading next in_stream: {:?}", e))?; - } - } - - let chunk = buf.chunk(); - let deserializer = Deserializer::from_slice(&chunk); - - // Deserialize to Value first, instead of Message, to avoid missing 'is_eof' signals in error. - let mut value_stream = deserializer.into_iter::(); - while let Some(value) = value_stream.next() { - match value { - Ok(v) => { - let message: Message = serde_json::from_value(v).unwrap(); - if let Err(e) = message.validate() { - raise_custom_error(&format!( - "error in validating message: {:?}, {:?}", - e, std::str::from_utf8(&chunk[value_stream.byte_offset()..])))?; + Some(raise_err(&format!( + "error in decoding JSON: {:?}, {:?}", + e, + std::str::from_utf8(chunk) + ))) } - tracing::debug!("read message:: {:?}", &message); - yield message; - } - Err(e) => { - if e.is_eof() { - break; + }) + .map(|value| match value { + Ok(v) => { + let message: Message = match serde_json::from_value(v) { + Ok(m) => m, + // We ignore JSONs that are not Airbyte Messages according + // to the specification: + // https://docs.airbyte.com/understanding-airbyte/airbyte-specification#the-airbyte-protocol + Err(_) => return Ok(None), + }; + + message.validate().map_err(|e| { + create_custom_error(&format!("error in validating message {:?}", e)) + })?; + + tracing::debug!("read message:: {:?}", &message); + Ok(Some(message)) } + Err(e) => Err(e), + }) + // Flipping the Option and Result to filter out the None values + .filter_map(|value| match value { + Ok(Some(v)) => Some(Ok(v)), + Ok(None) => None, + Err(e) => Some(Err(e)), + }) + .collect(); + + let byte_offset = value_stream.byte_offset(); + drop(buf.split_to(byte_offset)); - raise_custom_error(&format!( - "error in decoding message: {:?}, {:?}", - e, std::str::from_utf8(&chunk[value_stream.byte_offset()..])))?; - } - } - } + Ok::<_, std::io::Error>(stream::iter(values)) + }) + .try_flatten(); - let byte_offset = value_stream.byte_offset(); - drop(buf.split_to(byte_offset)); + // We need to set explicit error type, see https://github.com/rust-lang/rust/issues/63502 + Ok::<_, std::io::Error>(items) + }) + .try_flatten() + // Handle logs here so we don't have to worry about them everywhere else + .try_filter_map(|message| async { + if let Some(log) = message.log { + log.log(); + Ok(None) + } else { + Ok(Some(message)) } + }) +} + +/// Read the given stream and try to find an Airbyte message that matches the predicate +/// ignoring* other message kinds. This can be used to work with Airbyte connector responses. 
+/// * See https://docs.airbyte.com/understanding-airbyte/airbyte-specification#the-airbyte-protocol
+pub fn get_airbyte_response<F>(
+    in_stream: InterceptorStream,
+    predicate: F,
+) -> impl futures::Future<Output = Result<Message, std::io::Error>>
+where
+    F: Fn(&Message) -> bool,
+{
+    async move {
+        let stream_head = Box::pin(stream_airbyte_responses(in_stream)).next().await;
-    if buf.len() > 0 {
-        raise_custom_error("unconsumed content in stream found.")?;
+        let message = match stream_head {
+            Some(m) => m,
+            None => return raise_err("Could not find message in stream"),
+        }?;
+
+        if predicate(&message) {
+            Ok(message)
+        } else {
+            raise_err("Could not find message matching condition")
         }
+    }
+}
+
+/// Read the given stream of bytes and try to decode it to type T
+pub fn get_decoded_message<T>(
+    in_stream: InterceptorStream,
+) -> impl futures::Future<Output = Result<T, std::io::Error>>
+where
+    T: prost::Message + std::default::Default,
+{
+    async move {
+        let mut reader = StreamReader::new(in_stream);
+        decode_message::<T, _>(&mut reader)
+            .await?
+            .ok_or(create_custom_error("missing request"))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use futures::future;
+
+    use crate::libs::airbyte_catalog::{ConnectionStatus, MessageType, Status};
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_stream_all_bytes() {
+        let input = "{\"test\": \"hello\"}".as_bytes();
+        let stream = stream::once(future::ready(Ok::<_, std::io::Error>(input)));
+        let reader = StreamReader::new(stream);
+        let mut all_bytes = Box::pin(stream_all_bytes(reader));
+
+        let result = all_bytes.next().await.unwrap().unwrap();
+        assert_eq!(result.chunk(), input);
+    }
+
+    #[tokio::test]
+    async fn test_stream_airbyte_responses_eof_split_json() {
+        let input_message = Message {
+            message_type: MessageType::ConnectionStatus,
+            log: None,
+            state: None,
+            record: None,
+            spec: None,
+            catalog: None,
+            connection_status: Some(ConnectionStatus {
+                status: Status::Succeeded,
+                message: Some("test".to_string()),
+            }),
+        };
+        let input = vec![
+            Ok::<_, std::io::Error>(
+                "{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {".as_bytes(),
+            ),
+            Ok::<_, std::io::Error>("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}".as_bytes()),
+        ];
+        let stream = stream::iter(input);
+        let reader = StreamReader::new(stream);
+
+        let byte_stream = Box::pin(stream_all_bytes(reader));
+        let mut messages = Box::pin(stream_airbyte_responses(byte_stream));
+
+        let result = messages.next().await.unwrap().unwrap();
+        assert_eq!(
+            result.connection_status.unwrap(),
+            input_message.connection_status.unwrap()
+        );
+    }
+
+    #[tokio::test]
+    async fn test_stream_airbyte_responses_eof_split_json_partial() {
+        let input_message = Message {
+            message_type: MessageType::ConnectionStatus,
+            log: None,
+            state: None,
+            record: None,
+            spec: None,
+            catalog: None,
+            connection_status: Some(ConnectionStatus {
+                status: Status::Succeeded,
+                message: Some("test".to_string()),
+            }),
+        };
+        let input = vec![
+            Ok::<_, std::io::Error>(
+                "{}\n{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {".as_bytes(),
+            ),
+            Ok::<_, std::io::Error>("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}".as_bytes()),
+        ];
+        let stream = stream::iter(input);
+        let reader = StreamReader::new(stream);
+
+        let byte_stream = Box::pin(stream_all_bytes(reader));
+        let mut messages = Box::pin(stream_airbyte_responses(byte_stream));
-    tracing::info!("done reading all in_stream.");
+        let result = messages.next().await.unwrap().unwrap();
+        assert_eq!(
+            result.connection_status.unwrap(),
+            input_message.connection_status.unwrap()
+        );
     }
 }
diff --git
a/crates/connector_proxy/src/main.rs b/crates/connector_proxy/src/main.rs index d157168b2e..169d388f8b 100644 --- a/crates/connector_proxy/src/main.rs +++ b/crates/connector_proxy/src/main.rs @@ -104,7 +104,7 @@ async fn main() -> std::io::Result<()> { let result = async_main(image_inspect_json_path, proxy_command).await; if let Err(err) = result.as_ref() { - tracing::error!("connector proxy execution failed. {:?}", err); + tracing::error!(error = ?err, "connector proxy execution failed."); std::process::exit(1); } Ok(()) diff --git a/crates/network-proxy/src/main.rs b/crates/network-proxy/src/main.rs index 1451000708..bd8d63028e 100644 --- a/crates/network-proxy/src/main.rs +++ b/crates/network-proxy/src/main.rs @@ -32,7 +32,6 @@ async fn run() -> Result<(), Error> { // If either of the assumptions is invalid for any new proxy type, the READY-logic need to be moved to a separate task, which // sends out the "READY" signal after making sure the network proxy is started and working properly. println!("READY"); - io::stdout().flush()?; proxy.start_serve().await?; diff --git a/crates/network-proxy/src/sshforwarding.rs b/crates/network-proxy/src/sshforwarding.rs index 07db484f70..e0336f2830 100644 --- a/crates/network-proxy/src/sshforwarding.rs +++ b/crates/network-proxy/src/sshforwarding.rs @@ -76,6 +76,8 @@ impl SshForwarding { } pub async fn authenticate(&mut self) -> Result<(), Error> { + // TODO: this breaks on the new OpenSSH keys, see: + // https://stackoverflow.com/questions/54994641/openssh-private-key-to-rsa-private-key let key_pair = Arc::new(key::KeyPair::RSA { key: openssl::rsa::Rsa::private_key_from_pem(&self.config.private_key.as_bytes())?, hash: key::SignatureHash::SHA2_256, From beeb2d328feec894771db918963b3b7a695d7f0e Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Mon, 11 Apr 2022 16:10:16 +0000 Subject: [PATCH 8/9] rename network-proxy to network-tunnel --- .github/workflows/main.yml | 8 +-- Cargo.lock | 4 +- Makefile | 16 +++--- crates/connector_proxy/Cargo.toml | 2 +- .../connector_proxy/src/connector_runner.rs | 16 +++--- crates/connector_proxy/src/errors.rs | 2 +- .../connector_proxy/src/interceptors/mod.rs | 4 +- ... 
=> network_tunnel_capture_interceptor.rs} | 18 +++---- ...network_tunnel_materialize_interceptor.rs} | 18 +++---- crates/connector_proxy/src/libs/mod.rs | 2 +- .../{network_proxy.rs => network_tunnel.rs} | 48 +++++++++--------- crates/network-proxy/src/main.rs | 39 --------------- .../.gitignore | 0 .../Cargo.toml | 4 +- .../src/errors.rs | 0 .../src/interface.rs | 10 ++-- .../src/lib.rs | 2 +- crates/network-tunnel/src/main.rs | 39 +++++++++++++++ .../src/networktunnel.rs} | 2 +- .../src/sshforwarding.rs | 4 +- .../networktunnel.go} | 50 +++++++++---------- .../networktunnel_test.go} | 2 +- .../sshforwarding/sshforwarding.go | 0 .../sshforwarding/sshforwarding_test.go | 0 .../testutil.go | 10 ++-- site/docs/concepts/connectors.md | 2 +- .../materialize-postgres.ssh.config.yaml | 4 +- 27 files changed, 153 insertions(+), 153 deletions(-) rename crates/connector_proxy/src/interceptors/{network_proxy_capture_interceptor.rs => network_tunnel_capture_interceptor.rs} (87%) rename crates/connector_proxy/src/interceptors/{network_proxy_materialize_interceptor.rs => network_tunnel_materialize_interceptor.rs} (86%) rename crates/connector_proxy/src/libs/{network_proxy.rs => network_tunnel.rs} (56%) delete mode 100644 crates/network-proxy/src/main.rs rename crates/{network-proxy => network-tunnel}/.gitignore (100%) rename crates/{network-proxy => network-tunnel}/Cargo.toml (91%) rename crates/{network-proxy => network-tunnel}/src/errors.rs (100%) rename crates/{network-proxy => network-tunnel}/src/interface.rs (56%) rename crates/{network-proxy => network-tunnel}/src/lib.rs (71%) create mode 100644 crates/network-tunnel/src/main.rs rename crates/{network-proxy/src/networkproxy.rs => network-tunnel/src/networktunnel.rs} (92%) rename crates/{network-proxy => network-tunnel}/src/sshforwarding.rs (98%) rename go/{network-proxy/networkproxy.go => network-tunnel/networktunnel.go} (66%) rename go/{network-proxy/networkproxy_test.go => network-tunnel/networktunnel_test.go} (98%) rename go/{network-proxy => network-tunnel}/sshforwarding/sshforwarding.go (100%) rename go/{network-proxy => network-tunnel}/sshforwarding/sshforwarding_test.go (100%) rename go/{network-proxy => network-tunnel}/testutil.go (75%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ff196ae4b2..038b0532d3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -196,7 +196,7 @@ jobs: -o /home/runner/work/flow/flow/.build/package/bin/flowctl \ -o /home/runner/work/flow/flow/.build/package/bin/flowctl-go \ -o /home/runner/work/flow/flow/.build/package/bin/flow-connector-proxy \ - -o /home/runner/work/flow/flow/.build/package/bin/flow-network-proxy \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-network-tunnel \ -o /home/runner/work/flow/flow/.build/package/bin/flow-parser \ -o /home/runner/work/flow/flow/.build/package/bin/flow-schemalate \ -o /home/runner/work/flow/flow/.build/package/bin/gazette \ @@ -209,7 +209,7 @@ jobs: -o /home/runner/work/flow/flow/.build/package/bin/flowctl \ -o /home/runner/work/flow/flow/.build/package/bin/flowctl-go \ -o /home/runner/work/flow/flow/.build/package/bin/flow-connector-proxy \ - -o /home/runner/work/flow/flow/.build/package/bin/flow-network-proxy \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-network-tunnel \ -o /home/runner/work/flow/flow/.build/package/bin/flow-parser \ -o /home/runner/work/flow/flow/.build/package/bin/flow-schemalate \ -o /home/runner/work/flow/flow/.build/package/bin/gazette \ @@ -222,7 +222,7 @@ jobs: -o 
/home/runner/work/flow/flow/.build/package/bin/flowctl \ -o /home/runner/work/flow/flow/.build/package/bin/flowctl-go \ -o /home/runner/work/flow/flow/.build/package/bin/flow-connector-proxy \ - -o /home/runner/work/flow/flow/.build/package/bin/flow-network-proxy \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-network-tunnel \ -o /home/runner/work/flow/flow/.build/package/bin/flow-parser \ -o /home/runner/work/flow/flow/.build/package/bin/flow-schemalate \ -o /home/runner/work/flow/flow/.build/package/bin/gazette \ @@ -235,7 +235,7 @@ jobs: -o /home/runner/work/flow/flow/.build/package/bin/flowctl \ -o /home/runner/work/flow/flow/.build/package/bin/flowctl-go \ -o /home/runner/work/flow/flow/.build/package/bin/flow-connector-proxy \ - -o /home/runner/work/flow/flow/.build/package/bin/flow-network-proxy \ + -o /home/runner/work/flow/flow/.build/package/bin/flow-network-tunnel \ -o /home/runner/work/flow/flow/.build/package/bin/flow-parser \ -o /home/runner/work/flow/flow/.build/package/bin/flow-schemalate \ -o /home/runner/work/flow/flow/.build/package/bin/gazette \ diff --git a/Cargo.lock b/Cargo.lock index 316a29598b..243e70208a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -673,7 +673,7 @@ dependencies = [ "futures-util", "json-pointer", "libc", - "network-proxy", + "network-tunnel", "prost", "protocol", "schemars", @@ -1973,7 +1973,7 @@ dependencies = [ ] [[package]] -name = "network-proxy" +name = "network-tunnel" version = "0.1.0" dependencies = [ "async-trait", diff --git a/Makefile b/Makefile index 4d38e90362..0a15e432cb 100644 --- a/Makefile +++ b/Makefile @@ -145,9 +145,9 @@ ${RUST_MUSL_BIN}/flow-schemalate: ${RUST_MUSL_BIN}/flow-parser: cargo build --target x86_64-unknown-linux-musl --release --locked -p parser -.PHONY: ${RUST_MUSL_BIN}/flow-network-proxy -${RUST_MUSL_BIN}/flow-network-proxy: - cargo build --target x86_64-unknown-linux-musl --release --locked -p network-proxy +.PHONY: ${RUST_MUSL_BIN}/flow-network-tunnel +${RUST_MUSL_BIN}/flow-network-tunnel: + cargo build --target x86_64-unknown-linux-musl --release --locked -p network-tunnel .PHONY: ${RUST_MUSL_BIN}/flow-connector-proxy ${RUST_MUSL_BIN}/flow-connector-proxy: @@ -167,7 +167,7 @@ RUST_TARGETS = \ MUSL_TARGETS = \ ${PKGDIR}/bin/flow-parser \ ${PKGDIR}/bin/flow-schemalate \ - ${PKGDIR}/bin/flow-network-proxy \ + ${PKGDIR}/bin/flow-network-tunnel \ ${PKGDIR}/bin/flow-connector-proxy \ .PHONY: rust-binaries @@ -193,8 +193,8 @@ ${PKGDIR}/bin/flow-schemalate: ${RUST_MUSL_BIN}/flow-schemalate | ${PKGDIR} cp ${RUST_MUSL_BIN}/flow-schemalate $@ ${PKGDIR}/bin/flow-parser: ${RUST_MUSL_BIN}/flow-parser | ${PKGDIR} cp ${RUST_MUSL_BIN}/flow-parser $@ -${PKGDIR}/bin/flow-network-proxy: ${RUST_MUSL_BIN}/flow-network-proxy | ${PKGDIR} - cp ${RUST_MUSL_BIN}/flow-network-proxy $@ +${PKGDIR}/bin/flow-network-tunnel: ${RUST_MUSL_BIN}/flow-network-tunnel | ${PKGDIR} + cp ${RUST_MUSL_BIN}/flow-network-tunnel $@ ${PKGDIR}/bin/flow-connector-proxy: ${RUST_MUSL_BIN}/flow-connector-proxy | ${PKGDIR} cp ${RUST_MUSL_BIN}/flow-connector-proxy $@ @@ -228,11 +228,11 @@ install-tools: ${PKGDIR}/bin/etcd ${PKGDIR}/bin/sops .PHONY: rust-test rust-test: - cargo test --release --locked --workspace --exclude parser --exclude network-proxy --exclude schemalate --exclude connector_proxy + cargo test --release --locked --workspace --exclude parser --exclude network-tunnel --exclude schemalate --exclude connector_proxy .PHONY: musl-test musl-test: - cargo test --release --locked --target x86_64-unknown-linux-musl --package parser --package 
network-proxy --package schemalate --package connector_proxy + cargo test --release --locked --target x86_64-unknown-linux-musl --package parser --package network-tunnel --package schemalate --package connector_proxy # `go` test targets must have PATH-based access to tools (etcd & sops), # because the `go` tool compiles tests as binaries within a temp directory, diff --git a/crates/connector_proxy/Cargo.toml b/crates/connector_proxy/Cargo.toml index 72fab3ce59..ed5fe19d9d 100644 --- a/crates/connector_proxy/Cargo.toml +++ b/crates/connector_proxy/Cargo.toml @@ -10,7 +10,7 @@ path = "src/main.rs" [dependencies] doc = { path = "../doc", version = "0.0.0" } flow_cli_common = { path = "../flow_cli_common" } -network-proxy = { path = "../network-proxy", version = "0.1.0" } +network-tunnel = { path = "../network-tunnel", version = "0.1.0" } protocol = { path = "../protocol", version = "0.0.0" } async-trait="*" diff --git a/crates/connector_proxy/src/connector_runner.rs b/crates/connector_proxy/src/connector_runner.rs index 4b97d36b79..2b349d6c11 100644 --- a/crates/connector_proxy/src/connector_runner.rs +++ b/crates/connector_proxy/src/connector_runner.rs @@ -2,8 +2,8 @@ use crate::apis::{FlowCaptureOperation, FlowMaterializeOperation, InterceptorStr use crate::errors::Error; use crate::interceptors::{ airbyte_source_interceptor::AirbyteSourceInterceptor, - network_proxy_capture_interceptor::NetworkProxyCaptureInterceptor, - network_proxy_materialize_interceptor::NetworkProxyMaterializeInterceptor, + network_tunnel_capture_interceptor::NetworkTunnelCaptureInterceptor, + network_tunnel_materialize_interceptor::NetworkTunnelMaterializeInterceptor, }; use crate::libs::command::{ check_exit_status, invoke_connector_delayed, invoke_connector_direct, parse_child, @@ -23,10 +23,10 @@ pub async fn run_flow_capture_connector( parse_child(invoke_connector_direct(entrypoint, args)?)?; let adapted_request_stream = - NetworkProxyCaptureInterceptor::adapt_request_stream(op, request_stream())?; + NetworkTunnelCaptureInterceptor::adapt_request_stream(op, request_stream())?; let adapted_response_stream = - NetworkProxyCaptureInterceptor::adapt_response_stream(op, response_stream(child_stdout))?; + NetworkTunnelCaptureInterceptor::adapt_response_stream(op, response_stream(child_stdout))?; streaming_all( child_stdin, @@ -50,9 +50,9 @@ pub async fn run_flow_materialize_connector( parse_child(invoke_connector_direct(entrypoint, args)?)?; let adapted_request_stream = - NetworkProxyMaterializeInterceptor::adapt_request_stream(op, request_stream())?; + NetworkTunnelMaterializeInterceptor::adapt_request_stream(op, request_stream())?; - let adapted_response_stream = NetworkProxyMaterializeInterceptor::adapt_response_stream( + let adapted_response_stream = NetworkTunnelMaterializeInterceptor::adapt_response_stream( op, response_stream(child_stdout), )?; @@ -82,10 +82,10 @@ pub async fn run_airbyte_source_connector( let adapted_request_stream = airbyte_interceptor.adapt_request_stream( op, - NetworkProxyCaptureInterceptor::adapt_request_stream(op, request_stream())?, + NetworkTunnelCaptureInterceptor::adapt_request_stream(op, request_stream())?, )?; - let adapted_response_stream = NetworkProxyCaptureInterceptor::adapt_response_stream( + let adapted_response_stream = NetworkTunnelCaptureInterceptor::adapt_response_stream( op, airbyte_interceptor.adapt_response_stream(op, response_stream(child_stdout))?, )?; diff --git a/crates/connector_proxy/src/errors.rs b/crates/connector_proxy/src/errors.rs index 7473694484..0316bcf12a 
100644 --- a/crates/connector_proxy/src/errors.rs +++ b/crates/connector_proxy/src/errors.rs @@ -49,7 +49,7 @@ pub enum Error { MissingImageInspectFile, #[error(transparent)] - NetworkProxyError(#[from] network_proxy::errors::Error), + NetworkTunnelError(#[from] network_tunnel::errors::Error), #[error(transparent)] TempfilePersistError(#[from] tempfile::PersistError), diff --git a/crates/connector_proxy/src/interceptors/mod.rs b/crates/connector_proxy/src/interceptors/mod.rs index 37b44c07f0..e8edba8231 100644 --- a/crates/connector_proxy/src/interceptors/mod.rs +++ b/crates/connector_proxy/src/interceptors/mod.rs @@ -1,3 +1,3 @@ pub mod airbyte_source_interceptor; -pub mod network_proxy_capture_interceptor; -pub mod network_proxy_materialize_interceptor; +pub mod network_tunnel_capture_interceptor; +pub mod network_tunnel_materialize_interceptor; diff --git a/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs b/crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs similarity index 87% rename from crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs rename to crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs index 74a533ecf1..ece41962c0 100644 --- a/crates/connector_proxy/src/interceptors/network_proxy_capture_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs @@ -1,6 +1,6 @@ use crate::apis::{FlowCaptureOperation, InterceptorStream}; use crate::errors::{Error, Must}; -use crate::libs::network_proxy::NetworkProxy; +use crate::libs::network_tunnel::NetworkTunnel; use crate::libs::protobuf::{decode_message, encode_message}; use crate::libs::stream::{get_decoded_message, stream_all_bytes}; use futures::{future, stream, StreamExt, TryStreamExt}; @@ -11,14 +11,14 @@ use protocol::capture::{ use serde_json::value::RawValue; use tokio_util::io::StreamReader; -pub struct NetworkProxyCaptureInterceptor {} +pub struct NetworkTunnelCaptureInterceptor {} -impl NetworkProxyCaptureInterceptor { +impl NetworkTunnelCaptureInterceptor { fn adapt_discover_request_stream(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream::once(async { let mut request = get_decoded_message::(in_stream).await?; - request.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + request.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( RawValue::from_string(request.endpoint_spec_json)?, ) .await @@ -33,7 +33,7 @@ impl NetworkProxyCaptureInterceptor { Box::pin(stream::once(async { let mut request = get_decoded_message::(in_stream).await?; - request.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + request.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( RawValue::from_string(request.endpoint_spec_json)?, ) .await @@ -49,7 +49,7 @@ impl NetworkProxyCaptureInterceptor { let mut request = get_decoded_message::(in_stream).await?; if let Some(ref mut c) = request.capture { - c.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + c.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( RawValue::from_string(c.endpoint_spec_json.clone())?, ) .await @@ -71,7 +71,7 @@ impl NetworkProxyCaptureInterceptor { .expect("expected request is not received."); if let Some(ref mut o) = request.open { if let Some(ref mut c) = o.capture { - c.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + c.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( 
RawValue::from_string(c.endpoint_spec_json.clone())?, ) .await @@ -91,7 +91,7 @@ impl NetworkProxyCaptureInterceptor { } } -impl NetworkProxyCaptureInterceptor { +impl NetworkTunnelCaptureInterceptor { pub fn adapt_request_stream( op: &FlowCaptureOperation, in_stream: InterceptorStream, @@ -114,7 +114,7 @@ impl NetworkProxyCaptureInterceptor { Ok(match op { FlowCaptureOperation::Spec => Box::pin(stream::once(async move { let mut response = get_decoded_message::(in_stream).await?; - response.endpoint_spec_schema_json = NetworkProxy::extend_endpoint_schema( + response.endpoint_spec_schema_json = NetworkTunnel::extend_endpoint_schema( RawValue::from_string(response.endpoint_spec_schema_json)?, ) .or_bail() diff --git a/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs b/crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs similarity index 86% rename from crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs rename to crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs index 9bcb484576..fe1ead6840 100644 --- a/crates/connector_proxy/src/interceptors/network_proxy_materialize_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs @@ -1,6 +1,6 @@ use crate::apis::{FlowMaterializeOperation, InterceptorStream}; use crate::errors::{Error, Must}; -use crate::libs::network_proxy::NetworkProxy; +use crate::libs::network_tunnel::NetworkTunnel; use crate::libs::protobuf::{decode_message, encode_message}; use crate::libs::stream::{get_decoded_message, stream_all_bytes}; @@ -10,18 +10,18 @@ use protocol::materialize::{ApplyRequest, SpecResponse, TransactionRequest, Vali use serde_json::value::RawValue; use tokio_util::io::StreamReader; -pub struct NetworkProxyMaterializeInterceptor {} +pub struct NetworkTunnelMaterializeInterceptor {} -impl NetworkProxyMaterializeInterceptor { +impl NetworkTunnelMaterializeInterceptor { fn adapt_spec_request(in_stream: InterceptorStream) -> InterceptorStream { Box::pin(stream::once(async { let mut request = get_decoded_message::(in_stream).await?; - request.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + request.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( RawValue::from_string(request.endpoint_spec_json)?, ) .await - .expect("failed to start network proxy") + .expect("failed to start network tunnel") .to_string(); encode_message(&request) })) @@ -32,7 +32,7 @@ impl NetworkProxyMaterializeInterceptor { let mut request = get_decoded_message::(in_stream).await?; if let Some(ref mut m) = request.materialization { - m.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + m.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( RawValue::from_string(m.endpoint_spec_json.clone())?, ) .await @@ -54,7 +54,7 @@ impl NetworkProxyMaterializeInterceptor { .expect("expected request is not received."); if let Some(ref mut o) = request.open { if let Some(ref mut m) = o.materialization { - m.endpoint_spec_json = NetworkProxy::consume_network_proxy_config( + m.endpoint_spec_json = NetworkTunnel::consume_network_tunnel_config( RawValue::from_string(m.endpoint_spec_json.clone())?, ) .await @@ -73,7 +73,7 @@ impl NetworkProxyMaterializeInterceptor { } } -impl NetworkProxyMaterializeInterceptor { +impl NetworkTunnelMaterializeInterceptor { pub fn adapt_request_stream( op: &FlowMaterializeOperation, in_stream: InterceptorStream, @@ -96,7 +96,7 @@ impl 
NetworkProxyMaterializeInterceptor { FlowMaterializeOperation::Spec => Box::pin(stream::once(async { let mut response = get_decoded_message::(in_stream).await?; - response.endpoint_spec_schema_json = NetworkProxy::extend_endpoint_schema( + response.endpoint_spec_schema_json = NetworkTunnel::extend_endpoint_schema( RawValue::from_string(response.endpoint_spec_schema_json)?, ) .or_bail() diff --git a/crates/connector_proxy/src/libs/mod.rs b/crates/connector_proxy/src/libs/mod.rs index c7ff1dedb9..870add588a 100644 --- a/crates/connector_proxy/src/libs/mod.rs +++ b/crates/connector_proxy/src/libs/mod.rs @@ -2,6 +2,6 @@ pub mod airbyte_catalog; pub mod command; pub mod image_inspect; pub mod json; -pub mod network_proxy; +pub mod network_tunnel; pub mod protobuf; pub mod stream; diff --git a/crates/connector_proxy/src/libs/network_proxy.rs b/crates/connector_proxy/src/libs/network_tunnel.rs similarity index 56% rename from crates/connector_proxy/src/libs/network_proxy.rs rename to crates/connector_proxy/src/libs/network_tunnel.rs index 4be21cfb74..f0ddab9de2 100644 --- a/crates/connector_proxy/src/libs/network_proxy.rs +++ b/crates/connector_proxy/src/libs/network_tunnel.rs @@ -1,29 +1,29 @@ use crate::errors::Error; use crate::libs::json::{create_root_schema, remove_subobject}; -use network_proxy::interface::NetworkProxyConfig; +use network_tunnel::interface::NetworkTunnelConfig; use schemars::schema::{RootSchema, Schema}; use serde_json::value::RawValue; use tokio::sync::oneshot::{self, Receiver}; use tokio::time::timeout; -pub struct NetworkProxy {} -pub const NETWORK_PROXY_KEY: &str = "networkProxy"; +pub struct NetworkTunnel {} +pub const NETWORK_TUNNEL_KEY: &str = "networkTunnel"; -impl NetworkProxy { +impl NetworkTunnel { pub fn extend_endpoint_schema( endpoint_spec_schema: Box, ) -> Result, Error> { - let network_proxy_schema = create_root_schema::(); + let network_tunnel_schema = create_root_schema::(); let mut modified_schema: RootSchema = serde_json::from_str(endpoint_spec_schema.get())?; if let Some(ref mut o) = &mut modified_schema.schema.object { - if o.as_ref().properties.contains_key(NETWORK_PROXY_KEY) { - return Err(Error::DuplicatedKeyError(NETWORK_PROXY_KEY)); + if o.as_ref().properties.contains_key(NETWORK_TUNNEL_KEY) { + return Err(Error::DuplicatedKeyError(NETWORK_TUNNEL_KEY)); } o.as_mut().properties.insert( - NETWORK_PROXY_KEY.to_string(), - Schema::Object(network_proxy_schema.schema), + NETWORK_TUNNEL_KEY.to_string(), + Schema::Object(network_tunnel_schema.schema), ); } @@ -31,24 +31,24 @@ impl NetworkProxy { RawValue::from_string(json).map_err(Into::into) } - // Start the network proxy. The receiver rx will be dropped to indicate the network proxy + // Start the network tunnel. The receiver rx will be dropped to indicate the network tunnel // is ready to accept requests. 
-    async fn start_network_proxy(
-        config: NetworkProxyConfig,
+    async fn start_network_tunnel(
+        config: NetworkTunnelConfig,
         rx: Receiver<()>,
     ) -> Result<(), Error> {
-        let mut network_proxy = config.new_proxy();
+        let mut network_tunnel = config.new_tunnel();
 
         tokio::task::spawn(async move {
-            let result: Result<(), Error> = match network_proxy.prepare().await {
+            let result: Result<(), Error> = match network_tunnel.prepare().await {
                 Ok(()) => {
                     drop(rx);
-                    network_proxy.start_serve().await.map_err(Into::into)
+                    network_tunnel.start_serve().await.map_err(Into::into)
                 }
                 Err(e) => Err(e.into()),
             };
 
             if let Err(ref err) = result {
-                tracing::error!(error=?err, "failed starting network proxy.");
+                tracing::error!(error=?err, "failed starting network tunnel.");
                 std::process::exit(1);
             }
         })
@@ -57,7 +57,7 @@ impl NetworkProxy {
         Ok(())
     }
 
-    pub async fn consume_network_proxy_config(
+    pub async fn consume_network_tunnel_config(
         endpoint_spec_json: Box<RawValue>,
     ) -> Result<Box<RawValue>, Error> {
         if endpoint_spec_json.get().is_empty() {
@@ -65,25 +65,25 @@ impl NetworkProxy {
         }
 
         let endpoint_spec = serde_json::from_str(endpoint_spec_json.get())?;
-        let (network_proxy_config, endpoint_spec) =
-            remove_subobject(endpoint_spec, NETWORK_PROXY_KEY);
+        let (network_tunnel_config, endpoint_spec) =
+            remove_subobject(endpoint_spec, NETWORK_TUNNEL_KEY);
 
-        let network_proxy_config: NetworkProxyConfig = match network_proxy_config {
+        let network_tunnel_config: NetworkTunnelConfig = match network_tunnel_config {
             None => return Ok(endpoint_spec_json),
             Some(c) => serde_json::from_value(c)?,
         };
 
         let (mut tx, rx) = oneshot::channel();
-        tokio::spawn(Self::start_network_proxy(network_proxy_config, rx));
+        tokio::spawn(Self::start_network_tunnel(network_tunnel_config, rx));
 
-        // TODO: Refact the network-proxy and remove the timeout logic here after all connectors are converted to work with connector-proxy.
+        // TODO: Refactor the network-tunnel and remove the timeout logic here after all connectors are converted to work with connector-proxy.
 
-        // Block for at most 5 seconds for network proxy to be prepared.
+        // Block for at most 5 seconds for the network tunnel to be prepared.
         if let Err(_) = timeout(std::time::Duration::from_secs(5), tx.closed()).await {
             return Err(Error::ChannelTimeoutError);
         };
 
-        tracing::info!("network proxy started.");
+        tracing::info!("network tunnel started.");
 
         let json = serde_json::to_string_pretty(&endpoint_spec)?;
         RawValue::from_string(json).map_err(Into::into)
diff --git a/crates/network-proxy/src/main.rs b/crates/network-proxy/src/main.rs
deleted file mode 100644
index bd8d63028e..0000000000
--- a/crates/network-proxy/src/main.rs
+++ /dev/null
@@ -1,39 +0,0 @@
-pub mod interface;
-pub mod sshforwarding;
-pub mod errors;
-pub mod networkproxy;
-
-use errors::Error;
-use flow_cli_common::{init_logging, LogArgs, LogFormat};
-use std::io::{self, Write};
-
-use interface::NetworkProxyConfig;
-
-#[tokio::main]
-async fn main() -> io::Result<()> {
-    init_logging(&LogArgs{level: "info".to_string(), format: Some(LogFormat::Json)});
-    if let Err(err) = run().await.as_ref() {
-        tracing::error!(error = ?err, "network proxy failed.");
-        std::process::exit(1);
-    }
-    Ok(())
-}
-
-async fn run() -> Result<(), Error> {
-    let proxy_config: NetworkProxyConfig = serde_json::from_reader(io::stdin())?;
-    let mut proxy = proxy_config.new_proxy();
-
-    proxy.prepare().await?;
-
-    // Write "READY" to stdio to unblock Go logic.
-    // The current workflow assumes that
-    // 1. 
After proxy.prepare() is called, the network proxy is able to accept requests from clients without sending errors back to clients. - // 2. The network proxy is able to process client requests immediately after `proxy.start_serve` is called. - // If either of the assumptions is invalid for any new proxy type, the READY-logic need to be moved to a separate task, which - // sends out the "READY" signal after making sure the network proxy is started and working properly. - println!("READY"); - - proxy.start_serve().await?; - - Ok(()) -} \ No newline at end of file diff --git a/crates/network-proxy/.gitignore b/crates/network-tunnel/.gitignore similarity index 100% rename from crates/network-proxy/.gitignore rename to crates/network-tunnel/.gitignore diff --git a/crates/network-proxy/Cargo.toml b/crates/network-tunnel/Cargo.toml similarity index 91% rename from crates/network-proxy/Cargo.toml rename to crates/network-tunnel/Cargo.toml index 2db5c5e5b8..9040d117c3 100644 --- a/crates/network-proxy/Cargo.toml +++ b/crates/network-tunnel/Cargo.toml @@ -1,12 +1,12 @@ [package] -name = "network-proxy" +name = "network-tunnel" version = "0.1.0" edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [[bin]] -name = "flow-network-proxy" +name = "flow-network-tunnel" path = "src/main.rs" [dependencies] diff --git a/crates/network-proxy/src/errors.rs b/crates/network-tunnel/src/errors.rs similarity index 100% rename from crates/network-proxy/src/errors.rs rename to crates/network-tunnel/src/errors.rs diff --git a/crates/network-proxy/src/interface.rs b/crates/network-tunnel/src/interface.rs similarity index 56% rename from crates/network-proxy/src/interface.rs rename to crates/network-tunnel/src/interface.rs index a474c48f06..038f6a6cce 100644 --- a/crates/network-proxy/src/interface.rs +++ b/crates/network-tunnel/src/interface.rs @@ -1,4 +1,4 @@ -use super::networkproxy::NetworkProxy; +use super::networktunnel::NetworkTunnel; use super::sshforwarding::{SshForwarding, SshForwardingConfig}; use schemars::JsonSchema; @@ -7,14 +7,14 @@ use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] #[serde(deny_unknown_fields)] -pub enum NetworkProxyConfig { +pub enum NetworkTunnelConfig { SshForwarding(SshForwardingConfig), } -impl NetworkProxyConfig { - pub fn new_proxy(self) -> Box { +impl NetworkTunnelConfig { + pub fn new_tunnel(self) -> Box { match self { - NetworkProxyConfig::SshForwarding(config) => Box::new(SshForwarding::new(config)), + NetworkTunnelConfig::SshForwarding(config) => Box::new(SshForwarding::new(config)), } } } diff --git a/crates/network-proxy/src/lib.rs b/crates/network-tunnel/src/lib.rs similarity index 71% rename from crates/network-proxy/src/lib.rs rename to crates/network-tunnel/src/lib.rs index aea73e18cd..d2dc8c8ae8 100644 --- a/crates/network-proxy/src/lib.rs +++ b/crates/network-tunnel/src/lib.rs @@ -1,4 +1,4 @@ pub mod errors; pub mod interface; -pub mod networkproxy; +pub mod networktunnel; pub mod sshforwarding; diff --git a/crates/network-tunnel/src/main.rs b/crates/network-tunnel/src/main.rs new file mode 100644 index 0000000000..9584ec1ea6 --- /dev/null +++ b/crates/network-tunnel/src/main.rs @@ -0,0 +1,39 @@ +pub mod interface; +pub mod sshforwarding; +pub mod errors; +pub mod networktunnel; + +use errors::Error; +use flow_cli_common::{init_logging, LogArgs, LogFormat}; +use std::io::{self, Write}; + +use interface::NetworkTunnelConfig; + +#[tokio::main] 
+async fn main() -> io::Result<()> {
+    init_logging(&LogArgs{level: "info".to_string(), format: Some(LogFormat::Json)});
+    if let Err(err) = run().await.as_ref() {
+        tracing::error!(error = ?err, "network tunnel failed.");
+        std::process::exit(1);
+    }
+    Ok(())
+}
+
+async fn run() -> Result<(), Error> {
+    let tunnel_config: NetworkTunnelConfig = serde_json::from_reader(io::stdin())?;
+    let mut tunnel = tunnel_config.new_tunnel();
+
+    tunnel.prepare().await?;
+
+    // Write "READY" to stdout to unblock Go logic.
+    // The current workflow assumes that
+    // 1. After tunnel.prepare() is called, the network tunnel is able to accept requests from clients without sending errors back to clients.
+    // 2. The network tunnel is able to process client requests immediately after `tunnel.start_serve` is called.
+    // If either of these assumptions is invalid for any new tunnel type, the READY logic needs to be moved to a separate task, which
+    // sends out the "READY" signal after making sure the network tunnel is started and working properly.
+    println!("READY");
+
+    tunnel.start_serve().await?;
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/crates/network-proxy/src/networkproxy.rs b/crates/network-tunnel/src/networktunnel.rs
similarity index 92%
rename from crates/network-proxy/src/networkproxy.rs
rename to crates/network-tunnel/src/networktunnel.rs
index b929a3b9ef..c2dd95a067 100644
--- a/crates/network-proxy/src/networkproxy.rs
+++ b/crates/network-tunnel/src/networktunnel.rs
@@ -3,7 +3,7 @@ use super::errors::Error;
 use async_trait::async_trait;
 
 #[async_trait]
-pub trait NetworkProxy: Send {
+pub trait NetworkTunnel: Send {
     // Setup the network proxy server. Network proxy should be able to listen and accept requests after `prepare` is performed.
     async fn prepare(&mut self) -> Result<(), Error>;
     // Start a long-running task that serves and processes all proxy requests from clients. 
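For context on the READY handshake above: a caller launching the tunnel binary is expected to write the NetworkTunnelConfig JSON to the child's stdin and then block until a READY line is observed before routing any traffic through the tunnel. The sketch below illustrates that ordering contract from the caller's side; it is a minimal example assuming tokio, not the repository's actual Go-side consumer (which lives under go/network-tunnel and go/connector):

    use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
    use tokio::process::Command;

    // Illustrative caller: spawn the tunnel, send its JSON config, await READY.
    async fn start_tunnel(config_json: &str) -> std::io::Result<tokio::process::Child> {
        let mut child = Command::new("flow-network-tunnel")
            .stdin(std::process::Stdio::piped())
            .stdout(std::process::Stdio::piped())
            .spawn()?;

        // The tunnel parses one JSON config value from stdin; closing the pipe
        // marks the end of input.
        let mut stdin = child.stdin.take().expect("stdin is piped");
        stdin.write_all(config_json.as_bytes()).await?;
        drop(stdin);

        // Only after READY is printed is the tunnel listening; connecting any
        // earlier would race against prepare().
        let stdout = child.stdout.take().expect("stdout is piped");
        let mut lines = BufReader::new(stdout).lines();
        while let Some(line) = lines.next_line().await? {
            if line.trim() == "READY" {
                return Ok(child);
            }
        }
        Err(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            "tunnel exited before signaling READY",
        ))
    }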
diff --git a/crates/network-proxy/src/sshforwarding.rs b/crates/network-tunnel/src/sshforwarding.rs
similarity index 98%
rename from crates/network-proxy/src/sshforwarding.rs
rename to crates/network-tunnel/src/sshforwarding.rs
index e0336f2830..d3d5d89e59 100644
--- a/crates/network-proxy/src/sshforwarding.rs
+++ b/crates/network-tunnel/src/sshforwarding.rs
@@ -1,5 +1,5 @@
 use super::errors::Error;
-use super::networkproxy::NetworkProxy;
+use super::networktunnel::NetworkTunnel;
 
 use async_trait::async_trait;
 use futures::pin_mut;
@@ -99,7 +99,7 @@ impl SshForwarding {
 }
 
 #[async_trait]
-impl NetworkProxy for SshForwarding {
+impl NetworkTunnel for SshForwarding {
     async fn prepare(&mut self) -> Result<(), Error> {
         self.prepare_ssh_client().await?;
         self.prepare_local_listener().await?;
diff --git a/go/network-proxy/networkproxy.go b/go/network-tunnel/networktunnel.go
similarity index 66%
rename from go/network-proxy/networkproxy.go
rename to go/network-tunnel/networktunnel.go
index f76e85e85a..9ad95333c9 100644
--- a/go/network-proxy/networkproxy.go
+++ b/go/network-tunnel/networktunnel.go
@@ -1,4 +1,4 @@
-package networkproxy
+package networktunnel
 
 import (
	"bytes"
@@ -11,64 +11,64 @@ import (
	"syscall"
	"time"
 
-	sf "github.com/estuary/flow/go/network-proxy/sshforwarding"
+	sf "github.com/estuary/flow/go/network-tunnel/sshforwarding"
 )
 
-const ProgramName = "network-proxy-service"
+const ProgramName = "network-tunnel-service"
 
-func SupportedNetworkProxyTypes() []string {
+func SupportedNetworkTunnelTypes() []string {
	return []string{"sshForwarding"}
 }
 
-type NetworkProxyConfig struct {
-	ProxyType           string                 `json:"proxyType"`
+type NetworkTunnelConfig struct {
+	TunnelType          string                 `json:"tunnelType"`
	SshForwardingConfig sf.SshForwardingConfig `json:"sshForwarding"`
 }
 
 // GetFieldDocString implements the jsonschema.customSchemaGetFieldDocString interface.
-func (NetworkProxyConfig) GetFieldDocString(fieldName string) string {
+func (NetworkTunnelConfig) GetFieldDocString(fieldName string) string {
	switch fieldName {
-	case "ProxyType":
-		return fmt.Sprintf("The type of the network proxy. Supported types are: ( %s )", strings.Join(SupportedNetworkProxyTypes(), ", "))
+	case "TunnelType":
+		return fmt.Sprintf("The type of the network tunnel. Supported types are: ( %s )", strings.Join(SupportedNetworkTunnelTypes(), ", "))
	case "SshForwardingConfig":
-		return "Config for proxy of type sshForwarding"
+		return "Config for tunnel of type sshForwarding"
	default:
		return ""
	}
 }
 
-func (npc *NetworkProxyConfig) Validate() error {
+func (npc *NetworkTunnelConfig) Validate() error {
	if npc == nil {
		return nil
	}
 
	var supported = false
-	for _, t := range SupportedNetworkProxyTypes() {
-		if t == npc.ProxyType {
+	for _, t := range SupportedNetworkTunnelTypes() {
+		if t == npc.TunnelType {
			supported = true
			break
		}
	}
 
	if !supported {
-		return fmt.Errorf("Unsupported proxy type: %s. Valid values are: ( %s ).", npc.ProxyType, strings.Join(SupportedNetworkProxyTypes(), ", "))
+		return fmt.Errorf("Unsupported tunnel type: %s. 
Valid values are: ( %s ).", npc.TunnelType, strings.Join(SupportedNetworkTunnelTypes(), ", ")) } - switch npc.ProxyType { + switch npc.TunnelType { case "sshForwarding": return npc.SshForwardingConfig.Validate() default: - panic(fmt.Sprintf("Implementation of validating %s is not ready.", npc.ProxyType)) + panic(fmt.Sprintf("Implementation of validating %s is not ready.", npc.TunnelType)) } } -func (npc *NetworkProxyConfig) MarshalJSON() ([]byte, error) { +func (npc *NetworkTunnelConfig) MarshalJSON() ([]byte, error) { var m = make(map[string]interface{}) - switch npc.ProxyType { + switch npc.TunnelType { case "sshForwarding": - m[npc.ProxyType] = npc.SshForwardingConfig + m[npc.TunnelType] = npc.SshForwardingConfig default: - panic(fmt.Sprintf("Implementation of MarshalJSON for %s is missing.", npc.ProxyType)) + panic(fmt.Sprintf("Implementation of MarshalJSON for %s is missing.", npc.TunnelType)) } return json.Marshal(m) @@ -76,13 +76,13 @@ func (npc *NetworkProxyConfig) MarshalJSON() ([]byte, error) { const defaultTimeoutSecs = 5 -func (npc *NetworkProxyConfig) Start() error { +func (npc *NetworkTunnelConfig) Start() error { return npc.startInternal(defaultTimeoutSecs, os.Stderr) } -func (npc *NetworkProxyConfig) startInternal(timeoutSecs uint16, stderr io.Writer) error { +func (npc *NetworkTunnelConfig) startInternal(timeoutSecs uint16, stderr io.Writer) error { if npc == nil { - // NetworkProxyConfig is not set. + // NetworkTunnelConfig is not set. return nil } @@ -117,7 +117,7 @@ func (npc *NetworkProxyConfig) startInternal(timeoutSecs uint16, stderr io.Write } } -func (npc *NetworkProxyConfig) sendInput(cmd *exec.Cmd) error { +func (npc *NetworkTunnelConfig) sendInput(cmd *exec.Cmd) error { stdin, err := cmd.StdinPipe() if err != nil { return fmt.Errorf("getting stdin pipe: %w", err) @@ -131,7 +131,7 @@ func (npc *NetworkProxyConfig) sendInput(cmd *exec.Cmd) error { go func() { if _, err := stdin.Write(input); err != nil { - panic("Failed to send input to network-proxy-service binary.") + panic("Failed to send input to network-tunnel-service binary.") } stdin.Close() }() diff --git a/go/network-proxy/networkproxy_test.go b/go/network-tunnel/networktunnel_test.go similarity index 98% rename from go/network-proxy/networkproxy_test.go rename to go/network-tunnel/networktunnel_test.go index 16c9db37ad..93ea2ae8a4 100644 --- a/go/network-proxy/networkproxy_test.go +++ b/go/network-tunnel/networktunnel_test.go @@ -1,4 +1,4 @@ -package networkproxy +package networktunnel /* import ( diff --git a/go/network-proxy/sshforwarding/sshforwarding.go b/go/network-tunnel/sshforwarding/sshforwarding.go similarity index 100% rename from go/network-proxy/sshforwarding/sshforwarding.go rename to go/network-tunnel/sshforwarding/sshforwarding.go diff --git a/go/network-proxy/sshforwarding/sshforwarding_test.go b/go/network-tunnel/sshforwarding/sshforwarding_test.go similarity index 100% rename from go/network-proxy/sshforwarding/sshforwarding_test.go rename to go/network-tunnel/sshforwarding/sshforwarding_test.go diff --git a/go/network-proxy/testutil.go b/go/network-tunnel/testutil.go similarity index 75% rename from go/network-proxy/testutil.go rename to go/network-tunnel/testutil.go index 655616907f..4bed78074e 100644 --- a/go/network-proxy/testutil.go +++ b/go/network-tunnel/testutil.go @@ -1,20 +1,20 @@ -package networkproxy +package networktunnel import ( "encoding/base64" "os" - sf "github.com/estuary/flow/go/network-proxy/sshforwarding" + sf 
"github.com/estuary/flow/go/network-tunnel/sshforwarding" ) // Configuration set based on sshforwarding/test_sshd_configs/docker-compose.yaml. -func CreateSshForwardingTestConfig(keyFilePath string, remotePort uint16) (*NetworkProxyConfig, error) { +func CreateSshForwardingTestConfig(keyFilePath string, remotePort uint16) (*NetworkTunnelConfig, error) { var b, err = os.ReadFile(keyFilePath) if err != nil { return nil, err } - return &NetworkProxyConfig{ - ProxyType: "sshForwarding", + return &NetworkTunnelConfig{ + TunnelType: "sshForwarding", SshForwardingConfig: sf.SshForwardingConfig{ SshEndpoint: "ssh://127.0.0.1:2222", SshPrivateKeyBase64: base64.RawStdEncoding.EncodeToString(b), diff --git a/site/docs/concepts/connectors.md b/site/docs/concepts/connectors.md index c7f2f0d703..848b950934 100644 --- a/site/docs/concepts/connectors.md +++ b/site/docs/concepts/connectors.md @@ -567,7 +567,7 @@ materializations: database: flow user: flow_user password: secret - networkProxy: + networkTunnel: sshForwarding: # Port on the local machine from which you'll connect to the SSH server. # If a port is specified elsewhere in the connector configuration, it must match. diff --git a/tests/sshforwarding/materialize-postgres.ssh.config.yaml b/tests/sshforwarding/materialize-postgres.ssh.config.yaml index a5a9e8ebcb..c12e23dd4c 100644 --- a/tests/sshforwarding/materialize-postgres.ssh.config.yaml +++ b/tests/sshforwarding/materialize-postgres.ssh.config.yaml @@ -3,7 +3,7 @@ port: 16666 user: flow password: flow database: flow -networkProxy: +networkTunnel: sshForwarding: localPort: 16666 forwardHost: localhost @@ -49,4 +49,4 @@ networkProxy: KySvYOfiD8waRu2Gf7IqCHdgKBi7AkE45w72GhC+GOoDNMFgnlUgoDeRzxS7idf4 MIVS3sQzezB78ZAuZx0IkH8PxgqRI/D4CK9QBC0b2IT1xmqe5LCGhsMHSvScPLV3 Uu2cs5FkJUnkRpwup7KEfJfZG80DHP81GTsioAt40igx6gVAkIo= - -----END RSA PRIVATE KEY----- \ No newline at end of file + -----END RSA PRIVATE KEY----- From 8f2727ffbb295e8cf6bb72a69828a1afea6a4b90 Mon Sep 17 00:00:00 2001 From: Mahdi Dibaiee Date: Thu, 14 Apr 2022 12:39:08 +0000 Subject: [PATCH 9/9] operate on line bytes to allow handling of plaintext lines --- Cargo.lock | 11 + crates/connector_proxy/Cargo.toml | 1 + .../network_tunnel_capture_interceptor.rs | 6 +- .../network_tunnel_materialize_interceptor.rs | 6 +- crates/connector_proxy/src/libs/stream.rs | 247 ++++++++++-------- 5 files changed, 154 insertions(+), 117 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 243e70208a..d636f2adcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -450,6 +450,16 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" +[[package]] +name = "bytelines" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "784face321c535fcd9a1456632fa720aa53ea0640b57341d961c8c09de2da59f" +dependencies = [ + "futures", + "tokio", +] + [[package]] name = "byteorder" version = "1.4.3" @@ -663,6 +673,7 @@ name = "connector_proxy" version = "0.0.0" dependencies = [ "async-trait", + "bytelines", "byteorder", "bytes", "clap 3.1.8", diff --git a/crates/connector_proxy/Cargo.toml b/crates/connector_proxy/Cargo.toml index ed5fe19d9d..9cf16674ad 100644 --- a/crates/connector_proxy/Cargo.toml +++ b/crates/connector_proxy/Cargo.toml @@ -33,5 +33,6 @@ tempfile="*" thiserror = "*" tokio = { version = "1.15.0", features = ["full"] } tokio-util = { version = "*", features = ["io"] } +bytelines = "*" tracing="*" validator = { version = 
"*", features = ["derive"] } \ No newline at end of file diff --git a/crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs b/crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs index ece41962c0..2d05e61b72 100644 --- a/crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_tunnel_capture_interceptor.rs @@ -2,14 +2,14 @@ use crate::apis::{FlowCaptureOperation, InterceptorStream}; use crate::errors::{Error, Must}; use crate::libs::network_tunnel::NetworkTunnel; use crate::libs::protobuf::{decode_message, encode_message}; -use crate::libs::stream::{get_decoded_message, stream_all_bytes}; +use crate::libs::stream::get_decoded_message; use futures::{future, stream, StreamExt, TryStreamExt}; use protocol::capture::{ ApplyRequest, DiscoverRequest, PullRequest, SpecResponse, ValidateRequest, }; use serde_json::value::RawValue; -use tokio_util::io::StreamReader; +use tokio_util::io::{ReaderStream, StreamReader}; pub struct NetworkTunnelCaptureInterceptor {} @@ -81,7 +81,7 @@ impl NetworkTunnelCaptureInterceptor { } let first = stream::once(future::ready(encode_message(&request))); - let rest = stream_all_bytes(reader); + let rest = ReaderStream::new(reader); // We need to set explicit error type, see https://github.com/rust-lang/rust/issues/63502 Ok::<_, std::io::Error>(first.chain(rest)) diff --git a/crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs b/crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs index fe1ead6840..13ba1f3272 100644 --- a/crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs +++ b/crates/connector_proxy/src/interceptors/network_tunnel_materialize_interceptor.rs @@ -2,13 +2,13 @@ use crate::apis::{FlowMaterializeOperation, InterceptorStream}; use crate::errors::{Error, Must}; use crate::libs::network_tunnel::NetworkTunnel; use crate::libs::protobuf::{decode_message, encode_message}; -use crate::libs::stream::{get_decoded_message, stream_all_bytes}; +use crate::libs::stream::get_decoded_message; use futures::{future, stream, StreamExt, TryStreamExt}; use protocol::materialize::{ApplyRequest, SpecResponse, TransactionRequest, ValidateRequest}; use serde_json::value::RawValue; -use tokio_util::io::StreamReader; +use tokio_util::io::{ReaderStream, StreamReader}; pub struct NetworkTunnelMaterializeInterceptor {} @@ -63,7 +63,7 @@ impl NetworkTunnelMaterializeInterceptor { } } let first = stream::once(future::ready(encode_message(&request))); - let rest = stream_all_bytes(reader); + let rest = ReaderStream::new(reader); // We need to set explicit error type, see https://github.com/rust-lang/rust/issues/63502 Ok::<_, std::io::Error>(first.chain(rest)) diff --git a/crates/connector_proxy/src/libs/stream.rs b/crates/connector_proxy/src/libs/stream.rs index a38c553395..4649376b51 100644 --- a/crates/connector_proxy/src/libs/stream.rs +++ b/crates/connector_proxy/src/libs/stream.rs @@ -2,111 +2,65 @@ use crate::libs::airbyte_catalog::Message; use crate::{apis::InterceptorStream, errors::create_custom_error}; use crate::errors::raise_err; -use bytes::{Buf, Bytes, BytesMut}; -use futures::{stream, StreamExt, TryStream, TryStreamExt}; -use serde_json::{Deserializer, Value}; -use tokio::io::{AsyncRead, AsyncReadExt}; +use bytelines::AsyncByteLines; +use bytes::Bytes; +use futures::{StreamExt, TryStream, TryStreamExt}; use tokio_util::io::StreamReader; use validator::Validate; 
+use super::airbyte_catalog::{Log, LogLevel, MessageType};
 use super::protobuf::decode_message;
 
-pub fn stream_all_bytes<R: 'static + AsyncRead + Unpin + Send + Sync>(
-    reader: R,
+// Creates a stream of line byte-chunks from the given stream.
+// This allows our other methods, such as stream_airbyte_responses, to operate
+// on lines, simplifying their logic.
+pub fn stream_lines(
+    in_stream: InterceptorStream,
 ) -> impl TryStream<Item = std::io::Result<Bytes>, Error = std::io::Error, Ok = bytes::Bytes> {
-    stream::try_unfold(reader, |mut r| async {
-        // consistent with the default capacity of ReaderStream.
-        // https://github.com/tokio-rs/tokio/blob/master/tokio-util/src/io/reader_stream.rs#L8
-        let mut buf = BytesMut::with_capacity(4096);
-        match r.read_buf(&mut buf).await {
-            Ok(0) => Ok(None),
-            Ok(_) => Ok(Some((Bytes::from(buf), r))),
-            Err(e) => raise_err(&format!("error during streaming {:?}.", e)),
-        }
-    })
+    AsyncByteLines::new(StreamReader::new(in_stream))
+        .into_stream()
+        .map_ok(Bytes::from)
 }
 
-/// Given a stream of bytes, try to deserialize them into Airbyte Messages.
+/// Given a stream of lines, try to deserialize them into Airbyte Messages.
 /// This can be used when reading responses from the Airbyte connector, and will
 /// handle validation of messages as well as handling of AirbyteLogMessages.
-/// Will ignore* messages that cannot be parsed to an AirbyteMessage.
+/// Will ignore* lines that cannot be parsed to an AirbyteMessage.
 /// * See https://docs.airbyte.com/understanding-airbyte/airbyte-specification#the-airbyte-protocol
 pub fn stream_airbyte_responses(
     in_stream: InterceptorStream,
 ) -> impl TryStream<Item = std::io::Result<Message>, Ok = Message, Error = std::io::Error> {
-    stream::once(async {
-        let mut buf = BytesMut::new();
-        let items = in_stream
-            .map(move |bytes| {
-                let b = bytes?;
-                buf.extend_from_slice(b.chunk());
-                let chunk = buf.chunk();
-
-                // Deserialize to Value first, instead of Message, to avoid missing 'is_eof' signals in error.
-                let deserializer = Deserializer::from_slice(chunk);
-                let mut value_stream = deserializer.into_iter::<Value>();
-
-                // Turn Values into Messages and validate them
-                let values: Vec<std::io::Result<Message>> = value_stream
-                    .by_ref()
-                    .map_while(|value| match value {
-                        Ok(v) => Some(Ok(v)),
-                        Err(e) => {
-                            // we must stop as soon as we hit EOF to avoid
-                            // progressing value_stream.byte_offset() so that we can
-                            // safely drop the buffer up to byte_offset() and pick up the leftovers
-                            // when working with the next bytes
-                            if e.is_eof() {
-                                return None;
-                            }
-
-                            Some(raise_err(&format!(
-                                "error in decoding JSON: {:?}, {:?}",
-                                e,
-                                std::str::from_utf8(chunk)
-                            )))
-                        }
+    stream_lines(in_stream).try_filter_map(|line| async move {
+        let message: Message = match serde_json::from_slice(&line) {
+            Ok(m) => m,
+            Err(e) => {
+                // It is currently ambiguous whether the Airbyte protocol specification
+                // forbids plaintext output on stdout, so we handle any line that fails
+                // to parse by logging the issue rather than failing outright.
+                Message {
+                    message_type: MessageType::Log,
+                    connection_status: None,
+                    state: None,
+                    record: None,
+                    spec: None,
+                    catalog: None,
+                    log: Some(Log {
+                        level: LogLevel::Debug,
+                        message: format!("Encountered error while trying to parse Airbyte Message: {:?} in line {:?}", e, line)
                     })
-                    .map(|value| match value {
-                        Ok(v) => {
-                            let message: Message = match serde_json::from_value(v) {
-                                Ok(m) => m,
-                                // We ignore JSONs that are not Airbyte Messages according
-                                // to the specification:
-                                // https://docs.airbyte.com/understanding-airbyte/airbyte-specification#the-airbyte-protocol
-                                Err(_) => return Ok(None),
-                            };
-
-                            message.validate().map_err(|e| {
-                                create_custom_error(&format!("error in validating message {:?}", e))
-                            })?;
-
-                            tracing::debug!("read message: {:?}", &message);
-                            Ok(Some(message))
-                        }
-                        Err(e) => Err(e),
-                    })
-                    // Flipping the Option and Result to filter out the None values
-                    .filter_map(|value| match value {
-                        Ok(Some(v)) => Some(Ok(v)),
-                        Ok(None) => None,
-                        Err(e) => Some(Err(e)),
-                    })
-                    .collect();
-
-                let byte_offset = value_stream.byte_offset();
-                drop(buf.split_to(byte_offset));
+            }
+        }
+    };
 
-                Ok::<_, std::io::Error>(stream::iter(values))
-            })
-            .try_flatten();
+        message
+            .validate()
+            .map_err(|e| create_custom_error(&format!("error in validating message {:?}", e)))?;
 
-        // We need to set explicit error type, see https://github.com/rust-lang/rust/issues/63502
-        Ok::<_, std::io::Error>(items)
+        Ok(Some(message))
     })
-    .try_flatten()
-    // Handle logs here so we don't have to worry about them everywhere else
     .try_filter_map(|message| async {
+        // For AirbyteLogMessages, log them and then filter them out
+        // so that we don't have to handle them elsewhere
         if let Some(log) = message.log {
             log.log();
             Ok(None)
@@ -159,21 +113,46 @@ where
 
 #[cfg(test)]
 mod test {
-    use futures::future;
+    use std::{collections::HashMap, pin::Pin};
+
+    use bytes::BytesMut;
+    use futures::stream;
+    use protocol::{
+        flow::EndpointType,
+        materialize::{validate_request, ValidateRequest},
+    };
+    use tokio_util::io::ReaderStream;
 
-    use crate::libs::airbyte_catalog::{ConnectionStatus, MessageType, Status};
+    use crate::libs::{
+        airbyte_catalog::{ConnectionStatus, MessageType, Status},
+        protobuf::encode_message,
+    };
 
     use super::*;
 
+    fn create_stream<T>(
+        input: Vec<T>,
+    ) -> Pin<Box<impl TryStream<Item = std::io::Result<T>, Ok = T, Error = std::io::Error>>> {
+        Box::pin(stream::iter(input.into_iter().map(Ok::<T, std::io::Error>)))
+    }
+
     #[tokio::test]
-    async fn test_stream_all_bytes() {
-        let input = "{\"test\": \"hello\"}".as_bytes();
-        let stream = stream::once(future::ready(Ok::<_, std::io::Error>(input)));
-        let reader = StreamReader::new(stream);
-        let mut all_bytes = Box::pin(stream_all_bytes(reader));
-
-        let result = all_bytes.next().await.unwrap().unwrap();
-        assert_eq!(result.chunk(), input);
+    async fn test_stream_lines() {
+        let line_0 = "{\"test\": \"hello\"}".as_bytes();
+        let line_1 = "other".as_bytes();
+        let line_2 = "{\"object\": {}}".as_bytes();
+        let newline = "\n".as_bytes();
+        let mut input = BytesMut::new();
+        input.extend_from_slice(line_0);
+        input.extend_from_slice(newline);
+        input.extend_from_slice(line_1);
+        input.extend_from_slice(newline);
+        input.extend_from_slice(line_2);
+        let stream = create_stream(vec![Bytes::from(input)]);
+        let all_bytes = Box::pin(stream_lines(stream));
+
+        let result: Vec<Bytes> = all_bytes.try_collect::<Vec<Bytes>>().await.unwrap();
+        assert_eq!(result, vec![line_0, line_1, line_2]);
     }
 
     #[tokio::test]
@@ -191,16 +170,12 @@ mod test {
             }),
         };
         let input = vec![
-            Ok::<_, std::io::Error>(
-                "{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {".as_bytes(),
-            ),
-            Ok::<_, std::io::Error>("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}".as_bytes()),
+            Bytes::from("{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {"),
+            Bytes::from("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}"),
         ];
-        let stream = stream::iter(input);
-        let reader = StreamReader::new(stream);
+        let stream = create_stream(input);
 
-        let byte_stream = Box::pin(stream_all_bytes(reader));
-        let mut messages = Box::pin(stream_airbyte_responses(byte_stream));
+        let mut messages = Box::pin(stream_airbyte_responses(stream));
 
         let result = messages.next().await.unwrap().unwrap();
         assert_eq!(
@@ -224,16 +199,43 @@ mod test {
             }),
         };
         let input = vec![
-            Ok::<_, std::io::Error>(
-                "{}\n{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {".as_bytes(),
+            Bytes::from("{}\n{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {"),
+            Bytes::from("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}"),
+        ];
+        let stream = create_stream(input);
+
+        let mut messages = Box::pin(stream_airbyte_responses(stream));
+
+        let result = messages.next().await.unwrap().unwrap();
+        assert_eq!(
+            result.connection_status.unwrap(),
+            input_message.connection_status.unwrap()
+        );
+    }
+
+    #[tokio::test]
+    async fn test_stream_airbyte_responses_plaintext_mixed() {
+        let input_message = Message {
+            message_type: MessageType::ConnectionStatus,
+            log: None,
+            state: None,
+            record: None,
+            spec: None,
+            catalog: None,
+            connection_status: Some(ConnectionStatus {
+                status: Status::Succeeded,
+                message: Some("test".to_string()),
+            }),
+        };
+        let input = vec![
+            Bytes::from(
+                "I am plaintext!\n{\"type\": \"CONNECTION_STATUS\", \"connectionStatus\": {",
             ),
-            Ok::<_, std::io::Error>("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}".as_bytes()),
+            Bytes::from("\"status\": \"SUCCEEDED\",\"message\":\"test\"}}"),
         ];
-        let stream = stream::iter(input);
-        let reader = StreamReader::new(stream);
+        let stream = create_stream(input);
 
-        let byte_stream = Box::pin(stream_all_bytes(reader));
-        let mut messages = Box::pin(stream_airbyte_responses(byte_stream));
+        let mut messages = Box::pin(stream_airbyte_responses(stream));
 
         let result = messages.next().await.unwrap().unwrap();
         assert_eq!(
             result.connection_status.unwrap(),
             input_message.connection_status.unwrap()
         );
     }
+
+    #[tokio::test]
+    async fn test_get_decoded_message() {
+        let msg = ValidateRequest {
+            materialization: "materialization".to_string(),
+            endpoint_type: EndpointType::AirbyteSource.into(),
+            endpoint_spec_json: "{}".to_string(),
+            bindings: vec![validate_request::Binding {
+                resource_spec_json: "{}".to_string(),
+                collection: None,
+                field_config_json: HashMap::new(),
+            }],
+        };
+
+        let msg_buf = encode_message(&msg).unwrap();
+
+        let stream = Box::pin(ReaderStream::new(std::io::Cursor::new(msg_buf)));
+        let result = get_decoded_message::<ValidateRequest>(stream)
+            .await
+            .unwrap();
+
+        assert_eq!(result, msg);
+    }
 }
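The tests above pin down the key behavioral change of this patch: responses are consumed line by line, and a line that is not valid Airbyte-protocol JSON is demoted to a debug log instead of failing the whole stream. That tolerance can be sketched independently of the crate's types; the Message struct below is a trimmed stand-in for the crate's airbyte_catalog::Message, assuming serde and serde_json as dependencies (an illustration of the technique, not the crate's implementation):

    use serde::Deserialize;

    // Trimmed stand-in for the crate's airbyte_catalog::Message.
    #[derive(Deserialize, Debug)]
    struct Message {
        #[serde(rename = "type")]
        message_type: String,
    }

    // Parse each line independently; non-JSON lines are logged and skipped,
    // mirroring the try_filter_map fallback in stream_airbyte_responses.
    fn parse_lines(lines: &[&[u8]]) -> Vec<Message> {
        lines
            .iter()
            .copied()
            .filter_map(|line| match serde_json::from_slice::<Message>(line) {
                Ok(m) => Some(m),
                Err(e) => {
                    eprintln!(
                        "ignoring non-protocol line {:?}: {}",
                        String::from_utf8_lossy(line),
                        e
                    );
                    None
                }
            })
            .collect()
    }

    fn main() {
        let lines: Vec<&[u8]> = vec![
            br#"{"type": "CONNECTION_STATUS"}"# as &[u8],
            b"I am plaintext!",
            br#"{"type": "RECORD"}"#,
        ];
        let parsed = parse_lines(&lines);
        assert_eq!(parsed.len(), 2); // the plaintext line was skipped, not fatal
        println!("{:?}", parsed);
    }

The design choice this mirrors: because connectors may write arbitrary plaintext to stdout alongside protocol messages, treating an unparsable line as fatal would make the proxy brittle, while logging it preserves the information for debugging.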