diff --git a/sxg_rs/src/headers.rs b/sxg_rs/src/headers.rs
index f3dad47d..72be6871 100644
--- a/sxg_rs/src/headers.rs
+++ b/sxg_rs/src/headers.rs
@@ -16,6 +16,8 @@ use crate::http_parser::{
     parse_accept_header,
     parse_cache_control_header,
     parse_content_type_header,
+    parse_link_header,
+    link::Link,
     media_type::MediaType,
 };
 use std::collections::{BTreeSet, HashMap, HashSet};
@@ -24,6 +26,7 @@ use crate::http::HeaderFields;
 use serde::Deserialize;
 use std::cmp::min;
 use std::time::Duration;
+use url::Url;
 
 #[derive(Debug)]
 pub struct Headers(HashMap<String, String>);
@@ -124,14 +127,60 @@ impl Headers {
         }
         Ok(())
     }
-    pub fn get_signed_headers_bytes(&self, status_code: u16, mice_digest: &[u8]) -> Vec<u8> {
-        use crate::cbor::DataItem;
+    // Filters the link header to comply with
+    // https://github.com/google/webpackager/blob/main/docs/cache_requirements.md.
+    fn process_link_header(value: &str, fallback_url: &Url) -> String {
+        static ALLOWED_PARAM: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+            vec!["as", "header-integrity", "media", "rel", "imagesrcset", "imagesizes", "crossorigin"].into_iter().collect()});
+        static ALLOWED_REL: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+            vec!["preload", "allowed-alt-sxg"].into_iter().collect()});
+        static ALLOWED_CROSSORIGIN: Lazy<HashSet<&'static str>> = Lazy::new(|| {
+            vec!["", "anonymous"].into_iter().collect()});
+        match parse_link_header(value) {
+            Ok(links) => {
+                let mut count = 0;
+                links.into_iter().filter_map(|link| {
+                    let uri: String = fallback_url.join(&link.uri).ok()?.into();
+                    let params_valid = link.params.iter().all(|(k, v)|
+                        ALLOWED_PARAM.contains(k) &&
+                        match *k {
+                            "rel" => matches!(v, Some(v) if ALLOWED_REL.contains(v.as_str())),
+                            "crossorigin" => matches!(v, Some(v) if ALLOWED_CROSSORIGIN.contains(v.as_str())),
+                            _ => true,
+                        }
+                    );
+                    if params_valid {
+                        if link.params.iter().any(|(k, v)| *k == "rel" && matches!(v, Some(v) if v == "preload")) {
+                            if count >= 20 {
+                                return None
+                            }
+                            count += 1;
+                        }
+                        Some(Link{uri: &uri, ..link}.serialize())
+                    } else {
+                        None
+                    }
+                }).collect::<Vec<String>>().join(",")
+            },
+            Err(_) => "".into(),
+        }
+    }
+    // Returns the signed headers via the serializer callback instead of return
+    // value, because it contains a mix of &str and String. This makes it easy
+    // to test the intermediate Vec<(&str, &str)> without sacrificing
+    // performance by copying it into a Vec<(String, String)>.
+    fn get_signed_headers<O, F>(&self, fallback_url: &Url, status_code: u16, mice_digest: &[u8], serializer: F) -> O
+            where F: Fn(Vec<(&str, &str)>) -> O {
         let connection = self.connection_headers();
         let mut fields: Vec<(&str, &str)> = vec![];
         let html = self.0.get("content-type").map_or(false, |t|
             matches!(parse_content_type_header(t),
                      Ok(MediaType {primary_type, sub_type, ..})
                          if primary_type.eq_ignore_ascii_case("text") && sub_type.eq_ignore_ascii_case("html")));
+        let link = self.0.get("link").map_or("".into(), |value| Self::process_link_header(value, fallback_url));
+        if !link.is_empty() {
+            fields.push(("link", &link));
+        }
         for (k, v) in self.0.iter() {
             if STRIP_RESPONSE_HEADERS.contains(k.as_str()) || DONT_SIGN_RESPONSE_HEADERS.contains(k.as_str()) || connection.contains(k) {
                 continue;
@@ -139,6 +188,10 @@ impl Headers {
             if !html && (STRIP_SUBRESOURCE_RESPONSE_HEADERS.contains(k.as_str()) || crate::id_headers::ID_HEADERS.contains(k.as_str())) {
                 continue;
             }
+            if k == "link" {
+                // Handled above.
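+                // The processed value (if any) was already pushed into `fields`
+                // before this loop, so skip the raw link header here.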
+                continue;
+            }
             fields.push((k, v));
         }
         let status_code = status_code.to_string();
@@ -146,12 +199,18 @@ impl Headers {
         fields.push((":status", &status_code));
         fields.push(("content-encoding", "mi-sha256-03"));
         fields.push(("digest", &digest));
-        let cbor_data = DataItem::Map(
-            fields.iter().map(|(key, value)| {
-                (DataItem::ByteString(key.as_bytes()), DataItem::ByteString(value.as_bytes()))
-            }).collect()
-        );
-        cbor_data.serialize()
+        serializer(fields)
+    }
+    pub fn get_signed_headers_bytes(&self, fallback_url: &Url, status_code: u16, mice_digest: &[u8]) -> Vec<u8> {
+        self.get_signed_headers(fallback_url, status_code, mice_digest, |fields| {
+            use crate::cbor::DataItem;
+            let cbor_data = DataItem::Map(
+                fields.iter().map(|(key, value)| {
+                    (DataItem::ByteString(key.as_bytes()), DataItem::ByteString(value.as_bytes()))
+                }).collect()
+            );
+            cbor_data.serialize()
+        })
     }
     // Connection-specific headers per
     // https://datatracker.ietf.org/doc/html/rfc7230#section-6.1.
@@ -425,12 +484,79 @@ mod tests {
         assert_eq!(headers(vec![("cache-control", "max=, max-age=3600")]).signature_duration().unwrap(), SEVEN_DAYS);
     }
 
-    // === get_signed_headers_bytes ===
+    // === process_link_header ===
+    #[test]
+    fn process_link_header() {
+        use std::iter::repeat;
+        let url = Url::parse("https://foo.com").unwrap();
+        assert_eq!(Headers::process_link_header(r#"</foo> ; rel = "preload""#, &url),
+                   "<https://foo.com/foo>;rel=preload");
+        {
+            let link = "<https://foo.com/>;rel=preload";
+            assert_eq!(Headers::process_link_header(&repeat(link).take(21).collect::<Vec<&str>>().join(","), &url),
+                       repeat(link).take(20).collect::<Vec<&str>>().join(","));
+        }
+        {
+            let link = r#"<https://foo.com/>;rel=preload,<https://foo.com/>;rel=allowed-alt-sxg;header-integrity="sha256-OcpYAC5zFQtAXUURzXkMDDxMbxuEeWVjdRCDcLcBhBY=""#;
+            assert_eq!(Headers::process_link_header(&repeat(link).take(21).collect::<Vec<&str>>().join(","), &url),
+                       repeat(link).take(20).collect::<Vec<&str>>().join(",") + r#",<https://foo.com/>;rel=allowed-alt-sxg;header-integrity="sha256-OcpYAC5zFQtAXUURzXkMDDxMbxuEeWVjdRCDcLcBhBY=""#);
+        }
+        assert_eq!(Headers::process_link_header("</foo>;rel=preload", &url),
+                   "<https://foo.com/foo>;rel=preload");
+        assert_eq!(Headers::process_link_header("<../quux>;rel=preload", &url.join("/bar/baz/").unwrap()),
+                   "<https://foo.com/bar/quux>;rel=preload");
+        assert_eq!(Headers::process_link_header("</foo>;rel=prefetch", &url),
+                   "");
+        assert_eq!(Headers::process_link_header("</foo>;other", &url),
+                   "");
+        assert_eq!(Headers::process_link_header("</foo>;rel=preload,</bar>;rel=prefetch", &url),
+                   "<https://foo.com/foo>;rel=preload");
+        assert_eq!(Headers::process_link_header(r#"</foo>;rel=preload;as=image;imagesizes=800px;imagesrcset="img.jpg 800w""#, &url),
+                   r#"<https://foo.com/foo>;rel=preload;as=image;imagesizes=800px;imagesrcset="img.jpg 800w""#);
+    }
+
+    // === get_signed_headers ===
     #[test]
     fn strip_id_headers() {
-        assert_eq!(headers(vec![("content-type", "image/jpeg"), ("x-request-id", "abcdef123")]).get_signed_headers_bytes(200, &[]),
+        let url = Url::parse("https://foo.com").unwrap();
+        assert_eq!(headers(vec![("content-type", "image/jpeg"), ("x-request-id", "abcdef123")]).get_signed_headers::<BTreeMap<String, String>, _>(&url, 200, &[], header_fields),
+                   header_fields::<BTreeMap<String, String>>(vec![
+                       ("content-type", "image/jpeg"),
+                       // x-request-id is missing
+                       (":status", "200"),
+                       ("content-encoding", "mi-sha256-03"),
+                       ("digest", "mi-sha256-03=")]));
+        assert_eq!(headers(vec![("content-type", "text/html;charset=utf-8"), ("x-request-id", "abcdef123")]).get_signed_headers::<BTreeMap<String, String>, _>(&url, 200, &[], header_fields),
+                   header_fields::<BTreeMap<String, String>>(vec![
+                       ("content-type", "text/html;charset=utf-8"),
+                       ("x-request-id", "abcdef123"),
+                       (":status", "200"),
+                       ("content-encoding", "mi-sha256-03"),
+                       ("digest", "mi-sha256-03=")]));
+    }
+    #[test]
+    fn includes_link_if_valid() {
+        let url = Url::parse("https://foo.com").unwrap();
+        assert_eq!(headers(vec![("content-type", "text/html"), ("link", "</foo>;rel=preload")]).get_signed_headers::<BTreeMap<String, String>, _>(&url, 200, &[], header_fields),
+                   header_fields::<BTreeMap<String, String>>(vec![
+                       ("content-type", "text/html"),
+                       ("link", "<https://foo.com/foo>;rel=preload"),
+                       (":status", "200"),
+                       ("content-encoding", "mi-sha256-03"),
+                       ("digest", "mi-sha256-03=")]));
+        assert_eq!(headers(vec![("content-type", "text/html"), ("link", r#"</foo>;rel=prefetch"#)]).get_signed_headers::<BTreeMap<String, String>, _>(&url, 200, &[], header_fields),
+                   header_fields::<BTreeMap<String, String>>(vec![
+                       ("content-type", "text/html"),
+                       (":status", "200"),
+                       ("content-encoding", "mi-sha256-03"),
+                       ("digest", "mi-sha256-03=")]));
+    }
+
+    // === get_signed_headers_bytes ===
+    #[test]
+    fn get_signed_headers_bytes() {
+        let url = Url::parse("https://foo.com").unwrap();
+        assert_eq!(headers(vec![("content-type", "image/jpeg")]).get_signed_headers_bytes(&url, 200, &[]),
                    b"\xA4FdigestMmi-sha256-03=G:statusC200Lcontent-typeJimage/jpegPcontent-encodingLmi-sha256-03");
-        assert_eq!(headers(vec![("content-type", "text/html;charset=utf-8"), ("x-request-id", "abcdef123")]).get_signed_headers_bytes(200, &[]),
-                   b"\xA5FdigestMmi-sha256-03=G:statusC200Lcontent-typeWtext/html;charset=utf-8Lx-request-idIabcdef123Pcontent-encodingLmi-sha256-03");
     }
 }
diff --git a/sxg_rs/src/http_parser/base.rs b/sxg_rs/src/http_parser/base.rs
index 8d8aa32f..734e064e 100644
--- a/sxg_rs/src/http_parser/base.rs
+++ b/sxg_rs/src/http_parser/base.rs
@@ -35,7 +35,7 @@ pub fn token(input: &str) -> IResult<&str, &str> {
     take_while1(is_tchar)(input)
 }
 
-fn is_tchar(c: char) -> bool {
+pub fn is_tchar(c: char) -> bool {
     match c {
         '!' | '#' | '$' | '%' | '&' | '\'' | '*' => true,
         '+' | '-' | '.' | '^' | '_' | '`' | '|' | '~' => true,
@@ -83,7 +83,7 @@ fn is_qdtext(c: char) -> bool {
     }
 }
 
-fn is_quoted_pair_payload(c: char) -> bool {
+pub fn is_quoted_pair_payload(c: char) -> bool {
     match c {
         '\t' | ' ' => true,
         '\x21'..='\x7E' => true,
diff --git a/sxg_rs/src/http_parser/link.rs b/sxg_rs/src/http_parser/link.rs
new file mode 100644
index 00000000..a844dfa8
--- /dev/null
+++ b/sxg_rs/src/http_parser/link.rs
@@ -0,0 +1,133 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use nom::{
+    IResult,
+    bytes::complete::take_while,
+    character::complete::char,
+    combinator::{map, opt},
+    multi::many0,
+    sequence::{delimited, pair, preceded, terminated, tuple},
+};
+use super::base::{
+    is_quoted_pair_payload,
+    is_tchar,
+    ows,
+    parameter_value,
+    token,
+};
+
+// Represents an individual link directive i.e. an instance of `link-value`
+// from https://datatracker.ietf.org/doc/html/rfc8288#section-3.
+// Parameters with alternate character encodings (via RFC8187) are not
+// supported.
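+// For example, `</foo>;bar;baz=quux` is represented as
+// `Link { uri: "/foo", params: vec![("bar", None), ("baz", Some("quux".into()))] }`
+// (see the `parse` test below).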
+#[derive(Clone, Debug, PartialEq)]
+pub struct Link<'a> {
+    pub uri: &'a str,
+    pub params: Vec<(&'a str, Option<String>)>,
+}
+
+fn quote(value: &str) -> Option<String> {
+    if value.chars().all(|c| is_tchar(c)) {
+        Some(value.into())
+    } else if value.chars().all(|c| is_quoted_pair_payload(c)) {
+        Some("\"".to_string() + &value.chars().map(|c: char| {
+            if c == '\\' || c == '"' {
+                format!("\\{}", c)
+            } else {
+                format!("{}", c)
+            }
+        }).collect::<String>() + "\"")
+    } else {
+        None
+    }
+}
+
+impl <'a> Link<'a> {
+    pub fn serialize(&self) -> String {
+        "<".to_string() + self.uri + ">" +
+            &self.params.iter().filter_map(|(k, v)| {
+                Some(if let Some(v) = v {
+                    format!(";{}={}", k, quote(v)?)
+                } else {
+                    format!(";{}", k)
+                })
+            }).collect::<String>()
+    }
+}
+
+fn uri_ref(input: &str) -> IResult<&str, &str> {
+    // We don't need to fully parse the URI ref using nom. It would be
+    // sufficient to scan up until the closing delimiter '>' and then pass the
+    // result to the URL class for parsing and validation. For defense in
+    // depth, we only allow the characters specified in
+    // https://datatracker.ietf.org/doc/html/rfc3986#appendix-A.
+    take_while(|c: char|
+        match c {
+            // unreserved
+            'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '.' | '_' | '~' => true,
+            // gen-delims
+            ':' | '?' | '#' | '[' | ']' | '@' => true,
+            // sub-delims
+            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => true,
+            // pct-encoded
+            '%' => true,
+            // path
+            '/' => true,
+            _ => false,
+        }
+    )(input)
+}
+
+fn link_param<'a>(input: &'a str) -> IResult<&str, (&'a str, Option<String>)> {
+    pair(terminated(token, ows),
+         opt(preceded(pair(char('='), ows), parameter_value)))(input)
+}
+
+pub fn link(input: &str) -> IResult<&str, Link> {
+    map(pair(delimited(char('<'), uri_ref, char('>')),
+             many0(preceded(tuple((ows, char(';'), ows)), link_param))), |(uri, params)|
+        Link{uri, params}
+    )(input)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn parse() {
+        assert_eq!(link("<>").unwrap(), ("", Link{uri: "", params: vec![]}));
+        assert_eq!(link("</foo,bar;baz>").unwrap(), ("", Link{uri: "/foo,bar;baz", params: vec![]}));
+        assert_eq!(link("</foo>;bar;baz=quux").unwrap(),
+                   ("", Link{uri: "/foo",
+                             params: vec![("bar", None), ("baz", Some("quux".into()))]}));
+        assert_eq!(link(r#"</foo>;bar="baz \\\"quux""#).unwrap(),
+                   ("", Link{uri: "/foo",
+                             params: vec![("bar", Some(r#"baz \"quux"#.into()))]}));
+        assert!(matches!(link(r#"</foo>;bar="baz \""#).unwrap_err(), nom::Err::Incomplete(_)));
+    }
+    #[test]
+    fn serialize() {
+        assert_eq!(Link{uri: "/foo", params: vec![("bar", None)]}.serialize(),
+                   "</foo>;bar");
+        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some("baz".into()))]}.serialize(),
+                   "</foo>;bar=baz");
+        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some("baz quux".into()))]}.serialize(),
+                   r#"</foo>;bar="baz quux""#);
+        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some(r#"baz\"quux"#.into()))]}.serialize(),
+                   r#"</foo>;bar="baz\\\"quux""#);
+        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some("\x7f".into()))]}.serialize(),
+                   "</foo>");
+    }
+}
diff --git a/sxg_rs/src/http_parser/mod.rs b/sxg_rs/src/http_parser/mod.rs
index e4eb2a6e..455869ef 100644
--- a/sxg_rs/src/http_parser/mod.rs
+++ b/sxg_rs/src/http_parser/mod.rs
@@ -16,6 +16,7 @@ mod accept;
 mod base;
 mod cache_control;
 pub mod media_type;
+pub mod link;
 
 use nom::{
     IResult,
@@ -50,14 +51,18 @@ pub fn parse_accept_header(input: &str) -> Result<Vec<Accept>, String> {
     parse_vec(input, accept::accept)
 }
 
+// Returns the freshness lifetime for a shared cache.
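+// For example, `max-age=3600` normally yields a 3600-second lifetime; a header
+// with no explicit lifetime is reported as an error ("Freshness lifetime is
+// implicit").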
+pub fn parse_cache_control_header(input: &str) -> Result<Duration, String> {
+    let directives = parse_vec(input, cache_control::directive)?;
+    cache_control::freshness_lifetime(directives).ok_or("Freshness lifetime is implicit".into())
+}
+
 pub fn parse_content_type_header(input: &str) -> Result<MediaType, String> {
     complete(media_type::media_type)(input)
         .map(|(_, output)| output)
         .map_err(format_nom_err)
 }
 
-// Returns the freshness lifetime for a shared cache.
-pub fn parse_cache_control_header(input: &str) -> Result<Duration, String> {
-    let directives = parse_vec(input, cache_control::directive)?;
-    cache_control::freshness_lifetime(directives).ok_or("Freshness lifetime is implicit".into())
+pub fn parse_link_header(input: &str) -> Result<Vec<link::Link>, String> {
+    parse_vec(input, link::link)
 }
diff --git a/sxg_rs/src/lib.rs b/sxg_rs/src/lib.rs
index 4d7d9dc0..872e7cc8 100644
--- a/sxg_rs/src/lib.rs
+++ b/sxg_rs/src/lib.rs
@@ -89,7 +89,7 @@ impl SxgWorker {
         // 16384 is the max mice record size allowed by SXG spec.
         // https://wicg.github.io/webpackage/draft-yasskin-http-origin-signed-responses.html#section-3.5-7.9.1
         let (mice_digest, payload_body) = crate::mice::calculate(payload_body, 16384);
-        let signed_headers = payload_headers.get_signed_headers_bytes(status_code, &mice_digest);
+        let signed_headers = payload_headers.get_signed_headers_bytes(&fallback_base, status_code, &mice_digest);
         let cert_url = cert_base.join(&format!("{}{}", &self.config.cert_url_dirname, &self.cert_basename()))
             .map_err(|_| "Failed to parse cert_url_dirname")?;
         let validity_url = fallback_base.join(&format!("{}{}", &self.config.validity_url_dirname, "validity"))