Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process Link header for cache requirements. #43

Merged
merged 4 commits on Sep 3, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
150 changes: 138 additions & 12 deletions sxg_rs/src/headers.rs
Expand Up @@ -16,6 +16,8 @@ use crate::http_parser::{
parse_accept_header,
parse_cache_control_header,
parse_content_type_header,
parse_link_header,
link::Link,
media_type::MediaType,
};
use std::collections::{BTreeSet, HashMap, HashSet};
Expand All @@ -24,6 +26,7 @@ use crate::http::HeaderFields;
use serde::Deserialize;
use std::cmp::min;
use std::time::Duration;
use url::Url;

#[derive(Debug)]
pub struct Headers(HashMap<String, String>);
Expand Down Expand Up @@ -124,34 +127,90 @@ impl Headers {
}
Ok(())
}
pub fn get_signed_headers_bytes(&self, status_code: u16, mice_digest: &[u8]) -> Vec<u8> {
use crate::cbor::DataItem;
// Filters the link header to comply with
// https://github.com/google/webpackager/blob/main/docs/cache_requirements.md.
fn process_link_header(value: &str, fallback_url: &Url) -> String {
static ALLOWED_PARAM: Lazy<HashSet<&'static str>> = Lazy::new(|| {
twifkak marked this conversation as resolved.
Show resolved Hide resolved
vec!["as", "header-integrity", "media", "rel", "imagesrcset", "imagesizes", "crossorigin"].into_iter().collect()});
static ALLOWED_REL: Lazy<HashSet<&'static str>> = Lazy::new(|| {
vec!["preload", "allowed-alt-sxg"].into_iter().collect()});
static ALLOWED_CROSSORIGIN: Lazy<HashSet<&'static str>> = Lazy::new(|| {
vec!["", "anonymous"].into_iter().collect()});
match parse_link_header(value) {
Ok(links) => {
let mut count = 0;
links.into_iter().filter_map(|link| {
let uri: String = fallback_url.join(&link.uri).ok()?.into();
let params_valid = link.params.iter().all(|(k, v)|
ALLOWED_PARAM.contains(k) &&
match *k {
"rel" => matches!(v, Some(v) if ALLOWED_REL.contains(v.as_str())),
"crossorigin" => matches!(v, Some(v) if ALLOWED_CROSSORIGIN.contains(v.as_str())),
_ => true,
}
);
if params_valid {
if link.params.iter().any(|(k,v)| *k == "rel" && matches!(v, Some(v) if v == "preload")) {
if count >= 20 {
return None
}
count += 1;
}
Some(Link{uri: &uri, ..link}.serialize())
} else {
None
}
}).collect::<Vec<String>>().join(",")
},
Err(_) => "".into(),
}
}
// Returns the signed headers via the serializer callback instead of return
// value, because it contains a mix of &str and String. This makes it easy
// to test the intermediate Vec<(&str, &str)> without sacrificing
// performance by copying it into a Vec<(String, String)>.
fn get_signed_headers<O, F>(&self, fallback_url: &Url, status_code: u16, mice_digest: &[u8], serializer: F) -> O
        where F: Fn(Vec<(&str, &str)>) -> O {
    let connection = self.connection_headers();
    let mut fields: Vec<(&str, &str)> = vec![];
    // ID headers are only kept on HTML responses; detect HTML from content-type.
    let html = self.0.get("content-type").map_or(false, |t|
        matches!(parse_content_type_header(t),
            Ok(MediaType {primary_type, sub_type, ..})
            if primary_type.eq_ignore_ascii_case("text") && sub_type.eq_ignore_ascii_case("html")));
    // The link header is rewritten (not copied verbatim) so that it complies
    // with the cache requirements; an empty result omits the header entirely.
    let link = self.0.get("link").map_or("".into(), |value| Self::process_link_header(value, fallback_url));
    if !link.is_empty() {
        fields.push(("link", &link));
    }
    for (k, v) in self.0.iter() {
        if STRIP_RESPONSE_HEADERS.contains(k.as_str()) || DONT_SIGN_RESPONSE_HEADERS.contains(k.as_str()) || connection.contains(k) {
            continue;
        }
        if !html && (STRIP_SUBRESOURCE_RESPONSE_HEADERS.contains(k.as_str()) || crate::id_headers::ID_HEADERS.contains(k.as_str())) {
            continue;
        }
        if k == "link" {
            // Handled above.
            continue;
        }
        fields.push((k, v));
    }
    let status_code = status_code.to_string();
    let digest = format!("mi-sha256-03={}", ::base64::encode(&mice_digest));
    fields.push((":status", &status_code));
    fields.push(("content-encoding", "mi-sha256-03"));
    fields.push(("digest", &digest));
    serializer(fields)
}
/// Serializes the signed headers into their CBOR map encoding, with each
/// header name and value emitted as a CBOR byte string.
pub fn get_signed_headers_bytes(&self, fallback_url: &Url, status_code: u16, mice_digest: &[u8]) -> Vec<u8> {
    use crate::cbor::DataItem;
    self.get_signed_headers(fallback_url, status_code, mice_digest, |fields| {
        let entries = fields
            .iter()
            .map(|(name, value)| (DataItem::ByteString(name.as_bytes()), DataItem::ByteString(value.as_bytes())))
            .collect();
        DataItem::Map(entries).serialize()
    })
}
// Connection-specific headers per
// https://datatracker.ietf.org/doc/html/rfc7230#section-6.1.
Expand Down Expand Up @@ -425,12 +484,79 @@ mod tests {
assert_eq!(headers(vec![("cache-control", "max=, max-age=3600")]).signature_duration().unwrap(), SEVEN_DAYS);
}

// === process_link_header ===
#[test]
fn process_link_header() {
    use std::iter::repeat;
    let url = Url::parse("https://foo.com").unwrap();
    // Whitespace around ';' and '=' is normalized; unnecessary quoting is dropped.
    assert_eq!(Headers::process_link_header(r#"<https://foo.com/> ; rel = "preload""#, &url),
        "<https://foo.com/>;rel=preload");
    {
        // Only the first 20 rel=preload links are kept.
        let link = "<https://foo.com/>;rel=preload";
        assert_eq!(Headers::process_link_header(&repeat(link).take(21).collect::<Vec<&str>>().join(","), &url),
            repeat(link).take(20).collect::<Vec<&str>>().join(","));
    }
    {
        // rel=allowed-alt-sxg links don't count against the preload cap, so
        // the trailing allowed-alt-sxg link survives even when preloads are capped.
        let link = r#"<https://foo.com/>;rel=preload,<https://foo.com/>;rel=allowed-alt-sxg;header-integrity="sha256-OcpYAC5zFQtAXUURzXkMDDxMbxuEeWVjdRCDcLcBhBY=""#;
        assert_eq!(Headers::process_link_header(&repeat(link).take(21).collect::<Vec<&str>>().join(","), &url),
            repeat(link).take(20).collect::<Vec<&str>>().join(",") + r#",<https://foo.com/>;rel=allowed-alt-sxg;header-integrity="sha256-OcpYAC5zFQtAXUURzXkMDDxMbxuEeWVjdRCDcLcBhBY=""#);
    }
    // Relative URIs are resolved against the fallback URL.
    assert_eq!(Headers::process_link_header("</foo>;rel=preload", &url),
        "<https://foo.com/foo>;rel=preload");
    assert_eq!(Headers::process_link_header("<../quux>;rel=preload", &url.join("/bar/baz/").unwrap()),
        "<https://foo.com/bar/quux>;rel=preload");
    // Disallowed rel values and unknown params cause the link to be dropped.
    assert_eq!(Headers::process_link_header("<https://foo.com/>;rel=prefetch", &url),
        "");
    assert_eq!(Headers::process_link_header("<https://foo.com/>;other", &url),
        "");
    // Dropping is per-link: the valid preload survives its invalid sibling.
    assert_eq!(Headers::process_link_header("<https://foo.com/>;rel=preload,<https://foo.com/>;rel=prefetch", &url),
        "<https://foo.com/>;rel=preload");
    // Image preload params (as, imagesizes, imagesrcset) are allowed through.
    assert_eq!(Headers::process_link_header(r#"<img.jpg>;rel=preload;as=image;imagesizes=800px;imagesrcset="img.jpg 800w""#, &url),
        r#"<https://foo.com/img.jpg>;rel=preload;as=image;imagesizes=800px;imagesrcset="img.jpg 800w""#);
}

// === get_signed_headers ===
#[test]
fn strip_id_headers() {
    let url = Url::parse("https://foo.com").unwrap();
    // Non-HTML (subresource) responses: ID headers like x-request-id are stripped.
    assert_eq!(headers(vec![("content-type", "image/jpeg"), ("x-request-id", "abcdef123")]).get_signed_headers::<HashMap<String, String>, _>(&url, 200, &[], header_fields),
        header_fields::<HashMap<String, String>>(vec![
            ("content-type", "image/jpeg"),
            // x-request-id is missing
            (":status", "200"),
            ("content-encoding", "mi-sha256-03"),
            ("digest", "mi-sha256-03=")]));
    // HTML responses keep ID headers.
    assert_eq!(headers(vec![("content-type", "text/html;charset=utf-8"), ("x-request-id", "abcdef123")]).get_signed_headers::<HashMap<String, String>, _>(&url, 200, &[], header_fields),
        header_fields::<HashMap<String, String>>(vec![
            ("content-type", "text/html;charset=utf-8"),
            ("x-request-id", "abcdef123"),
            (":status", "200"),
            ("content-encoding", "mi-sha256-03"),
            ("digest", "mi-sha256-03=")]));
}
#[test]
fn includes_link_if_valid() {
    let url = Url::parse("https://foo.com").unwrap();
    // A link with an allowed rel value is included in the signed headers.
    assert_eq!(headers(vec![("content-type", "text/html"), ("link", "<https://foo.com/>;rel=preload")]).get_signed_headers::<HashMap<String, String>, _>(&url, 200, &[], header_fields),
        header_fields::<HashMap<String, String>>(vec![
            ("content-type", "text/html"),
            ("link", "<https://foo.com/>;rel=preload"),
            (":status", "200"),
            ("content-encoding", "mi-sha256-03"),
            ("digest", "mi-sha256-03=")]));
    // A link with a disallowed rel value is omitted entirely.
    assert_eq!(headers(vec![("content-type", "text/html"), ("link", r#"</foo>;rel=prefetch"#)]).get_signed_headers::<HashMap<String, String>, _>(&url, 200, &[], header_fields),
        header_fields::<HashMap<String, String>>(vec![
            ("content-type", "text/html"),
            (":status", "200"),
            ("content-encoding", "mi-sha256-03"),
            ("digest", "mi-sha256-03=")]));
}

// === get_signed_headers_bytes ===
#[test]
fn get_signed_headers_bytes() {
    let url = Url::parse("https://foo.com").unwrap();
    assert_eq!(headers(vec![("content-type", "image/jpeg")]).get_signed_headers_bytes(&url, 200, &[]),
        b"\xA4FdigestMmi-sha256-03=G:statusC200Lcontent-typeJimage/jpegPcontent-encodingLmi-sha256-03");
    // With no link header present, fallback_url has no effect on the output,
    // so the expected bytes match the pre-link-processing encoding.
    assert_eq!(headers(vec![("content-type", "text/html;charset=utf-8"), ("x-request-id", "abcdef123")]).get_signed_headers_bytes(&url, 200, &[]),
        b"\xA5FdigestMmi-sha256-03=G:statusC200Lcontent-typeWtext/html;charset=utf-8Lx-request-idIabcdef123Pcontent-encodingLmi-sha256-03");
}
}
4 changes: 2 additions & 2 deletions sxg_rs/src/http_parser/base.rs
Expand Up @@ -35,7 +35,7 @@ pub fn token(input: &str) -> IResult<&str, &str> {
take_while1(is_tchar)(input)
}

fn is_tchar(c: char) -> bool {
pub fn is_tchar(c: char) -> bool {
match c {
'!' | '#' | '$' | '%' | '&' | '\'' | '*' => true,
'+' | '-' | '.' | '^' | '_' | '`' | '|' | '~' => true,
Expand Down Expand Up @@ -83,7 +83,7 @@ fn is_qdtext(c: char) -> bool {
}
}

fn is_quoted_pair_payload(c: char) -> bool {
pub fn is_quoted_pair_payload(c: char) -> bool {
match c {
'\t' | ' ' => true,
'\x21'..='\x7E' => true,
Expand Down
140 changes: 140 additions & 0 deletions sxg_rs/src/http_parser/link.rs
@@ -0,0 +1,140 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use nom::{
IResult,
bytes::complete::take_while,
character::complete::char,
combinator::{map, opt},
multi::many0,
sequence::{delimited, pair, preceded, terminated, tuple},
};
use super::base::{
is_quoted_pair_payload,
is_tchar,
ows,
parameter_value,
token,
};

/// Represents an individual link directive i.e. an instance of `link-value`
/// from https://datatracker.ietf.org/doc/html/rfc8288#section-3.
/// Parameters with alternate character encodings (via RFC8187) are not
/// supported.
#[derive(Clone, Debug, PartialEq)]
pub struct Link<'a> {
    /// The URI-Reference that appeared between `<` and `>`, as written
    /// (no resolution against a base URL happens at parse time).
    pub uri: &'a str,
    /// Params in order of appearance; `None` means a valueless param
    /// (e.g. the `bar` in `</foo>;bar`).
    pub params: Vec<(&'a str, Option<String>)>,
}

fn quote(value: &str) -> Option<String> {
if value.chars().all(|c| is_tchar(c)) {
Some(value.into())
} else if value.chars().all(|c| is_quoted_pair_payload(c)) {
Some("\"".to_string() + &value.chars().map(|c: char| {
let mut quoted_pair = if c == '\\' || c == '"' {
"\\"
} else {
""
}.to_string();
quoted_pair.push(c);
quoted_pair
twifkak marked this conversation as resolved.
Show resolved Hide resolved
}).collect::<String>() + "\"")
} else {
None
}
}

impl<'a> Link<'a> {
    // Serializes into a `link-value` string suitable for a Link header.
    // Valueless params serialize as `;name`; valued params as `;name=value`
    // (quoted when necessary); params whose value cannot be represented
    // (see `quote`) are dropped.
    pub fn serialize(&self) -> String {
        let mut out = String::new();
        out.push('<');
        out.push_str(self.uri);
        out.push('>');
        for (k, v) in &self.params {
            match v {
                None => {
                    out.push(';');
                    out.push_str(k);
                }
                Some(v) => {
                    if let Some(quoted) = quote(v) {
                        out.push(';');
                        out.push_str(k);
                        out.push('=');
                        out.push_str(&quoted);
                    }
                }
            }
        }
        out
    }
}

// Scans the URI-Reference portion of a link-value.
// We don't need to fully parse the URI ref using nom. It would be
// sufficient to scan up until the closing delimiter '>' and then pass the result to the
// URL class for parsing and validation. For defense in depth, we only allow
// the characters specified in
// https://datatracker.ietf.org/doc/html/rfc3986#appendix-A.
fn uri_ref(input: &str) -> IResult<&str, &str> {
    take_while(|c: char|
        match c {
            // unreserved
            'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '.' | '_' | '~' => true,
            // gen-delims: RFC 3986 lists ":" "/" "?" "#" "[" "]" "@".
            // (The previous allowlist contained '|', which is not a URI
            // character per appendix A.)
            ':' | '/' | '?' | '#' | '[' | ']' | '@' => true,
            // sub-delims
            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => true,
            // pct-encoded
            '%' => true,
            _ => false,
        }
    )(input)
}

// Parses a single `link-param`: a token name, optionally followed by `=` and
// a value (token or quoted-string), with optional whitespace around `=`.
fn link_param<'a>(input: &'a str) -> IResult<&str, (&'a str, Option<String>)> {
    pair(terminated(token, ows),
        opt(preceded(pair(char('='), ows), parameter_value)))(input)
}

// Parses one `link-value`: a `<`-delimited URI-Reference followed by any
// number of `;`-separated link-params.
pub fn link(input: &str) -> IResult<&str, Link> {
    map(pair(delimited(char('<'), uri_ref, char('>')),
        many0(preceded(tuple((ows, char(';'), ows)), link_param))), |(uri, params)|
        Link{uri, params}
    )(input)
}

#[cfg(test)]
mod tests {
    use super::*;
    // Parsing: empty URIs, URI-internal delimiters, bare and valued params,
    // and quoted-string escape handling.
    #[test]
    fn parse() {
        assert_eq!(link("<>").unwrap(), ("", Link{uri: "", params: vec![]}));
        // ',' and ';' are allowed inside the <>-delimited URI.
        assert_eq!(link("</foo,bar;baz>").unwrap(), ("", Link{uri: "/foo,bar;baz", params: vec![]}));
        assert_eq!(link("</foo>;bar;baz=quux").unwrap(),
            ("", Link{uri: "/foo",
                params: vec![("bar", None), ("baz", Some("quux".into()))]}));
        // Quoted-pairs are unescaped during parsing.
        assert_eq!(link(r#"</foo>;bar="baz \\\"quux""#).unwrap(),
            ("", Link{uri: "/foo",
                params: vec![("bar", Some(r#"baz \"quux"#.into()))]}));
        // An unterminated quoted-string is reported as incomplete input.
        assert!(matches!(link(r#"</foo>;bar="baz \""#).unwrap_err(), nom::Err::Incomplete(_)));
    }
    // Serialization: token vs quoted-string values, escaping, and dropping
    // params whose value cannot be represented.
    #[test]
    fn serialize() {
        assert_eq!(Link{uri: "/foo", params: vec![("bar", None)]}.serialize(),
            "</foo>;bar");
        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some("baz".into()))]}.serialize(),
            "</foo>;bar=baz");
        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some("baz quux".into()))]}.serialize(),
            r#"</foo>;bar="baz quux""#);
        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some(r#"baz\"quux"#.into()))]}.serialize(),
            r#"</foo>;bar="baz\\\"quux""#);
        // \x7f can't appear in a quoted-string, so the param is dropped.
        assert_eq!(Link{uri: "/foo", params: vec![("bar", Some("\x7f".into()))]}.serialize(),
            "</foo>");
    }
}
13 changes: 9 additions & 4 deletions sxg_rs/src/http_parser/mod.rs
Expand Up @@ -16,6 +16,7 @@ mod accept;
mod base;
mod cache_control;
pub mod media_type;
pub mod link;

use nom::{
IResult,
Expand Down Expand Up @@ -50,14 +51,18 @@ pub fn parse_accept_header(input: &str) -> Result<Vec<accept::Accept>, String> {
parse_vec(input, accept::accept)
}

// Returns the freshness lifetime for a shared cache.
// Errors if the header fails to parse or specifies no explicit lifetime.
pub fn parse_cache_control_header(input: &str) -> Result<Duration, String> {
    let directives = parse_vec(input, cache_control::directive)?;
    cache_control::freshness_lifetime(directives).ok_or("Freshness lifetime is implicit".into())
}

// Parses a content-type header into its media type; the entire input must be
// consumed for the parse to succeed.
pub fn parse_content_type_header(input: &str) -> Result<media_type::MediaType, String> {
    complete(media_type::media_type)(input)
        .map(|(_, output)| output)
        .map_err(format_nom_err)
}

// Returns the freshness lifetime for a shared cache.
pub fn parse_cache_control_header(input: &str) -> Result<Duration, String> {
let directives = parse_vec(input, cache_control::directive)?;
cache_control::freshness_lifetime(directives).ok_or("Freshness lifetime is implicit".into())
pub fn parse_link_header(input: &str) -> Result<Vec<link::Link>, String> {
parse_vec(input, link::link)
}
2 changes: 1 addition & 1 deletion sxg_rs/src/lib.rs
Expand Up @@ -89,7 +89,7 @@ impl SxgWorker {
// 16384 is the max mice record size allowed by SXG spec.
// https://wicg.github.io/webpackage/draft-yasskin-http-origin-signed-responses.html#section-3.5-7.9.1
let (mice_digest, payload_body) = crate::mice::calculate(payload_body, 16384);
let signed_headers = payload_headers.get_signed_headers_bytes(status_code, &mice_digest);
let signed_headers = payload_headers.get_signed_headers_bytes(&fallback_base, status_code, &mice_digest);
let cert_url = cert_base.join(&format!("{}{}", &self.config.cert_url_dirname, &self.cert_basename()))
.map_err(|_| "Failed to parse cert_url_dirname")?;
let validity_url = fallback_base.join(&format!("{}{}", &self.config.validity_url_dirname, "validity"))
Expand Down