mod country_tlds {
    // extracted from first table on https://de.wikipedia.org/wiki/Liste_länderspezifischer_Top-Level-Domains
    /// Country-code top-level domains, lowercase and without a leading dot.
    ///
    /// The entries MUST stay in ascending order with no duplicates: lookups
    /// go through `binary_search`, which gives unspecified results on an
    /// unsorted slice.
    pub const COUNTRY_TLDS: [&str; 254] = [
        "ac", "ad", "ae", "af", "ag", "ai", "al", "am", "ao", "aq", "ar", "as", "at", "au", "aw",
        "ax", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bl", "bm", "bn", "bo",
        "bq", "br", "bs", "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci",
        "ck", "cl", "cm", "cn", "co", "cr", "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk",
        "dm", "do", "dz", "ec", "ee", "eg", "eh", "er", "es", "et", "eu", "fi", "fj", "fk", "fm",
        "fo", "fr", "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq",
        "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il",
        "im", "in", "io", "iq", "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki",
        "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt",
        "lu", "lv", "ly", "ma", "mc", "md", "me", "mf", "mg", "mh", "mk", "ml", "mm", "mn", "mo",
        "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc", "ne", "nf",
        "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk",
        "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru", "rw", "sa",
        "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "ss",
        "st", "su", "sv", "sx", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm",
        "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va",
        "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "za", "zm", "zr", "zw",
    ];
}

/// Generic top-level domains accepted for scheme-less links, lowercase and
/// without a leading dot.
const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[
    // originals from RFC920 + net
    "com", "org", "net", "edu", "gov", "mil", // for deltachat
    "chat",
];

/// Returns `true` when `tld` is one of the top-level domains accepted for
/// scheme-less links.
///
/// `tld` is the last label of a host name, without the leading dot (`"com"`,
/// not `".com"`). The comparison ignores ASCII case, because host names are
/// case-insensitive: `"COM"` and `"De"` are accepted just like `"com"` and
/// `"de"`. Anything else, including the empty string, yields `false`.
pub fn check_if_tld_is_allowed(tld: &str) -> bool {
    // Both tables are stored in lowercase. Only allocate a lowered copy when
    // the input actually contains an uppercase ASCII letter.
    let lowered;
    let tld = if tld.bytes().any(|byte| byte.is_ascii_uppercase()) {
        lowered = tld.to_ascii_lowercase();
        lowered.as_str()
    } else {
        tld
    };

    // The generic list is tiny, so a linear scan is fine. The country list is
    // sorted, which lets us use a binary search.
    ALLOWED_TOP_LEVEL_DOMAINS.contains(&tld)
        || country_tlds::COUNTRY_TLDS.binary_search(&tld).is_ok()
}

#[cfg(test)]
mod test {
    use super::check_if_tld_is_allowed;
    use super::country_tlds::COUNTRY_TLDS;

    #[test]
    fn test_check_tld() {
        assert!(check_if_tld_is_allowed("chat"));
        assert!(check_if_tld_is_allowed("com"));

        assert!(check_if_tld_is_allowed("de"));
        assert!(check_if_tld_is_allowed("at"));
        assert!(check_if_tld_is_allowed("uk"));
        assert!(check_if_tld_is_allowed("fr"));
    }

    #[test]
    fn test_check_tld_ignores_ascii_case() {
        assert!(check_if_tld_is_allowed("COM"));
        assert!(check_if_tld_is_allowed("Chat"));
        assert!(check_if_tld_is_allowed("DE"));
        assert!(check_if_tld_is_allowed("Fr"));
    }

    #[test]
    fn test_check_tld_not_allowed() {
        assert!(!check_if_tld_is_allowed("doesnotexist"));
        assert!(!check_if_tld_is_allowed(""));
        assert!(!check_if_tld_is_allowed(".com"));
    }

    #[test]
    fn test_country_tlds_are_sorted_and_unique() {
        // `binary_search` is only correct on a strictly ascending table.
        assert!(COUNTRY_TLDS.windows(2).all(|pair| pair[0] < pair[1]));
    }
}
a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 61e2ed2..d9d4d97 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -23,6 +23,7 @@ use crate::parser::{ }; use super::{ + allowed_tlds::check_if_tld_is_allowed, parenthesis_counter::count_chars_in_complete_parenthesis, punycode_warning::get_puny_code_warning, }; @@ -46,14 +47,6 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool { ) } -const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[ - // originals from RFC920 + net - ".com", ".org", ".net", ".edu", ".gov", ".mil", - // for deltachat - ".chat", - // !todo country codes here next -]; - // These ranges have been extracted from RFC3987, Page 8. const UCSCHAR_RANGES: [RangeInclusive; 17] = [ 0xa0..=0xd7ff, @@ -294,10 +287,18 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { // now with host, if we dont have a scheme we need to check it for TLD if scheme.is_empty() { - ALLOWED_TOP_LEVEL_DOMAINS - .iter() - .find(|&&tld| host.ends_with(tld)) - .ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLink))?; + if !host.contains('.') { + return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink)); + } + + let tld = host + .split('.') + .last() + .ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLinkNoTLD))?; + + if !check_if_tld_is_allowed(tld) { + return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink)); + } } let (input, path) = opt(alt(( diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index e5cb491..817d0ac 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -18,6 +18,7 @@ pub enum CustomError { Nom(I, ErrorKind), InvalidEmail, InvalidLink, + InvalidLinkNoTLD, UnexpectedContent, PrecedingWhitespaceMissing, OptionIsUnexpectedNone,