From 2417e603699ec430b6d1ee1b28240eb7791fd8cc Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Mon, 26 May 2025 15:22:30 +0200 Subject: [PATCH 1/3] allow country TLDs in scheme-less links --- CHANGELOG.md | 2 + .../link_url/allowed_tlds/country_tlds.rs | 257 ++++++++++++++++++ src/parser/link_url/allowed_tlds/mod.rs | 34 +++ src/parser/link_url/mod.rs | 1 + src/parser/link_url/parse_link.rs | 25 +- src/parser/parse_from_text/base_parsers.rs | 1 + 6 files changed, 307 insertions(+), 13 deletions(-) create mode 100644 src/parser/link_url/allowed_tlds/country_tlds.rs create mode 100644 src/parser/link_url/allowed_tlds/mod.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 31d0e42..6c2384e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## Unreleased +- allow country TLDs in scheme-less links + ## 0.14.0 - Bug fixes and scheme-less links - Parse scheme-less links for some TLDs diff --git a/src/parser/link_url/allowed_tlds/country_tlds.rs b/src/parser/link_url/allowed_tlds/country_tlds.rs new file mode 100644 index 0000000..348c167 --- /dev/null +++ b/src/parser/link_url/allowed_tlds/country_tlds.rs @@ -0,0 +1,257 @@ + +// extracted from first table on https://de.wikipedia.org/wiki/Liste_länderspezifischer_Top-Level-Domains +pub const COUNTRY_TLDS: [&str;254] = [ +"ac", +"ad", +"ae", +"af", +"ag", +"ai", +"al", +"am", +"ao", +"aq", +"ar", +"as", +"at", +"au", +"aw", +"ax", +"az", +"ba", +"bb", +"bd", +"be", +"bf", +"bg", +"bh", +"bi", +"bj", +"bl", +"bm", +"bn", +"bo", +"bq", +"br", +"bs", +"bt", +"bv", +"bw", +"by", +"bz", +"ca", +"cc", +"cd", +"cf", +"cg", +"ch", +"ci", +"ck", +"cl", +"cm", +"cn", +"co", +"cr", +"cu", +"cv", +"cw", +"cx", +"cy", +"cz", +"de", +"dj", +"dk", +"dm", +"do", +"dz", +"ec", +"ee", +"eg", +"eh", +"er", +"es", +"et", +"eu", +"fi", +"fj", +"fk", +"fm", +"fo", +"fr", +"ga", +"gb", +"gd", +"ge", +"gf", +"gg", +"gh", +"gi", +"gl", +"gm", +"gn", +"gp", +"gq", +"gr", +"gs", +"gt", +"gu", +"gw", +"gy", +"hk", +"hm", +"hn", +"hr", +"ht", +"hu", +"id", +"ie", +"il", +"im", +"in", +"io", +"iq", +"ir", +"is", +"it", +"je", +"jm", +"jo", +"jp", +"ke", +"kg", +"kh", +"ki", +"km", +"kn", +"kp", +"kr", +"kw", +"ky", +"kz", +"la", +"lb", +"lc", +"li", +"lk", +"lr", +"ls", +"lt", +"lu", +"lv", +"ly", +"ma", +"mc", +"md", +"me", +"mf", +"mg", +"mh", +"mk", +"ml", +"mm", +"mn", +"mo", +"mp", +"mq", +"mr", +"ms", +"mt", +"mu", +"mv", +"mw", +"mx", +"my", +"mz", +"na", +"nc", +"ne", +"nf", +"ng", +"ni", +"nl", +"no", +"np", +"nr", +"nu", +"nz", +"om", +"pa", +"pe", +"pf", +"pg", +"ph", +"pk", +"pl", +"pm", +"pn", +"pr", +"ps", +"pt", +"pw", +"py", +"qa", +"re", +"ro", +"rs", +"ru", +"rw", +"sa", +"sb", +"sc", +"sd", +"se", +"sg", +"sh", +"si", +"sj", +"sk", +"sl", +"sm", +"sn", +"so", +"sr", +"ss", +"st", +"su", +"sv", +"sx", +"sy", +"sz", +"tc", +"td", +"tf", +"tg", +"th", +"tj", +"tk", +"tl", +"tm", +"tn", +"to", +"tp", +"tr", +"tt", +"tv", +"tw", +"tz", +"ua", +"ug", +"uk", +"us", +"uy", +"uz", +"va", +"vc", +"ve", +"vg", +"vi", +"vn", +"vu", +"wf", +"ws", +"ye", +"yt", +"za", +"zm", +"zr", +"zw"]; \ No newline at end of file diff --git a/src/parser/link_url/allowed_tlds/mod.rs b/src/parser/link_url/allowed_tlds/mod.rs new file mode 100644 index 0000000..188b32c --- /dev/null +++ b/src/parser/link_url/allowed_tlds/mod.rs @@ -0,0 +1,34 @@ +mod country_tlds; + +const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[ + // originals from RFC920 + net + "com", "org", "net", "edu", "gov", "mil", // for deltachat + "chat", +]; + +pub fn check_if_tld_is_allowed(tld: &str) -> bool { + if ALLOWED_TOP_LEVEL_DOMAINS.iter().any(|item|*item == tld) { + true + } else { country_tlds::COUNTRY_TLDS.binary_search(&tld).is_ok() } +} + +#[cfg(test)] +mod test { + use crate::parser::link_url::allowed_tlds::check_if_tld_is_allowed; + + #[test] + fn test_check_tld() { + assert!(check_if_tld_is_allowed("chat")); + assert!(check_if_tld_is_allowed("com")); + + assert!(check_if_tld_is_allowed("de")); + assert!(check_if_tld_is_allowed("at")); + assert!(check_if_tld_is_allowed("uk")); + assert!(check_if_tld_is_allowed("fr")); + } + + #[test] + fn test_check_tld_not_allowed() { + assert!(!check_if_tld_is_allowed("doesnotexist")); + } +} \ No newline at end of file diff --git a/src/parser/link_url/mod.rs b/src/parser/link_url/mod.rs index f5062b9..4af8593 100644 --- a/src/parser/link_url/mod.rs +++ b/src/parser/link_url/mod.rs @@ -1,3 +1,4 @@ +mod allowed_tlds; mod ip; mod parenthesis_counter; mod parse_link; diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 61e2ed2..7c860e3 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -23,8 +23,7 @@ use crate::parser::{ }; use super::{ - parenthesis_counter::count_chars_in_complete_parenthesis, - punycode_warning::get_puny_code_warning, + allowed_tlds::check_if_tld_is_allowed, parenthesis_counter::count_chars_in_complete_parenthesis, punycode_warning::get_puny_code_warning }; /// determines which generic schemes (without '://') get linkifyed @@ -46,13 +45,7 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool { ) } -const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[ - // originals from RFC920 + net - ".com", ".org", ".net", ".edu", ".gov", ".mil", - // for deltachat - ".chat", - // !todo country codes here next -]; + // These ranges have been extracted from RFC3987, Page 8. const UCSCHAR_RANGES: [RangeInclusive; 17] = [ @@ -294,10 +287,16 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { // now with host, if we dont have a scheme we need to check it for TLD if scheme.is_empty() { - ALLOWED_TOP_LEVEL_DOMAINS - .iter() - .find(|&&tld| host.ends_with(tld)) - .ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLink))?; + if !host.contains('.') { + return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink)); + } + + let tld = host.split('.').last() + .ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLinkNoTLD))?; + + if !check_if_tld_is_allowed(tld) { + return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink)); + } } let (input, path) = opt(alt(( diff --git a/src/parser/parse_from_text/base_parsers.rs b/src/parser/parse_from_text/base_parsers.rs index e5cb491..817d0ac 100644 --- a/src/parser/parse_from_text/base_parsers.rs +++ b/src/parser/parse_from_text/base_parsers.rs @@ -18,6 +18,7 @@ pub enum CustomError { Nom(I, ErrorKind), InvalidEmail, InvalidLink, + InvalidLinkNoTLD, UnexpectedContent, PrecedingWhitespaceMissing, OptionIsUnexpectedNone, From c9ba8b2c3d52da9389713943fc7d9cbaf2e609c5 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Mon, 26 May 2025 15:25:26 +0200 Subject: [PATCH 2/3] fix fmt --- .../link_url/allowed_tlds/country_tlds.rs | 274 ++---------------- src/parser/link_url/allowed_tlds/mod.rs | 8 +- src/parser/link_url/parse_link.rs | 14 +- 3 files changed, 31 insertions(+), 265 deletions(-) diff --git a/src/parser/link_url/allowed_tlds/country_tlds.rs b/src/parser/link_url/allowed_tlds/country_tlds.rs index 348c167..475afe2 100644 --- a/src/parser/link_url/allowed_tlds/country_tlds.rs +++ b/src/parser/link_url/allowed_tlds/country_tlds.rs @@ -1,257 +1,19 @@ - // extracted from first table on https://de.wikipedia.org/wiki/Liste_länderspezifischer_Top-Level-Domains -pub const COUNTRY_TLDS: [&str;254] = [ -"ac", -"ad", -"ae", -"af", -"ag", -"ai", -"al", -"am", -"ao", -"aq", -"ar", -"as", -"at", -"au", -"aw", -"ax", -"az", -"ba", -"bb", -"bd", -"be", -"bf", -"bg", -"bh", -"bi", -"bj", -"bl", -"bm", -"bn", -"bo", -"bq", -"br", -"bs", -"bt", -"bv", -"bw", -"by", -"bz", -"ca", -"cc", -"cd", -"cf", -"cg", -"ch", -"ci", -"ck", -"cl", -"cm", -"cn", -"co", -"cr", -"cu", -"cv", -"cw", -"cx", -"cy", -"cz", -"de", -"dj", -"dk", -"dm", -"do", -"dz", -"ec", -"ee", -"eg", -"eh", -"er", -"es", -"et", -"eu", -"fi", -"fj", -"fk", -"fm", -"fo", -"fr", -"ga", -"gb", -"gd", -"ge", -"gf", -"gg", -"gh", -"gi", -"gl", -"gm", -"gn", -"gp", -"gq", -"gr", -"gs", -"gt", -"gu", -"gw", -"gy", -"hk", -"hm", -"hn", -"hr", -"ht", -"hu", -"id", -"ie", -"il", -"im", -"in", -"io", -"iq", -"ir", -"is", -"it", -"je", -"jm", -"jo", -"jp", -"ke", -"kg", -"kh", -"ki", -"km", -"kn", -"kp", -"kr", -"kw", -"ky", -"kz", -"la", -"lb", -"lc", -"li", -"lk", -"lr", -"ls", -"lt", -"lu", -"lv", -"ly", -"ma", -"mc", -"md", -"me", -"mf", -"mg", -"mh", -"mk", -"ml", -"mm", -"mn", -"mo", -"mp", -"mq", -"mr", -"ms", -"mt", -"mu", -"mv", -"mw", -"mx", -"my", -"mz", -"na", -"nc", -"ne", -"nf", -"ng", -"ni", -"nl", -"no", -"np", -"nr", -"nu", -"nz", -"om", -"pa", -"pe", -"pf", -"pg", -"ph", -"pk", -"pl", -"pm", -"pn", -"pr", -"ps", -"pt", -"pw", -"py", -"qa", -"re", -"ro", -"rs", -"ru", -"rw", -"sa", -"sb", -"sc", -"sd", -"se", -"sg", -"sh", -"si", -"sj", -"sk", -"sl", -"sm", -"sn", -"so", -"sr", -"ss", -"st", -"su", -"sv", -"sx", -"sy", -"sz", -"tc", -"td", -"tf", -"tg", -"th", -"tj", -"tk", -"tl", -"tm", -"tn", -"to", -"tp", -"tr", -"tt", -"tv", -"tw", -"tz", -"ua", -"ug", -"uk", -"us", -"uy", -"uz", -"va", -"vc", -"ve", -"vg", -"vi", -"vn", -"vu", -"wf", -"ws", -"ye", -"yt", -"za", -"zm", -"zr", -"zw"]; \ No newline at end of file +pub const COUNTRY_TLDS: [&str; 254] = [ + "ac", "ad", "ae", "af", "ag", "ai", "al", "am", "ao", "aq", "ar", "as", "at", "au", "aw", "ax", + "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bl", "bm", "bn", "bo", "bq", "br", + "bs", "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", + "cn", "co", "cr", "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", + "ee", "eg", "eh", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd", + "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy", + "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", + "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz", "la", + "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mf", "mg", + "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", + "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe", + "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", + "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", + "sr", "ss", "st", "su", "sv", "sx", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", + "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", + "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "za", "zm", "zr", "zw", +]; diff --git a/src/parser/link_url/allowed_tlds/mod.rs b/src/parser/link_url/allowed_tlds/mod.rs index 188b32c..e9ad60c 100644 --- a/src/parser/link_url/allowed_tlds/mod.rs +++ b/src/parser/link_url/allowed_tlds/mod.rs @@ -7,9 +7,11 @@ const ALLOWED_TOP_LEVEL_DOMAINS: &[&str] = &[ ]; pub fn check_if_tld_is_allowed(tld: &str) -> bool { - if ALLOWED_TOP_LEVEL_DOMAINS.iter().any(|item|*item == tld) { + if ALLOWED_TOP_LEVEL_DOMAINS.iter().any(|item| *item == tld) { true - } else { country_tlds::COUNTRY_TLDS.binary_search(&tld).is_ok() } + } else { + country_tlds::COUNTRY_TLDS.binary_search(&tld).is_ok() + } } #[cfg(test)] @@ -31,4 +33,4 @@ mod test { fn test_check_tld_not_allowed() { assert!(!check_if_tld_is_allowed("doesnotexist")); } -} \ No newline at end of file +} diff --git a/src/parser/link_url/parse_link.rs b/src/parser/link_url/parse_link.rs index 7c860e3..d9d4d97 100644 --- a/src/parser/link_url/parse_link.rs +++ b/src/parser/link_url/parse_link.rs @@ -23,7 +23,9 @@ use crate::parser::{ }; use super::{ - allowed_tlds::check_if_tld_is_allowed, parenthesis_counter::count_chars_in_complete_parenthesis, punycode_warning::get_puny_code_warning + allowed_tlds::check_if_tld_is_allowed, + parenthesis_counter::count_chars_in_complete_parenthesis, + punycode_warning::get_puny_code_warning, }; /// determines which generic schemes (without '://') get linkifyed @@ -45,8 +47,6 @@ fn is_allowed_generic_scheme(scheme: &str) -> bool { ) } - - // These ranges have been extracted from RFC3987, Page 8. const UCSCHAR_RANGES: [RangeInclusive; 17] = [ 0xa0..=0xd7ff, @@ -291,9 +291,11 @@ fn parse_iri(input: &str) -> IResult<&str, LinkDestination, CustomError<&str>> { return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink)); } - let tld = host.split('.').last() - .ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLinkNoTLD))?; - + let tld = host + .split('.') + .last() + .ok_or(nom::Err::Failure(CustomError::<&str>::InvalidLinkNoTLD))?; + if !check_if_tld_is_allowed(tld) { return Err(nom::Err::Failure(CustomError::<&str>::InvalidLink)); } From 12a0f1f2d70a165bbe491399cfc9440e95aaeb08 Mon Sep 17 00:00:00 2001 From: Simon Laux Date: Mon, 26 May 2025 15:43:01 +0200 Subject: [PATCH 3/3] fix manual ts types --- message_parser_wasm/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/message_parser_wasm/src/lib.rs b/message_parser_wasm/src/lib.rs index 99b9c2a..6aa8a16 100644 --- a/message_parser_wasm/src/lib.rs +++ b/message_parser_wasm/src/lib.rs @@ -41,6 +41,7 @@ export type LinkDestination = { target: string; hostname: null | string; punycode: null | PunycodeWarning; + scheme: null | string; }; export type ParsedElement = | { t: "Text"; c: string }