Optimize latin languages detection #108

Merged
1 change: 1 addition & 0 deletions Cargo.toml
@@ -23,6 +23,7 @@ github-actions = { repository = "greyblake/whatlang-rs", workflow = "CI", branch

[dependencies]
hashbrown = "0.11.2"
once_cell = "1.10.0"
enum-map = { version = "1.1.1", optional = true }

[dev-dependencies]
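The new once_cell dependency supplies lazily initialized statics. A minimal sketch of the pattern, using the standard library's OnceLock (stable since Rust 1.70) as a dependency-free stand-in for once_cell::sync::Lazy:

```rust
use std::collections::HashMap;
use std::sync::OnceLock;

// once_cell::sync::Lazy computes a value on first access and caches it.
// The standard library's OnceLock expresses the same pattern without an
// external crate: the closure runs once, later calls reuse the result.
fn squares() -> &'static HashMap<u32, u32> {
    static CELL: OnceLock<HashMap<u32, u32>> = OnceLock::new();
    CELL.get_or_init(|| (0..10).map(|n| (n, n * n)).collect())
}

fn main() {
    assert_eq!(squares()[&3], 9);
    assert_eq!(squares().len(), 10);
}
```

With once_cell itself, the same thing is written as `static X: Lazy<...> = Lazy::new(|| ...)`, which is what this PR uses.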
4 changes: 3 additions & 1 deletion misc/alphabets/calculate_scores.rs.erb
@@ -1,3 +1,5 @@
use std::cmp::Reverse;

use super::Outcome;
use crate::utils::is_stop_char;
use crate::core::{LowercaseText, FilterList};
@@ -43,7 +45,7 @@ pub fn alphabet_calculate_scores(text: &str) -> Outcome {
}
}

raw_scores.sort_by(|a, b| b.1.cmp(&a.1));
raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
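The repeated sort_by to sort_unstable_by_key change can be checked in isolation: for distinct scores the two forms produce the same descending order (a minimal sketch with made-up scores):

```rust
use std::cmp::Reverse;

fn main() {
    // Same (lang, score) pairs, sorted two ways.
    let mut a = vec![("eng", 3usize), ("fra", 7), ("deu", 5)];
    let mut b = a.clone();

    // Old form: hand-written comparator, descending by score.
    a.sort_by(|x, y| y.1.cmp(&x.1));
    // New form: key extraction with Reverse, also descending.
    b.sort_unstable_by_key(|(_, score)| Reverse(*score));

    assert_eq!(a, b);
    assert_eq!(b, vec![("fra", 7), ("deu", 5), ("eng", 3)]);
}
```

The key-based form is shorter, and sort_unstable avoids the allocation a stable sort needs; the order of equal scores is simply unspecified.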
4 changes: 3 additions & 1 deletion src/alphabets/cyrillic.rs
@@ -1,3 +1,5 @@
use std::cmp::Reverse;

use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};
@@ -35,7 +37,7 @@ pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList)
}
}

raw_scores.sort_by(|a, b| b.1.cmp(&a.1));
raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
175 changes: 109 additions & 66 deletions src/alphabets/latin.rs
@@ -1,3 +1,8 @@
use std::cmp::Reverse;
use std::collections::HashMap;

use once_cell::sync::Lazy;

use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
@@ -41,88 +46,126 @@ const VIE: &str =
"abcdefghijklmnopqrstuvwxyzàáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ";
const ZUL: &str = "abcdefghijklmnopqrstuvwxyz";

fn get_lang_chars(lang: Lang) -> Vec<char> {
let alphabet = match lang {
Lang::Afr => AFR,
Lang::Aka => AKA,
Lang::Aze => AZE,
Lang::Cat => CAT,
Lang::Ces => CES,
Lang::Dan => DAN,
Lang::Deu => DEU,
Lang::Eng => ENG,
Lang::Epo => EPO,
Lang::Est => EST,
Lang::Fin => FIN,
Lang::Fra => FRA,
Lang::Hrv => HRV,
Lang::Hun => HUN,
Lang::Ind => IND,
Lang::Ita => ITA,
Lang::Jav => JAV,
Lang::Lat => LAT,
Lang::Lav => LAV,
Lang::Lit => LIT,
Lang::Nld => NLD,
Lang::Nob => NOB,
Lang::Pol => POL,
Lang::Por => POR,
Lang::Ron => RON,
Lang::Slk => SLK,
Lang::Slv => SLV,
Lang::Sna => SNA,
Lang::Spa => SPA,
Lang::Swe => SWE,
Lang::Tgl => TGL,
Lang::Tuk => TUK,
Lang::Tur => TUR,
Lang::Uzb => UZB,
Lang::Vie => VIE,
Lang::Zul => ZUL,

_ => panic!("No alphabet for {}", lang),
};
alphabet.chars().collect()
}
const LATIN_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Afr, AFR),
(Lang::Aka, AKA),
(Lang::Aze, AZE),
(Lang::Cat, CAT),
(Lang::Ces, CES),
(Lang::Dan, DAN),
(Lang::Deu, DEU),
(Lang::Eng, ENG),
(Lang::Epo, EPO),
(Lang::Est, EST),
(Lang::Fin, FIN),
(Lang::Fra, FRA),
(Lang::Hrv, HRV),
(Lang::Hun, HUN),
(Lang::Ind, IND),
(Lang::Ita, ITA),
(Lang::Jav, JAV),
(Lang::Lat, LAT),
(Lang::Lav, LAV),
(Lang::Lit, LIT),
(Lang::Nld, NLD),
(Lang::Nob, NOB),
(Lang::Pol, POL),
(Lang::Por, POR),
(Lang::Ron, RON),
(Lang::Slk, SLK),
(Lang::Slv, SLV),
(Lang::Sna, SNA),
(Lang::Spa, SPA),
(Lang::Swe, SWE),
(Lang::Tgl, TGL),
(Lang::Tuk, TUK),
(Lang::Tur, TUR),
(Lang::Uzb, UZB),
(Lang::Vie, VIE),
(Lang::Zul, ZUL),
];

/// Inverted map binding a character to a set of languages.
pub static ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> = Lazy::new(|| {
Comment on lines +88 to +89


Why not keep only one vec here? Is it slower?

Suggested change
/// Inverted map binding a character to a set of languages.
pub static ALPHABET_LANG_MAP: Lazy<(Vec<char>, Vec<Vec<Lang>>)> = Lazy::new(|| {
/// Inverted map binding a character to a set of languages.
pub static ALPHABET_LANG_MAP: Lazy<(Vec<(char, Vec<Lang>)>)> = Lazy::new(|| {

Contributor Author:

I don't use them at the same time.
chars is used in the first loop to compute char scores based on their occurrence in the text.
langs is used in the second loop to sum char scores into lang scores.
Keeping them merged would just add noisy tuples with anonymous variables everywhere.

In terms of computing time, there are no significant changes.

Owner:

It took me quite a while to understand why this data structure is the way it is.
The binding through position in the two vectors is not very obvious. Would you mind adding a comment on that? (In a separate PR; this one, I guess, will be merged today.)

let mut map = HashMap::new();

for (lang, alphabet) in LATIN_ALPHABETS {
for c in alphabet.chars() {
let entry = map.entry(c).or_insert_with(Vec::new);
entry.push(*lang);
}
}

let mut char_lang: Vec<_> = map.into_iter().collect();

char_lang.sort_unstable_by_key(|(c, _)| *c);

let mut chars = Vec::with_capacity(char_lang.len());
let mut langs = Vec::with_capacity(char_lang.len());
for (ch, languages) in char_lang {
chars.push(ch);
langs.push(languages);
}

(chars, langs)
});
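The position-based binding discussed above can be sketched in a self-contained form (with hypothetical string tags standing in for Lang): chars is sorted, so a binary_search hit yields an index that is also valid into langs.

```rust
use std::collections::HashMap;

// Build the inverted index as two vectors linked by position:
// chars[i] is a character, langs[i] is the set of languages whose
// alphabets contain it. chars is sorted, so binary_search on it
// gives an index usable directly into langs.
fn build_index(
    alphabets: &[(&'static str, &'static str)],
) -> (Vec<char>, Vec<Vec<&'static str>>) {
    let mut map: HashMap<char, Vec<&'static str>> = HashMap::new();
    for &(lang, alphabet) in alphabets {
        for c in alphabet.chars() {
            map.entry(c).or_default().push(lang);
        }
    }
    let mut pairs: Vec<_> = map.into_iter().collect();
    pairs.sort_unstable_by_key(|(c, _)| *c);
    pairs.into_iter().unzip()
}

fn main() {
    let (chars, langs) = build_index(&[("eng", "abc"), ("fra", "abé")]);
    // A position found in chars indexes the matching language set in langs.
    let pos = chars.binary_search(&'é').unwrap();
    assert_eq!(langs[pos], vec!["fra"]);
    let pos = chars.binary_search(&'a').unwrap();
    assert_eq!(langs[pos], vec!["eng", "fra"]);
}
```

This is only an illustration of the structure; the real map is built over the LATIN_ALPHABETS table and Lang values.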

pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
let mut raw_scores: Vec<(Lang, i32)> = Script::Latin
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| (l, 0i32))
.collect();
let (chars, langs) = &*ALPHABET_LANG_MAP;

let max_raw_score = text.chars().filter(|&ch| !is_stop_char(ch)).count();
// score of each character.
let mut char_scores = vec![0; chars.len()];
let mut max_raw_score = 0;
// iterate over the text and score characters.
for ch in text.chars() {
if is_stop_char(ch) {
continue;
}

max_raw_score += 1;

for (lang, score) in &mut raw_scores {
let alphabet = get_lang_chars(*lang);
if let Ok(position) = chars.binary_search(&ch) {
// add 2 here and subtract max_raw_score at the end,
// to keep scores in the interval -max_raw_score..max_raw_score
char_scores[position] += 2;
}
}

for ch in text.chars() {
if is_stop_char(ch) {
continue;
};
if alphabet.contains(&ch) {
*score += 1;
// score of each lang.
let mut lang_scores = vec![0; Lang::all().len()];
let mut common_score: usize = 0;
// iterate over scored characters to compute language scores.
for (position, char_score) in char_scores.into_iter().enumerate() {
if char_score > 0 {
let languages = &langs[position];
// if the current character is common to all languages, increment a common score
// instead of iterating over all language scores.
if languages.len() == LATIN_ALPHABETS.len() {
common_score += char_score;
} else {
*score -= 1;
for &lang in languages {
lang_scores[lang as usize] += char_score;
}
}
}
}

raw_scores.sort_by(|a, b| b.1.cmp(&a.1));

let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
.map(|(l, s)| {
let score = if s < 0 { 0usize } else { s as usize };
// remap languages with their scores.
let mut raw_scores: Vec<(Lang, usize)> = Script::Latin
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| {
let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
(l, score)
})
.collect();

raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

let mut normalized_scores = vec![];

for &(lang, raw_score) in &raw_scores {
for &(lang, raw_score) in raw_scores.iter() {
let normalized_score = raw_score as f64 / max_raw_score as f64;
normalized_scores.push((lang, normalized_score));
}
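The arithmetic behind the "+2 then subtract max_raw_score" trick can be verified in isolation: for m alphabet hits out of n non-stop characters, 2*m - n equals the old +1-per-hit/-1-per-miss score m - (n - m). A sketch with made-up counts:

```rust
fn main() {
    // Old scheme: +1 for a char in the alphabet, -1 for a miss.
    // New scheme: +2 per hit, then subtract the total character count
    // once at the end. For m hits out of n non-stop characters:
    // 2*m - n == m - (n - m), so both land in -n..=n.
    let n = 10usize; // non-stop characters in the text
    let m = 7usize;  // characters found in a language's alphabet
    let old_score = m as i32 - (n - m) as i32;
    let new_score = 2 * m as i32 - n as i32;
    assert_eq!(old_score, new_score);
    // The PR stays in unsigned arithmetic and clamps negatives at zero.
    assert_eq!((2 * m).saturating_sub(n), 4);
}
```

This is why the hot loop only ever adds, and the subtraction happens once per language instead of once per character.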
2 changes: 1 addition & 1 deletion src/combined/mod.rs
@@ -78,7 +78,7 @@ pub fn raw_detect(iquery: &InternalQuery) -> RawOutcome {
scores.push((lang, score));
}

scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));
scores.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less));

RawOutcome {
scores,
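This sort keeps the comparator form because the scores here are f64, which does not implement Ord, so the Reverse-key shortcut used elsewhere in the PR is unavailable. A minimal sketch of the options (total_cmp requires Rust 1.62+):

```rust
fn main() {
    let mut scores = vec![("eng", 0.8_f64), ("fra", 0.95), ("deu", 0.5)];

    // partial_cmp needs a fallback because NaN has no ordering.
    scores.sort_unstable_by(|a, b| {
        b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Less)
    });
    assert_eq!(scores[0].0, "fra");

    // total_cmp provides a total order over floats, NaN included.
    scores.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));
    assert_eq!(scores[0].0, "fra");
}
```

total_cmp would be the tidier choice if the crate's minimum supported Rust version allows it.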
4 changes: 3 additions & 1 deletion src/scripts/detect.rs
@@ -1,3 +1,5 @@
use std::cmp::Reverse;

use super::script::Script;
use crate::utils::is_stop_char;

@@ -24,7 +26,7 @@ pub struct RawScriptInfo {

impl RawScriptInfo {
fn new(mut counters: Vec<(Script, usize)>) -> Self {
counters.sort_by(|a, b| b.1.cmp(&a.1));
counters.sort_unstable_by_key(|(_, score)| Reverse(*score));
Self { counters }
}

2 changes: 1 addition & 1 deletion src/trigrams/detection.rs
@@ -80,7 +80,7 @@ fn calculate_scores_in_profiles(
}

// Sort languages by distance
lang_distances.sort_by_key(|key| key.1);
lang_distances.sort_unstable_by_key(|(_, dist)| *dist);

let max_dist = unique_trigrams_count as u32 * MAX_TRIGRAM_DISTANCE;

2 changes: 1 addition & 1 deletion src/trigrams/utils.rs
@@ -35,7 +35,7 @@ fn trigram_occurances_to_positions(
.into_iter()
.map(|(trigram, count)| (count, trigram))
.collect();
count_vec.sort_by(|a, b| b.cmp(a));
count_vec.sort_unstable_by(|a, b| b.cmp(a));


by_key ?

Contributor Author:

This one I don't know.
@greyblake, could this sort be equivalent to the code below?

Suggested change
count_vec.sort_unstable_by(|a, b| b.cmp(a));
count_vec.sort_unstable_by_key(|(count, _trigram)| Reverse(*count));

Or is the trigram sorting important?

We could rewrite the function a bit, like:

#[allow(clippy::unnecessary_sort_by)]
fn trigram_occurances_to_positions(
    trigram_occurances: HashMap<Trigram, u32>,
) -> HashMap<Trigram, u32> {
-   // Sort in descending order by number of occurrences and trigrams
+   // Sort in ascending order by number of occurrences and trigrams
    let mut count_vec: Vec<_> = trigram_occurances
        .into_iter()
        .map(|(trigram, count)| (count, trigram))
        .collect();
-   count_vec.sort_unstable_by(|a, b| b.cmp(a));
+   count_vec.sort_unstable();

    count_vec
        .into_iter()
+       .rev() // take starting from the highest count
        .take(TEXT_TRIGRAMS_SIZE) // we're interested only in the first 600 (2 * MAX_TRIGRAM_DISTANCE)
        .enumerate()
        .map(|(i, (_, trigram))| (trigram, i as u32))
        .collect()
}

I'll probably try, in another PR, collecting into a BinaryHeap instead of a Vec to sort the HashMap 🤔

We may just keep the current implementation here.


It will surely be faster to use a vec and sort it than to use a binary heap here; the number of comparisons will be greatly reduced. I like the function rewrite you propose ✅

Owner:

"Could this sort be equivalent to the code below?"

I'd say rather no than yes.
Back in the day, there were some really hard-to-reproduce tests that would fail non-deterministically.
It was fixed in 30d142d, but now, trying to recall exactly what was there, I blame myself for not documenting it enough :D
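The owner's caution can be demonstrated: sorting the full (count, trigram) tuple breaks ties deterministically by trigram, whereas keying on Reverse(count) alone leaves tied entries in whatever order the unstable sort happens to produce. A sketch with string stand-ins for trigrams:

```rust
use std::cmp::Reverse;

fn main() {
    let pairs = vec![(2_u32, "abc"), (2, "abd"), (3, "xyz")];

    // Full-tuple comparison: descending by count, ties broken by trigram.
    let mut by_tuple = pairs.clone();
    by_tuple.sort_unstable_by(|a, b| b.cmp(a));
    assert_eq!(by_tuple, vec![(3, "xyz"), (2, "abd"), (2, "abc")]);

    // Keying on the count alone: the two tied entries may come out in
    // either order, since sort_unstable gives no guarantee on ties.
    let mut by_count = pairs.clone();
    by_count.sort_unstable_by_key(|&(count, _)| Reverse(count));
    assert_eq!(by_count[0], (3, "xyz"));
}
```

Since the later `.take(TEXT_TRIGRAMS_SIZE)` cuts the list at a fixed length, trigrams tied at the cutoff could differ between runs without the tuple tie-break, which matches the non-deterministic test failures mentioned above.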


count_vec
.into_iter()