Skip to content

Commit

Permalink
Fix formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
greyblake committed Nov 9, 2018
1 parent a604c8c commit b26296e
Show file tree
Hide file tree
Showing 13 changed files with 343 additions and 254 deletions.
2 changes: 1 addition & 1 deletion benches/example.rs
@@ -1,7 +1,7 @@
#[macro_use]
extern crate bencher;
extern crate whatlang;
extern crate serde_json;
extern crate whatlang;

use bencher::Bencher;
use std::collections::HashMap;
Expand Down
42 changes: 30 additions & 12 deletions build.rs
@@ -1,16 +1,16 @@
extern crate csv;
extern crate skeptic;
extern crate serde_json;
extern crate serde;
extern crate serde_json;
extern crate skeptic;
#[macro_use]
extern crate serde_derive;
extern crate tera;

use std::io::{Write, BufReader, BufWriter};
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::Path;
use std::env;

const DATA_PATH: &'static str = "misc/data.json";
const SUPPORTED_LANG_PATH: &'static str = "misc/supported_languages.csv";
Expand Down Expand Up @@ -53,16 +53,21 @@ fn generate_source_files() {

fn load_data() -> (Vec<LangInfo>, HashMap<String, Vec<Lang>>) {
let data_file = BufReader::new(File::open(DATA_PATH).unwrap());
let mut lang_reader = csv::ReaderBuilder::new().flexible(true).from_path(SUPPORTED_LANG_PATH).unwrap();
let mut lang_reader = csv::ReaderBuilder::new()
.flexible(true)
.from_path(SUPPORTED_LANG_PATH)
.unwrap();

let mut lang_infos: Vec<LangInfo> = lang_reader.deserialize().map(Result::unwrap).collect();
lang_infos.sort_by(|left, right| left.code.cmp(&right.code));

let supported_lang_codes: HashMap<String, LangInfo> = lang_infos.iter()
let supported_lang_codes: HashMap<String, LangInfo> = lang_infos
.iter()
.map(|lang| (lang.code.clone(), lang.clone()))
.collect();

let lang_data: HashMap<String, HashMap<String, String>> = serde_json::from_reader(data_file).unwrap();
let lang_data: HashMap<String, HashMap<String, String>> =
serde_json::from_reader(data_file).unwrap();

let mut scripts: HashMap<String, Vec<Lang>> = HashMap::with_capacity(lang_data.len());
let mut all_langs: Vec<Lang> = Vec::new();
Expand All @@ -75,23 +80,36 @@ fn load_data() -> (Vec<LangInfo>, HashMap<String, Vec<Lang>>) {
let lang = Lang {
info: (*info).clone(),
script: script.clone(),
trigrams: trigrams.split('|').map(Into::into).collect()
trigrams: trigrams.split('|').map(Into::into).collect(),
};
if lang.trigrams.len() != TRIGRAM_COUNT {
panic!("Language {} has {} trigrams, instead of {}", code, lang.trigrams.len(), TRIGRAM_COUNT);
panic!(
"Language {} has {} trigrams, instead of {}",
code,
lang.trigrams.len(),
TRIGRAM_COUNT
);
}

all_langs.push(lang.clone());
scripts.entry(script.clone()).or_insert_with(Vec::new).push(lang);
scripts
.entry(script.clone())
.or_insert_with(Vec::new)
.push(lang);
}
}

(lang_infos, scripts)
}

fn render_lang_rs(buf: &mut BufWriter<File>, lang_infos: &[LangInfo], scripts: &HashMap<String, Vec<Lang>>) {
fn render_lang_rs(
buf: &mut BufWriter<File>,
lang_infos: &[LangInfo],
scripts: &HashMap<String, Vec<Lang>>,
) {
let mut tera = tera::Tera::default();
tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs")).unwrap();
tera.add_template_file(TEMPLATE_LANG_RS_PATH, Some("lang.rs"))
.unwrap();

let mut ctx = tera::Context::new();
ctx.insert("lang_infos", lang_infos);
Expand Down
5 changes: 3 additions & 2 deletions examples/cli.rs
Expand Up @@ -6,7 +6,9 @@ use whatlang::detect;
fn main() {
let mut text = String::new();
println!("Please enter a text:");
io::stdin().read_line(&mut text).expect("Failed to read line");
io::stdin()
.read_line(&mut text)
.expect("Failed to read line");

if let Some(info) = detect(&text) {
println!("Language: {}", info.lang());
Expand All @@ -16,4 +18,3 @@ fn main() {
println!("Cannot recognize a language :(");
}
}

102 changes: 61 additions & 41 deletions src/detect.rs
@@ -1,11 +1,11 @@
use hashbrown::HashMap;

use constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE};
use info::Info;
use lang::*;
use options::{List, Options};
use script::*;
use trigrams::*;
use info::Info;
use options::{Options, List};
use constants::{MAX_TRIGRAM_DISTANCE, MAX_TOTAL_DISTANCE};

/// Detect a language and a script by a given text.
///
Expand Down Expand Up @@ -39,56 +39,66 @@ pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> {

pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> {
detect_script(text).and_then(|script| {
detect_lang_based_on_script(text, options, script).map( |(lang, confidence)| {
Info { lang, script, confidence }
detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info {
lang,
script,
confidence,
})
})
}

fn detect_lang_based_on_script(text: &str, options: &Options, script : Script) -> Option<(Lang, f64)> {
fn detect_lang_based_on_script(
text: &str,
options: &Options,
script: Script,
) -> Option<(Lang, f64)> {
match script {
Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS),
Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
Script::Mandarin => Some((Lang::Cmn, 1.0)),
Script::Bengali => Some((Lang::Ben, 1.0)),
Script::Hangul => Some((Lang::Kor, 1.0)),
Script::Georgian => Some((Lang::Kat, 1.0)),
Script::Greek => Some((Lang::Ell, 1.0)),
Script::Kannada => Some((Lang::Kan, 1.0)),
Script::Tamil => Some((Lang::Tam, 1.0)),
Script::Thai => Some((Lang::Tha, 1.0)),
Script::Gujarati => Some((Lang::Guj, 1.0)),
Script::Gurmukhi => Some((Lang::Pan, 1.0)),
Script::Telugu => Some((Lang::Tel, 1.0)),
Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
Script::Mandarin => Some((Lang::Cmn, 1.0)),
Script::Bengali => Some((Lang::Ben, 1.0)),
Script::Hangul => Some((Lang::Kor, 1.0)),
Script::Georgian => Some((Lang::Kat, 1.0)),
Script::Greek => Some((Lang::Ell, 1.0)),
Script::Kannada => Some((Lang::Kan, 1.0)),
Script::Tamil => Some((Lang::Tam, 1.0)),
Script::Thai => Some((Lang::Tha, 1.0)),
Script::Gujarati => Some((Lang::Guj, 1.0)),
Script::Gurmukhi => Some((Lang::Pan, 1.0)),
Script::Telugu => Some((Lang::Tel, 1.0)),
Script::Malayalam => Some((Lang::Mal, 1.0)),
Script::Oriya => Some((Lang::Ori, 1.0)),
Script::Myanmar => Some((Lang::Mya, 1.0)),
Script::Sinhala => Some((Lang::Sin, 1.0)),
Script::Khmer => Some((Lang::Khm, 1.0)),
Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0))
Script::Oriya => Some((Lang::Ori, 1.0)),
Script::Myanmar => Some((Lang::Mya, 1.0)),
Script::Sinhala => Some((Lang::Sin, 1.0)),
Script::Khmer => Some((Lang::Khm, 1.0)),
Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)),
}
}

fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : LangProfileList) -> Option<(Lang, f64)> {
let mut lang_distances : Vec<(Lang, u32)> = vec![];
fn detect_lang_in_profiles(
text: &str,
options: &Options,
lang_profile_list: LangProfileList,
) -> Option<(Lang, f64)> {
let mut lang_distances: Vec<(Lang, u32)> = vec![];
let trigrams = get_trigrams_with_positions(text);

for &(ref lang, lang_trigrams) in lang_profile_list {
match options.list {
Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue,
Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue,
_ => {},
_ => {}
}
let dist = calculate_distance(lang_trigrams, &trigrams);
lang_distances.push(((*lang), dist));
}

// Sort languages by distance
lang_distances.sort_by_key(|key| key.1 );
lang_distances.sort_by_key(|key| key.1);

// Return None if lang_distances is empty
// Return the only language with is_reliable=true if there is only 1 item
Expand Down Expand Up @@ -131,23 +141,22 @@ fn detect_lang_in_profiles(text: &str, options: &Options, lang_profile_list : La
// Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense.
//
let confident_rate = (12.0 / trigrams.len() as f64) + 0.05;
let confidence =
if rate > confident_rate {
1.0
} else {
rate / confident_rate
};
let confidence = if rate > confident_rate {
1.0
} else {
rate / confident_rate
};

Some((lang_dist1.0, confidence))
}

fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
let mut total_dist = 0u32;

for (i, &trigram) in lang_trigrams.iter().enumerate() {
let dist = match text_trigrams.get(trigram) {
Some(&n) => (n as i32 - i as i32).abs() as u32,
None => MAX_TRIGRAM_DISTANCE
None => MAX_TRIGRAM_DISTANCE,
};
total_dist += dist;
}
Expand Down Expand Up @@ -186,7 +195,16 @@ mod tests {
assert_eq!(info.lang, Lang::Tgl);

// with blacklist
let blacklist = vec![Lang::Tgl, Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob, Lang::Ceb, Lang::Ilo];
let blacklist = vec![
Lang::Tgl,
Lang::Jav,
Lang::Nld,
Lang::Uzb,
Lang::Swe,
Lang::Nob,
Lang::Ceb,
Lang::Ilo,
];
let options = Options::new().set_blacklist(blacklist);
let output = detect_with_options(text, &options);
assert_eq!(output.is_some(), true);
Expand Down Expand Up @@ -224,7 +242,9 @@ mod tests {
let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap();
assert!(!info.is_reliable());

let info = detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm").unwrap();
let info =
detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm")
.unwrap();
assert!(!info.is_reliable());

// 1000 chars of randomly generated Cyrillic text
Expand Down
13 changes: 8 additions & 5 deletions src/detector.rs
@@ -1,9 +1,9 @@
use lang::Lang;
use script::Script;
use script::detect_script;
use detect;
use info::Info;
use lang::Lang;
use options::Options;
use detect;
use script::detect_script;
use script::Script;

/// Configurable structure that holds detection options and provides functions
/// to detect language and script.
Expand Down Expand Up @@ -72,7 +72,10 @@ mod tests {
#[test]
fn test_detect_script() {
    // Russian, Cyrillic
    assert_eq!(
        Detector::new().detect_script("Кириллица"),
        Some(Script::Cyrillic)
    );
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/info.rs
Expand Up @@ -8,7 +8,7 @@ const RELIABLE_CONFIDENCE_THRESHOLD: f64 = 0.8;
pub struct Info {
pub(crate) lang: Lang,
pub(crate) script: Script,
pub(crate) confidence: f64
pub(crate) confidence: f64,
}

impl Info {
Expand Down
18 changes: 9 additions & 9 deletions src/lib.rs
Expand Up @@ -32,21 +32,21 @@
//! assert_eq!(lang, Some(Lang::Eng));
extern crate hashbrown;

mod lang;
mod script;
mod info;
mod utils;
mod trigrams;
mod constants;
mod detect;
mod detector;
mod info;
mod lang;
mod options;
mod constants;
mod script;
mod trigrams;
mod utils;

pub use lang::Lang;
pub use script::Script;
pub use info::Info;
pub use detector::Detector;
pub use info::Info;
pub use lang::Lang;
pub use options::Options;
pub use script::Script;

pub use detect::detect;
pub use detect::detect_lang;
Expand Down
4 changes: 2 additions & 2 deletions src/options.rs
Expand Up @@ -3,13 +3,13 @@ use lang::Lang;
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum List {
White(Vec<Lang>),
Black(Vec<Lang>)
Black(Vec<Lang>),
}

/// Allows to customize behaviour of [Detector](struct.Detector.html).
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Options {
pub(crate) list: Option<List>
pub(crate) list: Option<List>,
}

impl Options {
Expand Down

0 comments on commit b26296e

Please sign in to comment.