Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a builder for system dictionary #74

Merged
merged 4 commits into from
Oct 17, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions prepare/src/system.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::io::BufWriter;
use std::path::PathBuf;
use std::time::Instant;

use vibrato::dictionary::Dictionary;
use vibrato::dictionary::SystemDictionaryBuilder;

use clap::{CommandFactory, ErrorKind, Parser};

Expand Down Expand Up @@ -53,7 +53,7 @@ fn main() -> Result<(), Box<dyn Error>> {
eprintln!("Compiling the system dictionary...");
let start = Instant::now();
let dict = if let Some(matrix_in) = args.matrix_in {
Dictionary::from_readers(
SystemDictionaryBuilder::from_readers(
File::open(args.lexicon_in)?,
File::open(matrix_in)?,
File::open(args.char_in)?,
Expand All @@ -64,7 +64,7 @@ fn main() -> Result<(), Box<dyn Error>> {
args.bigram_left_in,
args.bigram_cost_in,
) {
Dictionary::from_readers_with_bigram_info(
SystemDictionaryBuilder::from_readers_with_bigram_info(
File::open(args.lexicon_in)?,
File::open(bigram_right_in)?,
File::open(bigram_left_in)?,
Expand Down
68 changes: 66 additions & 2 deletions vibrato/src/dictionary.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//! Dictionary for tokenization.
pub(crate) mod builder;
pub mod builder;
pub(crate) mod character;
pub(crate) mod connector;
pub(crate) mod lexicon;
Expand All @@ -18,7 +18,9 @@ use crate::dictionary::lexicon::Lexicon;
use crate::dictionary::mapper::ConnIdMapper;
use crate::dictionary::unknown::UnkHandler;
use crate::dictionary::word_idx::WordIdx;
use crate::errors::Result;
use crate::errors::{Result, VibratoError};

pub use crate::dictionary::builder::SystemDictionaryBuilder;

pub(crate) use crate::dictionary::lexicon::WordParam;

Expand Down Expand Up @@ -165,4 +167,66 @@ impl Dictionary {
need_check: false,
})
}

/// Resets the user dictionary from a reader.
///
/// # Arguments
///
/// - `user_lexicon_rdr`: A reader of a lexicon file `*.csv` in the MeCab format.
///   If `None`, clears the current user dictionary.
///
/// # Errors
///
/// [`VibratoError`] is returned when
///  - an input format is invalid, or
///  - the user lexicon includes invalid connection ids.
pub fn user_lexicon_from_reader<R>(mut self, user_lexicon_rdr: Option<R>) -> Result<Self>
where
    R: Read,
{
    if let Some(user_lexicon_rdr) = user_lexicon_rdr {
        let mut user_lexicon = Lexicon::from_reader(user_lexicon_rdr, LexType::User)?;
        // If connection ids were already remapped, apply the same mapping to
        // the new entries before they are verified against the connector.
        if let Some(mapper) = self.data.mapper.as_ref() {
            user_lexicon.do_mapping(mapper);
        }
        if !user_lexicon.verify(self.connector()) {
            return Err(VibratoError::invalid_argument(
                "user_lexicon_rdr",
                "user_lexicon_rdr includes invalid connection ids.",
            ));
        }
        self.data.user_lexicon = Some(user_lexicon);
    } else {
        self.data.user_lexicon = None;
    }
    Ok(self)
}

/// Remaps the connection ids used throughout the dictionary.
///
/// The mapping is applied to the system lexicon, the user lexicon (if one
/// is set), the connector, and the unknown-word handler.
///
/// # Arguments
///
/// - `lmap/rmap`: Iterators over new left/right ids, where the `i`-th item
///   (1-origin) is the new id mapped from id `i`.
///
/// # Errors
///
/// [`VibratoError`] is returned when
///  - a new id of [`BOS_EOS_CONNECTION_ID`](crate::common::BOS_EOS_CONNECTION_ID)
///    is included,
///  - new ids are duplicated, or
///  - the set of new ids is not the same as that of old ids.
pub fn mapping_from_iter<L, R>(mut self, lmap: L, rmap: R) -> Result<Self>
where
    L: IntoIterator<Item = u16>,
    R: IntoIterator<Item = u16>,
{
    let conn_id_mapper = ConnIdMapper::from_iter(lmap, rmap)?;

    let data = &mut self.data;
    data.system_lexicon.do_mapping(&conn_id_mapper);
    if let Some(lexicon) = &mut data.user_lexicon {
        lexicon.do_mapping(&conn_id_mapper);
    }
    data.connector.do_mapping(&conn_id_mapper);
    data.unk_handler.do_mapping(&conn_id_mapper);

    // Keep the mapper so that `user_lexicon_from_reader` can apply it to a
    // user lexicon loaded afterwards.
    data.mapper = Some(conn_id_mapper);
    Ok(self)
}
}
94 changes: 18 additions & 76 deletions vibrato/src/dictionary/builder.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
//! Builders for [`Dictionary`].
use std::io::Read;

use crate::dictionary::connector::{MatrixConnector, RawConnector};
use crate::dictionary::{
CharProperty, ConnIdMapper, Connector, ConnectorWrapper, Dictionary, DictionaryInner, LexType,
Lexicon, UnkHandler,
CharProperty, ConnectorWrapper, Dictionary, DictionaryInner, LexType, Lexicon, UnkHandler,
};
use crate::errors::{Result, VibratoError};

use super::lexicon::RawWordEntry;

impl Dictionary {
pub(crate) fn new(
/// Builder for [`Dictionary`] from system lexicon entries.
///
/// The struct has no fields; its associated functions construct and return
/// a [`Dictionary`] rather than a builder instance.
pub struct SystemDictionaryBuilder {}

impl SystemDictionaryBuilder {
pub(crate) fn build(
system_word_entries: &[RawWordEntry],
connector: ConnectorWrapper,
char_prop: CharProperty,
unk_handler: UnkHandler,
) -> Result<Self> {
) -> Result<Dictionary> {
let system_lexicon = Lexicon::from_entries(system_word_entries, LexType::System)?;

if !system_lexicon.verify(&connector) {
Expand All @@ -31,7 +34,7 @@ impl Dictionary {
));
}

Ok(Self {
Ok(Dictionary {
data: DictionaryInner {
system_lexicon,
user_lexicon: None,
Expand All @@ -44,7 +47,7 @@ impl Dictionary {
})
}

/// Creates a new instance from readers in the MeCab format.
/// Creates a new [`Dictionary`] from readers of system entries in the MeCab format.
///
/// # Arguments
///
Expand All @@ -61,7 +64,7 @@ impl Dictionary {
connector_rdr: C,
char_prop_rdr: P,
unk_handler_rdr: U,
) -> Result<Self>
) -> Result<Dictionary>
where
S: Read,
C: Read,
Expand All @@ -75,15 +78,16 @@ impl Dictionary {
let char_prop = CharProperty::from_reader(char_prop_rdr)?;
let unk_handler = UnkHandler::from_reader(unk_handler_rdr, &char_prop)?;

Self::new(
Self::build(
&system_word_entries,
ConnectorWrapper::Matrix(connector),
char_prop,
unk_handler,
)
}

/// Creates a new instance from readers with the detailed bi-gram information.
/// Creates a new [`Dictionary`] from readers of system entries
/// with the detailed bi-gram information.
///
/// # Arguments
///
Expand All @@ -104,7 +108,7 @@ impl Dictionary {
bigram_cost_rdr: C,
char_prop_rdr: P,
unk_handler_rdr: U,
) -> Result<Self>
) -> Result<Dictionary>
where
S: Read,
R: Read,
Expand All @@ -121,75 +125,13 @@ impl Dictionary {
let char_prop = CharProperty::from_reader(char_prop_rdr)?;
let unk_handler = UnkHandler::from_reader(unk_handler_rdr, &char_prop)?;

Self::new(
Self::build(
&system_word_entries,
ConnectorWrapper::Raw(connector),
char_prop,
unk_handler,
)
}

/// Replaces (or clears) the user dictionary using a reader.
///
/// # Arguments
///
/// - `user_lexicon_rdr`: A reader of a lexicon file `*.csv` in the MeCab format.
///   If `None`, the current user dictionary is cleared.
///
/// # Errors
///
/// [`VibratoError`] is returned when
///  - an input format is invalid, or
///  - the user lexicon includes invalid connection ids.
pub fn user_lexicon_from_reader<R>(mut self, user_lexicon_rdr: Option<R>) -> Result<Self>
where
    R: Read,
{
    self.data.user_lexicon = match user_lexicon_rdr {
        None => None,
        Some(rdr) => {
            let mut lexicon = Lexicon::from_reader(rdr, LexType::User)?;
            // Apply any connection-id mapping already in effect before
            // verifying the new entries against the connector.
            if let Some(mapper) = &self.data.mapper {
                lexicon.do_mapping(mapper);
            }
            if !lexicon.verify(self.connector()) {
                return Err(VibratoError::invalid_argument(
                    "user_lexicon_rdr",
                    "user_lexicon_rdr includes invalid connection ids.",
                ));
            }
            Some(lexicon)
        }
    };
    Ok(self)
}

/// Remaps the connection ids used throughout the dictionary.
///
/// The mapping is applied to the system lexicon, the user lexicon (if one
/// is set), the connector, and the unknown-word handler.
///
/// # Arguments
///
/// - `lmap/rmap`: Iterators over new left/right ids, where the `i`-th item
///   (1-origin) is the new id mapped from id `i`.
///
/// # Errors
///
/// [`VibratoError`] is returned when
///  - a new id of [`BOS_EOS_CONNECTION_ID`](crate::common::BOS_EOS_CONNECTION_ID)
///    is included,
///  - new ids are duplicated, or
///  - the set of new ids is not the same as that of old ids.
pub fn mapping_from_iter<L, R>(mut self, lmap: L, rmap: R) -> Result<Self>
where
    L: IntoIterator<Item = u16>,
    R: IntoIterator<Item = u16>,
{
    let conn_id_mapper = ConnIdMapper::from_iter(lmap, rmap)?;

    let data = &mut self.data;
    data.system_lexicon.do_mapping(&conn_id_mapper);
    if let Some(lexicon) = &mut data.user_lexicon {
        lexicon.do_mapping(&conn_id_mapper);
    }
    data.connector.do_mapping(&conn_id_mapper);
    data.unk_handler.do_mapping(&conn_id_mapper);

    // Keep the mapper so that `user_lexicon_from_reader` can apply it to a
    // user lexicon loaded afterwards.
    data.mapper = Some(conn_id_mapper);
    Ok(self)
}
}

#[cfg(test)]
Expand All @@ -203,7 +145,7 @@ mod tests {
let char_def = "DEFAULT 0 1 0";
let unk_def = "DEFAULT,0,0,100,*";

let result = Dictionary::from_readers(
let result = SystemDictionaryBuilder::from_readers(
lexicon_csv.as_bytes(),
matrix_def.as_bytes(),
char_def.as_bytes(),
Expand All @@ -220,7 +162,7 @@ mod tests {
let char_def = "DEFAULT 0 1 0";
let unk_def = "DEFAULT,1,1,100,*";

let result = Dictionary::from_readers(
let result = SystemDictionaryBuilder::from_readers(
lexicon_csv.as_bytes(),
matrix_def.as_bytes(),
char_def.as_bytes(),
Expand Down
6 changes: 3 additions & 3 deletions vibrato/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
//! use std::fs::File;
//! use std::io::{BufRead, BufReader};
//!
//! use vibrato::{Dictionary, Tokenizer};
//! use vibrato::{SystemDictionaryBuilder, Tokenizer};
//!
//! // Loads a set of raw dictionary files
//! let dict = Dictionary::from_readers(
//! let dict = SystemDictionaryBuilder::from_readers(
//! File::open("src/tests/resources/lex.csv")?,
//! File::open("src/tests/resources/matrix.def")?,
//! File::open("src/tests/resources/char.def")?,
Expand Down Expand Up @@ -62,5 +62,5 @@ mod test_utils;
#[cfg(test)]
mod tests;

pub use dictionary::Dictionary;
pub use dictionary::{Dictionary, SystemDictionaryBuilder};
pub use tokenizer::Tokenizer;
Loading