Fix command position of arguments and make title-filter-list optional (#201)

* Make --title-filter-list optional
* Fix position of arguments in scripts
* Also adjust README
common-voice · Aug 19, 2023 · 75b5949 · 75b5949
1 parent f89fd75
commit 75b5949
Show file tree

Hide file tree

Showing 5 changed files with 16 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -81,7 +81,7 @@ In the beginning, the WikiExtractor prints out how many processes it will use fo
 ```bash
 cd ../cv-sentence-extractor
 pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
-cargo run --release -- extract -l en -d ../wikiextractor/text/ >> wiki.en.txt
+cargo run --release -- -l en -d ../wikiextractor/text/ extract >> wiki.en.txt
 ```
 
 *Tip: You don't need this last process to finish to start observing the output; wiki.en.txt should get a few thousand sentences in just a few minutes, and you can use that as a way to estimate the quality of the output early on and stop the process if you are not happy.*
@@ -137,7 +137,7 @@ python WikiExtractor.py --json ../enwikisource-latest-pages-articles.xml
 ```bash
 cd ../cv-sentence-extractor
 pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
-cargo run --release -- extract-wikisource -l en -d ../wikiextractor/text/ >> wiki.en.txt
+cargo run --release -- -l en -d ../wikiextractor/text/ extract-wikisource >> wiki.en.txt
 ```
 
 *Tip: You don't need this last process to finish to start observing the output; wiki.en.txt should get a few thousand sentences in just a few minutes, and you can use that as a way to estimate the quality of the output early on and stop the process if you are not happy.*
@@ -148,7 +148,7 @@ If you have one or multiple files with one sentence per line, you can use this e
 
 ```bash
 pip3 install -r requirements.txt # can be skipped if your language doesn't use the Python segmenter
-cargo run --release -- extract-file -l en -d ../texts/ >> file.en.txt
+cargo run --release -- -l en -d ../texts/ extract-file >> file.en.txt
 ```
 
 ## Using language rules
@@ -257,7 +257,7 @@ After running step 1 and 2 from the `Usage` section above, run:
 
 ```bash
 cd ../cv-sentence-extractor
-cargo run --release -- extract -l en -d ../wikiextractor/text/ --no-check >> wiki.en.all.txt
+cargo run --release -- -l en -d ../wikiextractor/text/ --no-check extract >> wiki.en.all.txt
 ```
 
 Then you can use the cvtools scripts to generate a list of the word frequency:

diff --git a/scripts/providers/wiki-source.sh b/scripts/providers/wiki-source.sh
@@ -30,7 +30,7 @@ function extract {
   python $WIKI_EXTRACTOR_PATH --processes 4 --json $DUMP_FILE
 
   echo "Running extraction"
-  cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH >> $EXTRACTED_SENTENCES_PATH
+  cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH extract >> $EXTRACTED_SENTENCES_PATH
 }
 
 function cleanup {

diff --git a/scripts/providers/wiki.sh b/scripts/providers/wiki.sh
@@ -58,11 +58,11 @@ function extract {
 
   echo "Running extraction"
   if [ $TYPE == "blocklist" ]; then
-    cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH --no-check >> $EXTRACTED_SENTENCES_PATH
+    cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH --no-check extract >> $EXTRACTED_SENTENCES_PATH
   elif [ -f "$TITLE_FILTER_PATH" ]; then
-    cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH --title-filter-list $TITLE_FILTER_PATH >> $EXTRACTED_SENTENCES_PATH
+    cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH extract --title-filter-list $TITLE_FILTER_PATH >> $EXTRACTED_SENTENCES_PATH
   else
-    cargo run --release -- extract -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH >> $EXTRACTED_SENTENCES_PATH
+    cargo run --release -- -l $LANGUAGE_CODE -d $EXTRACTED_TEXT_PATH extract >> $EXTRACTED_SENTENCES_PATH
   fi
 }
 

diff --git a/src/app.rs b/src/app.rs
@@ -28,7 +28,7 @@ enum Commands {
     Extract {
         /// path to the file containing titles to filter for
         #[arg(short, long)]
-        title_filter_list: String,
+        title_filter_list: Option<String>,
     },
 
     /// Extract sentences from Wikisource dump extracts using WikiExtractor
@@ -48,15 +48,16 @@ pub fn start() -> Result<(), String> {
     match &args.command {
         Commands::Extract { title_filter_list } => {
             let wikipedia_loader = Wikipedia::new(language, directory);
-            extract(wikipedia_loader, no_check, title_filter_list)
+            let filter_list_value = title_filter_list.clone().unwrap_or(String::from(""));
+            extract(wikipedia_loader, no_check, filter_list_value)
         },
         Commands::ExtractWikisource => {
             let wikipedia_loader = Wikipedia::new(language, directory);
-            extract(wikipedia_loader, no_check, "")
+            extract(wikipedia_loader, no_check, String::from(""))
         },
         Commands::ExtractFile => {
             let file_loader = File::new(language, directory);
-            extract(file_loader, no_check, "")
+            extract(file_loader, no_check, String::from(""))
         }
     }
 }
diff --git a/src/extractor.rs b/src/extractor.rs
@@ -15,7 +15,7 @@ use std::io::Read;
 use std::path::Path;
 use std::path::PathBuf;
 
-pub fn extract(loader: impl Loader, no_check: bool, filter_list_path: &str) -> Result<(), String> {
+pub fn extract(loader: impl Loader, no_check: bool, filter_list_path: String) -> Result<(), String> {
     let config = loader.get_config();
     let rules = load_rules(&config.language);
     let training_data = get_training_data(&config.language);
@@ -191,14 +191,14 @@ fn load_file_names(dir_name: &str, prefix: &str) -> Result<Vec<PathBuf>, String>
         .collect::<Result<Vec<PathBuf>, String>>()
 }
 
-fn read_filtered_titles(filtered_titles_path: &str) -> HashSet<String> {
+fn read_filtered_titles(filtered_titles_path: String) -> HashSet<String> {
     if filtered_titles_path.is_empty() {
         return HashSet::new();
     }
 
     eprintln!("Reading titles from {:?}", filtered_titles_path);
     let mut titles = HashSet::new();
-    let titles_path = Path::new(filtered_titles_path);
+    let titles_path = Path::new(&filtered_titles_path);
     let mut content = String::new();
     let mut file = File::open(titles_path).map_err(|e| format!("{}", e)).unwrap();
     file.read_to_string(&mut content)