From fe173ed21371df827bdbbf7db05e9d698feb4875 Mon Sep 17 00:00:00 2001 From: DAVID SIETZ Date: Tue, 26 Jun 2018 14:18:07 -0400 Subject: [PATCH] working on realistic determination of generated data --- Cargo.toml | 2 +- README.md | 31 ++------ src/data_sample_parser.rs | 152 ++++++++++++++++++------------------ src/lib.rs | 2 +- tests/data_sample_parser.rs | 19 +++-- 5 files changed, 94 insertions(+), 112 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 3f2b15c..634f1fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ regex = "0.2" rand = "0.3" crossbeam = "0.3.2" csv = "1.0.0-beta.5" -oozie = "0.1.2" +levenshtein = "1.0.3" [profile.release] opt-level = 3 diff --git a/README.md b/README.md index 1009947..5608247 100644 --- a/README.md +++ b/README.md @@ -35,31 +35,12 @@ or production environment (option #1 above) ## What's New -Here's whats new in 0.0.5: - -* Added the following new module and functions to the test_data_generation::shared module -> - string_to_static_str(s: String) -> &'static str -* The following macros have been modified with 'returns', instead of 'sets' -> - random_percentage -> - random_between -* Added the following macros data_test_generation::profile -> - symbolize_char -> char -> - factualize_entity -> (String, Vec) -* The following test_data_generation::data_sample_parser::DataSampleParser functions takes _&String_ instead of _&'static str_ as the path parameter. -> - analyze_csv_file -> - from_file -> - generate_csv -> - with_new -> - save -* The following test_data_generation::configs::Configs functions takes _&String_ instead of _&'static str_ as the path parameter. -> - new -* Added the test_data_generation::data_sample_parser::DataSampleParserfunction _analyze_csv_data_ function so that the csv data doesn't need to 'land' in order to be analyzed. -This is helpful when wrapping the test data generation library in a REST service for instance. 
-* Added the test_data_generation::profile::profile::Profile _factualize_ function so that the processing of building Facts can be multi-threaded in the future -* Added the test_data_generation::profile::pattern::Pattern _factualize_ function so that the processing of building Facts can be multi-threaded in the future. -* Refactored the following items -> - test_data_generation::profile::Profile function apply_facts renamed to generate_from_pattern -* Improved documentation +Here's what's new in 0.0.6: + +* Removed obsolete module test_data_generation::data +* Added functionality to determine how realistic the generated test data is compared to the sample data. +> - test_data_generation::data_sample_parser::DataSampleParser::levenshtein_distance() +> - test_data_generation::data_sample_parser::DataSampleParser::realistic_test() ## About diff --git a/src/data_sample_parser.rs b/src/data_sample_parser.rs index 02fd8a4..df97583 100644 --- a/src/data_sample_parser.rs +++ b/src/data_sample_parser.rs @@ -86,8 +86,7 @@ use csv; use std::error::Error; use csv::WriterBuilder; use serde_json; -use oozie::similarity; -use std::collections::HashMap; +use levenshtein; type ProfilesMap = BTreeMap; @@ -445,6 +444,34 @@ impl DataSampleParser { profil.generate() } + /// This function returns a vector of header names + /// + /// # Example + /// + /// ``` + /// extern crate test_data_generation; + /// + /// use test_data_generation::data_sample_parser::DataSampleParser; + /// + /// fn main() { + /// // initialize a new DataSampleParser + /// let mut dsp = DataSampleParser::new(); + /// + /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv")).unwrap(); + /// let headers = dsp.extract_headers(); + /// + /// assert_eq!(headers.len(), 2); + /// } + pub fn extract_headers(&mut self) -> Vec { + let mut headers = vec!(); + + for profile in self.profiles.iter_mut() { + headers.push(profile.0.to_string()); + } + + headers + } + /// This function generates test data for the 
specified field name. /// /// # Arguments @@ -557,9 +584,15 @@ impl DataSampleParser { Ok(()) } - /// This function returns a vector of header names + /// This function calculates the levenshtein distance between 2 strings. + /// See: https://crates.io/crates/levenshtein /// - /// # Example + /// # Arguments + /// + /// * `control: &String` - The string to compare against. This would be the real data from the data sample.
+ /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.
+ /// + /// # Example /// /// ``` /// extern crate test_data_generation; @@ -567,24 +600,49 @@ impl DataSampleParser { /// use test_data_generation::data_sample_parser::DataSampleParser; /// /// fn main() { - /// // initalize a new DataSampelParser - /// let mut dsp = DataSampleParser::new(); - /// - /// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv")).unwrap(); - /// let headers = dsp.extract_headers(); - /// - /// assert_eq!(headers.len(), 2); + /// // analyze the dataset + /// let mut dsp = DataSampleParser::new(); + /// + /// assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize); /// } - pub fn extract_headers(&mut self) -> Vec { - let mut headers = vec!(); - - for profile in self.profiles.iter_mut() { - headers.push(profile.0.to_string()); - } - - headers + /// ``` + pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize { + // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html + levenshtein::levenshtein(control, experiment) } + /// This function calculates the percent similarity between 2 strings. + /// + /// # Arguments + /// + /// * `control: &String` - The string to compare against. This would be the real data from the data sample.
+ /// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the percent similarity.
+ /// + /// # Example + /// + /// ``` + /// extern crate test_data_generation; + /// + /// use test_data_generation::data_sample_parser::DataSampleParser; + /// + /// fn main() { + /// // analyze the dataset + /// let mut dsp = DataSampleParser::new(); + /// + /// assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64); + /// } + /// ``` + pub fn realistic_test(&mut self, control: &String, experiment: &String) -> f64 { + //https://docs.rs/GSL/0.4.31/rgsl/statistics/fn.correlation.html + //http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/ + // pearson's chi square test + // cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/ + let ld: f64 = levenshtein::levenshtein(control, experiment) as f64; + let total: f64 = control.len() as f64 + experiment.len() as f64; + let diff: f64 = total - ld; + (1 as f64 - ((total - diff)/total)) * 100 as f64 + } + /// This function returns a boolean that indicates if the data sample parsing had issues /// /// # Example @@ -660,60 +718,4 @@ impl DataSampleParser { Ok(true) } - - pub fn string_to_vector(&mut self, text: String) -> Vec{ - let vu8 = text.into_bytes(); - let mut vf64 = vec!(); - - for b in &vu8 { - vf64.push(*b as f64); - } - - vf64 - } - - pub fn realistic_test(&mut self, generated_data: &'static str, sample_data: &'static str) -> Result> { - //https://docs.rs/GSL/0.4.31/rgsl/statistics/fn.correlation.html - //http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/ - // pearson's chi square test - // cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/ - - let mut str_gen = String::from(generated_data); - let mut str_smpl = String::from(sample_data); - - while str_gen.len() < str_smpl.len() { - str_gen.push(' '); - } - - while str_smpl.len() < 
str_gen.len() { - str_smpl.push(' '); - } - - let gen_data = self.string_to_vector(str_gen); - let smpl_data = self.string_to_vector(str_smpl); - - let mut gen_map: HashMap = HashMap::new(); - let gen_sz = gen_data.len(); - for gd in gen_data { - gen_map.insert(gen_sz, gd); - } - - let mut smpl_map: HashMap = HashMap::new(); - let smpl_sz = smpl_data.len(); - for sd in smpl_data { - smpl_map.insert(smpl_sz, sd); - } - - - let cos = similarity::cosine(&gen_map, &smpl_map, gen_sz); - println!("cosine simularity {:?}", cos); - //let v = vec!(111 as f64, 101 as f64); - //let avg_gen_data = statistical::mean(&gen_data); - - //println!("{}",avg_gen_data); - //let corr = statistical::correlation(gen_data, 1 as usize, sam_data, 1 as usize, sam_data.len()); - //println!("the Correlation Coefficient is {}",avg_gen_data); - - Ok(1 as f64) - } } diff --git a/src/lib.rs b/src/lib.rs index dd7ee86..3f7125a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -142,7 +142,7 @@ extern crate regex; extern crate rand; extern crate crossbeam; extern crate csv; -extern crate oozie; +extern crate levenshtein; #[macro_use] pub mod macros; diff --git a/tests/data_sample_parser.rs b/tests/data_sample_parser.rs index 9c62df3..73b65b5 100644 --- a/tests/data_sample_parser.rs +++ b/tests/data_sample_parser.rs @@ -77,21 +77,20 @@ mod tests { } #[test] - // ensure the DataSampleParser object can convert a string to a vector of numbers for each char - fn string_to_vector(){ + // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data + fn levenshtein_test(){ let mut dsp = DataSampleParser::new(); - assert_eq!(dsp.string_to_vector(String::from("hello")), [104 as f64, 101 as f64, 108 as f64, 108 as f64, 111 as f64]); + assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize); } + #[test] + // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated 
data + fn realistic_data_test(){ + let mut dsp = DataSampleParser::new(); - #[test] - // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data - fn realistic_data_test(){ - let mut dsp = DataSampleParser::new(); - - assert_eq!(dsp.realistic_test("Hello", "Hello").unwrap(),1 as f64); - } + assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64); + } #[test] // demo test