working on realistic determination of generated data

dsietz · Jun 26, 2018 · fe173ed · fe173ed
1 parent 9644d79
commit fe173ed
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 112 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -38,7 +38,7 @@ regex        = "0.2"
 rand         = "0.3"
 crossbeam    = "0.3.2"
 csv          = "1.0.0-beta.5"
-oozie        = "0.1.2"
+levenshtein  = "1.0.3"
 
 [profile.release]
 opt-level = 3

diff --git a/README.md b/README.md
@@ -35,31 +35,12 @@ or production environment (option #1 above)
 
 ## What's New
 
-Here's whats new in 0.0.5:
-
-* Added the following new module and functions to the test_data_generation::shared module
-> - string_to_static_str(s: String) -> &'static str
-* The following macros have been modified with 'returns', instead of 'sets'
-> - random_percentage
-> - random_between
-* Added the following macros data_test_generation::profile
-> - symbolize_char -> char
-> -  factualize_entity -> (String, Vec<Fact>)
-* The following test_data_generation::data_sample_parser::DataSampleParser functions takes _&String_ instead of _&'static str_ as the path parameter.
-> - analyze_csv_file
-> - from_file
-> - generate_csv
-> - with_new
-> - save
-* The following test_data_generation::configs::Configs functions takes _&String_ instead of _&'static str_ as the path parameter.
-> - new
-* Added the test_data_generation::data_sample_parser::DataSampleParserfunction _analyze_csv_data_ function so that the csv data doesn't need to 'land' in order to be analyzed.
-This is helpful when wrapping the test data generation library in a REST service for instance.
-* Added the test_data_generation::profile::profile::Profile _factualize_ function so that the processing of building Facts can be multi-threaded in the future
-* Added the test_data_generation::profile::pattern::Pattern _factualize_ function so that the processing of building Facts can be multi-threaded in the future.
-* Refactored the following items
-> - test_data_generation::profile::Profile function apply_facts renamed to generate_from_pattern
-* Improved documentation
+Here's what's new in 0.0.6:
+
+* Removed obsolete module test_data_generation::data
+* Added functionality to determine how realistic the generated test data is compared to the sample data.
+> - test_data_generation::data_sample_parser::DataSampleParser::levenshtein_distance()
+> - test_data_generation::data_sample_parser::DataSampleParser::realistic_test()
 
 ## About
 

diff --git a/src/data_sample_parser.rs b/src/data_sample_parser.rs
@@ -86,8 +86,7 @@ use csv;
 use std::error::Error;
 use csv::WriterBuilder;
 use serde_json;
-use oozie::similarity;
-use std::collections::HashMap;
+use levenshtein;
 
 type ProfilesMap = BTreeMap<String, Profile>;
 
@@ -445,6 +444,34 @@ impl DataSampleParser {
     	profil.generate()
 	}
 
+	/// This function returns one header name per entry in the parser's profiles.
+	/// The names come back in the iteration order of the profiles collection.
+	///
+	/// # Example
+	///
+	/// ```
+	/// extern crate test_data_generation;
+	///
+	/// use test_data_generation::data_sample_parser::DataSampleParser;
+	///
+	/// fn main() {
+	///     // initialize a new DataSampleParser
+	///     let mut dsp = DataSampleParser::new();
+	///
+	///     dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv")).unwrap();
+	///     let headers = dsp.extract_headers();
+	///
+	///     assert_eq!(headers.len(), 2);
+	/// }
+	/// ```
+	pub fn extract_headers(&mut self) -> Vec<String> {
+		// element 0 of each profiles entry is the header name
+		self.profiles
+			.iter_mut()
+			.map(|entry| entry.0.to_string())
+			.collect()
+	}
+
 	/// This function generates test data for the specified field name.
 	///
 	/// # Arguments
@@ -557,34 +584,65 @@ impl DataSampleParser {
 		Ok(())
 	}
 
-	/// This function returns a vector of header names
+	/// This function calculates the levenshtein distance between 2 strings.
+	/// See: https://crates.io/crates/levenshtein
 	///
-	/// # Example
+	/// # Arguments
+	///
+	/// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
+	/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
+	///
+	/// # Example
 	///
 	/// ```
 	/// extern crate test_data_generation;
 	///
 	/// use test_data_generation::data_sample_parser::DataSampleParser;
 	///
 	/// fn main() {
-	///		// initalize a new DataSampelParser
-	///		let mut dsp = DataSampleParser::new();
-    ///
-    /// 	dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv")).unwrap();
-    ///     let headers = dsp.extract_headers();
-    ///
-    ///		assert_eq!(headers.len(), 2);
+	/// 	// analyze the dataset
+	///		let mut dsp =  DataSampleParser::new();
+	///
+	///     assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
 	/// }
-	pub fn extract_headers(&mut self) -> Vec<String> {
-		let mut headers = vec!();
-
-		for profile in self.profiles.iter_mut() {
-			headers.push(profile.0.to_string());
-		}
-
-		headers
+	/// ```
+	pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
+		// delegate to the levenshtein crate: https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
+		levenshtein::levenshtein(control, experiment)
 	}
 
+	/// This function scores how closely generated data resembles sample data, as a percentage.
+	/// Identical strings (including two empty ones) score 100; the score drops as the levenshtein distance grows.
+	///
+	/// # Arguments
+	///
+	/// * `control: &String` - The string to compare against. This would be the real data from the data sample.
+	/// * `experiment: &String` - The string to compare. This would be the generated data that is being scored.
+	///
+	/// # Example
+	///
+	/// ```
+	/// extern crate test_data_generation;
+	///
+	/// use test_data_generation::data_sample_parser::DataSampleParser;
+	///
+	/// fn main() {
+	///     let mut dsp = DataSampleParser::new();
+	///
+	///     assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
+	/// }
+	/// ```
+	pub fn realistic_test(&mut self, control: &String, experiment: &String) -> f64 {
+		let ld: f64 = levenshtein::levenshtein(control, experiment) as f64;
+		// Count chars, not bytes, so the lengths use the same unit as the edit distance (the crate compares chars).
+		let total: f64 = (control.chars().count() + experiment.chars().count()) as f64;
+		// Two empty strings are identical; return early because the formula would divide 0 by 0 (NaN).
+		if total == 0.0 {
+			return 100.0;
+		}
+		(1.0 - ld / total) * 100.0
+	}
+
 	/// This function returns a boolean that indicates if the data sample parsing had issues
 	///
 	/// # Example
@@ -660,60 +718,4 @@ impl DataSampleParser {
 
 		Ok(true)
 	}
-
-	pub fn string_to_vector(&mut self, text: String) -> Vec<f64>{
-		let vu8 = text.into_bytes();
-		let mut vf64 = vec!();
-
-		for b in &vu8 {
-			vf64.push(*b as f64);
-		}
-
-		vf64
-	}
-
-	pub fn realistic_test(&mut self, generated_data: &'static str, sample_data: &'static str) -> Result<f64, Box<Error>> {
-		//https://docs.rs/GSL/0.4.31/rgsl/statistics/fn.correlation.html
-		//http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
-		// pearson's chi square test
-		// cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
-
-		let mut str_gen = String::from(generated_data);
-		let mut str_smpl = String::from(sample_data);
-
-		while str_gen.len() < str_smpl.len() {
-			str_gen.push(' ');
-		}
-
-		while str_smpl.len() < str_gen.len() {
-			str_smpl.push(' ');
-		}
-
-		let gen_data = self.string_to_vector(str_gen);
-		let smpl_data = self.string_to_vector(str_smpl);
-
-		let mut gen_map: HashMap<usize, f64> = HashMap::new();
-		let gen_sz = gen_data.len();
-		for gd in gen_data {
-			gen_map.insert(gen_sz, gd);
-		}
-
-		let mut smpl_map: HashMap<usize, f64> = HashMap::new();
-		let smpl_sz = smpl_data.len();
-		for sd in smpl_data {
-			smpl_map.insert(smpl_sz, sd);
-		}
-
-
-		let cos = similarity::cosine(&gen_map, &smpl_map, gen_sz);
-		println!("cosine simularity {:?}", cos);
-		//let v = vec!(111 as f64, 101 as f64);
-		//let avg_gen_data = statistical::mean(&gen_data);
-
-		//println!("{}",avg_gen_data);
-		//let corr = statistical::correlation(gen_data, 1 as usize, sam_data, 1 as usize, sam_data.len());
-		//println!("the Correlation Coefficient is {}",avg_gen_data);
-
-		Ok(1 as f64)
-	}
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -142,7 +142,7 @@ extern crate regex;
 extern crate rand;
 extern crate crossbeam;
 extern crate csv;
-extern crate oozie;
+extern crate levenshtein;
 
 #[macro_use]
 pub mod macros;

diff --git a/tests/data_sample_parser.rs b/tests/data_sample_parser.rs
@@ -77,21 +77,20 @@ mod tests {
     }
 
     #[test]
-    // ensure the DataSampleParser object can convert a string to a vector of numbers for each char
-    fn string_to_vector(){
+    // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
+    fn levenshtein_test(){
     	let mut dsp =  DataSampleParser::new();
 
-    	assert_eq!(dsp.string_to_vector(String::from("hello")), [104 as f64, 101 as f64, 108 as f64, 108 as f64, 111 as f64]);
+    	assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
     }
 
+	#[test]
+	// ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
+	fn realistic_data_test(){
+		let mut dsp =  DataSampleParser::new();
 
-    #[test]
-    // ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
-    fn realistic_data_test(){
-    	let mut dsp =  DataSampleParser::new();
-
-    	assert_eq!(dsp.realistic_test("Hello", "Hello").unwrap(),1 as f64);
-    }
+		assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
+	}
 
     #[test]
     // demo test