Skip to content

Commit

Permalink
working on realistic data learning
Browse files Browse the repository at this point in the history
  • Loading branch information
dsietz committed Jun 28, 2018
1 parent fe173ed commit eaa4354
Show file tree
Hide file tree
Showing 8 changed files with 214 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "test-data-generation"
version = "0.0.6"
version = "0.0.7"
authors = ["dsietz <davidsietz@yahoo.com>"]
repository = "https://github.com/dsietz/test-data-generation.git"
documentation = "https://docs.rs/test-data-generation/"
Expand Down
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@ or production environment (option #1 above)

## What's New

Here's whats new in 0.0.6:
Here's what's new in 0.0.7:

* Removed obsolete module test_data_generation::data
* Added functionality to determine how realistic the generated test data is compared to the sample data.
* Converted the following functionality to macros so they could be referenced in multiple modules.
> - test_data_generation::data_sample_parser::DataSampleParser::levenshtein_distance()
> - test_data_generation::data_sample_parser::DataSampleParser::realistic_test()
* Added the following functions to test_data_generation::profile::profile::Profile
> This is the beginning of how profiles can learn if they are generating realistic data and make adjustments
> - levenshtein_distance()
> - realistic_test()
> - learn_from_entity()
## About

Expand Down Expand Up @@ -166,4 +170,4 @@ Details on how to contribute can be found in the [CONTRIBUTING](./CONTRIBUTING.m

test-data-generation is primarily distributed under the terms of the Apache License (Version 2.0).

See ![LICENSE-APACHE "Apache License](./LICENSE-APACHE) for details.
See [LICENSE-APACHE](./LICENSE-APACHE "Apache License") for details.
10 changes: 3 additions & 7 deletions src/data_sample_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ use csv;
use std::error::Error;
use csv::WriterBuilder;
use serde_json;
use levenshtein;

type ProfilesMap = BTreeMap<String, Profile>;

Expand Down Expand Up @@ -608,7 +607,7 @@ impl DataSampleParser {
///
pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
// https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
levenshtein::levenshtein(control, experiment)
levenshtein_distance!(control, experiment)
}

/// This function calculates the percent difference between 2 strings.
Expand Down Expand Up @@ -637,11 +636,8 @@ impl DataSampleParser {
//http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
// pearson's chi square test
// cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
let ld: f64 = levenshtein::levenshtein(control, experiment) as f64;
let total: f64 = control.len() as f64 + experiment.len() as f64;
let diff: f64 = total - ld;
(1 as f64 - ((total - diff)/total)) * 100 as f64
}
realistic_test!(control, experiment)
}

/// This function returns a boolean that indicates if the data sample parsing had issues
///
Expand Down
58 changes: 58 additions & 0 deletions src/macros.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
/// Calculates the Levenshtein distance between 2 strings.
/// See: https://crates.io/crates/levenshtein
///
/// The expansion calls `levenshtein::levenshtein`, so the `levenshtein` crate
/// must be available to the calling crate (the example below declares it with
/// `extern crate levenshtein;`).
///
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.
/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.
///
/// Each argument may be any expression that is accepted where
/// `levenshtein::levenshtein` expects a `&str` (a plain variable, `&value`,
/// a string literal, ...).
///
/// # Example
///
/// ```
/// # #[macro_use] extern crate test_data_generation; extern crate levenshtein;
/// # fn main() {
/// let kitten = String::from("kitten");
/// let sitting = String::from("sitting");
/// assert_eq!(levenshtein_distance!(&kitten, &sitting), 3 as usize);
/// # }
/// ```
#[macro_export]
macro_rules! levenshtein_distance {
    // `expr` (rather than `ident`) so that call sites can pass `&value` or a
    // literal directly; bare identifiers are expressions too, so existing
    // callers keep working.
    ( $c:expr, $e:expr ) => {
        {
            use levenshtein;

            levenshtein::levenshtein($c, $e)
        }
    }
}

/// This macro generates a random number between 0 and 100.
/// Returns a f64.
///
Expand Down Expand Up @@ -54,3 +83,32 @@ macro_rules! random_between {
}
};
}

/// Calculates how similar 2 strings are, expressed as a percentage.
///
/// The score is `(1 - distance / total) * 100`, where `distance` is the
/// Levenshtein distance between the strings (see `levenshtein_distance!`) and
/// `total` is the combined number of characters in both strings. Identical
/// strings score `100`. Two empty strings are treated as identical and also
/// score `100` instead of dividing zero by zero.
///
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.
/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the percent similarity.
///
/// Each argument may be any expression that coerces to `&str`; it is
/// evaluated exactly once.
///
/// # Example
///
/// ```
/// # #[macro_use] extern crate test_data_generation; extern crate levenshtein;
/// # fn main() {
/// let kitten = String::from("kitten");
/// let sitting = String::from("sitting");
/// assert_eq!(realistic_test!(&kitten, &sitting), 76.92307692307692 as f64);
/// # }
/// ```
#[macro_export]
macro_rules! realistic_test {
    ( $c:expr, $e:expr ) => {
        {
            // Bind once so that an argument expression is not re-evaluated.
            let control_str: &str = $c;
            let experiment_str: &str = $e;

            // Count characters, not bytes: the Levenshtein distance is measured
            // in characters, so both sides of the ratio must use the same unit.
            let total_chars: usize = control_str.chars().count() + experiment_str.chars().count();

            if total_chars == 0 {
                // nothing differs between two empty strings
                100 as f64
            } else {
                let ld: f64 = levenshtein_distance!(control_str, experiment_str) as f64;
                let total: f64 = total_chars as f64;
                (1 as f64 - (ld / total)) * 100 as f64
            }
        }
    }
}
1 change: 1 addition & 0 deletions src/profile/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ macro_rules! symbolize_char {
/// // will return a Fact that represents the char `W`
/// # }
/// ```
#[macro_export]
macro_rules! factualize_entity {
( $entity:ident, $idx:ident ) => {
{
Expand Down
101 changes: 101 additions & 0 deletions src/profile/profile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,107 @@ impl Profile {
generated
}

/// This function learns by measuring how realistic the test data it generates is compared to the sample data that was provided.
///
/// Ten candidate values are generated with `generate()`. Each candidate is scored
/// against every string in `control_list` with `realistic_test()` and the scores
/// are averaged. A candidate whose average score is at least 80 is fed back into
/// the profile through `analyze()`.
///
/// # Arguments
///
/// * `control_list: Vec<String>` - The list of strings to compare against. This would be the real data from the data sample.
///
/// # Returns
///
/// `Ok(true)` once every round has run; this function has no `Err` path of its own.
/// An empty `control_list` gives nothing to compare against, so the function
/// returns `Ok(true)` immediately without generating any data.
///
/// # Example
///
/// ```
/// extern crate test_data_generation;
///
/// use test_data_generation::profile::profile::Profile;
///
/// fn main() {
///     let mut profil = Profile::new();
///     let sample_data = vec!("Smith, John".to_string(),"Doe, John".to_string(),"Dale, Danny".to_string(),"Rickets, Ronney".to_string());
///
///     for sample in sample_data.iter() {
///         profil.analyze(&sample);
///     }
///
///     // in order to learn the profile must be prepared with pre_generate()
///     // so it can generate data to learn from
///     profil.pre_generate();
///
///     let learning = profil.learn_from_entity(sample_data).unwrap();
///
///     assert_eq!(learning, true);
/// }
/// ```
pub fn learn_from_entity(&mut self, control_list: Vec<String>) -> Result<bool, String> {
    // number of candidate values generated per call
    const ROUNDS: usize = 10;
    // minimum average similarity (in percent) for a candidate to be learned
    const MIN_SIMILARITY: f64 = 80.0;

    // Averaging over zero samples would be 0/0 (NaN) and could never reach the
    // threshold, so there is nothing to do.
    if control_list.is_empty() {
        return Ok(true);
    }

    for _ in 0..ROUNDS {
        let experiment = self.generate();
        let mut similarity_sum: f64 = 0.0;

        for control in control_list.iter() {
            debug!("Comparing {} with {} ...", &control, &experiment);
            similarity_sum += self.realistic_test(control, &experiment);
        }

        let percent = similarity_sum / control_list.len() as f64;
        debug!("Percent similarity is {} ...", &percent);

        // only realistic-looking candidates are added to what the profile knows
        if percent >= MIN_SIMILARITY {
            self.analyze(&experiment);
        }
    }

    Ok(true)
}

/// This function calculates the levenshtein distance between 2 strings.
/// See: https://crates.io/crates/levenshtein
///
/// The work is delegated to the `levenshtein_distance!` macro.
///
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.
/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.
///
/// # Returns
///
/// The distance between the two strings as a `usize`.
///
/// # Example
///
/// ```
/// extern crate test_data_generation;
///
/// use test_data_generation::profile::profile::Profile;
///
/// fn main() {
///     let mut profile = Profile::new();
///
///     assert_eq!(profile.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
/// }
/// ```
pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
// https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
levenshtein_distance!(control, experiment)
}

/// This function calculates how similar 2 strings are, as a percentage.
///
/// The work is delegated to the `realistic_test!` macro, which derives the score
/// from the levenshtein distance relative to the combined length of both strings.
/// A higher value means the strings are more alike.
///
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.
/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the percent similarity.
///
/// # Returns
///
/// The similarity score as a `f64` percentage.
///
/// # Example
///
/// ```
/// extern crate test_data_generation;
///
/// use test_data_generation::profile::profile::Profile;
///
/// fn main() {
///     let mut profile = Profile::new();
///
///     assert_eq!(profile.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
/// }
/// ```
pub fn realistic_test(&mut self, control: &String, experiment: &String) -> f64 {
realistic_test!(control, experiment)
}

/// This function is called from within the implementated structure and returns a list processors (Vec) with empty lists (Vec) for their Facts.
/// Each processor shares the load of generating the data based on the Facts it has been assigned to manage.
///
Expand Down
11 changes: 11 additions & 0 deletions tests/data_sample_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,17 @@ mod tests {
assert_eq!(dsp.generate_record()[0], "OK".to_string());
}

#[test]
// ensure the Data Sample Parser can read all the headers from the csv file
fn read_headers(){
    let sample_path = String::from("./tests/samples/sample-01.csv");
    let mut parser = DataSampleParser::new();

    // the sample file must be analyzed before its headers can be extracted
    parser.analyze_csv_file(&sample_path).unwrap();

    assert_eq!(parser.extract_headers().len(), 2);
}

#[test]
// ensure DataSampleParser can analyze a csv formatted file
fn parse_csv_file(){
Expand Down
31 changes: 31 additions & 0 deletions tests/profile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ mod tests {

assert_eq!(profile.apply_facts(results.0, results.1).unwrap(),1);
}

#[test]
// ensure a Profile can convert a str to an array of facts and the pattern
fn factualize(){
Expand All @@ -25,6 +26,36 @@ mod tests {
assert_eq!(t.0,"Cvcc");
}

#[test]
// ensure a Profile reports the expected levenshtein distance between two strings
fn levenshtein_test(){
    let control = "kitten".to_string();
    let experiment = "sitting".to_string();
    let mut profile = Profile::new();

    let distance = profile.levenshtein_distance(&control, &experiment);

    assert_eq!(distance, 3_usize);
}

#[test]
// ensure a Profile reports the expected similarity percentage between two strings
fn realistic_data_test(){
    let control = "kitten".to_string();
    let experiment = "sitting".to_string();
    let mut profile = Profile::new();

    let similarity = profile.realistic_test(&control, &experiment);

    assert_eq!(similarity, 76.92307692307692_f64);
}

#[test]
// ensure a prepared Profile can run a learning pass against its own sample data
fn learn_from_entity(){
    let sample_data: Vec<String> = ["Smith, John", "Doe, John", "Dale, Danny", "Rickets, Ronney"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let mut profil = Profile::new();

    for sample in &sample_data {
        profil.analyze(&sample);
    }

    // the profile must be prepared before it can generate data to learn from
    profil.pre_generate();

    assert!(profil.learn_from_entity(sample_data).unwrap());
}

#[test]
// ensure logging is working in the crate
fn logging_test(){
Expand Down

0 comments on commit eaa4354

Please sign in to comment.