Skip to content

Commit

Permalink
working on realistic determination of generated data
Browse files Browse the repository at this point in the history
  • Loading branch information
DAVID SIETZ authored and DAVID SIETZ committed Jun 26, 2018
1 parent 9644d79 commit fe173ed
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 112 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ regex = "0.2"
rand = "0.3"
crossbeam = "0.3.2"
csv = "1.0.0-beta.5"
oozie = "0.1.2"
levenshtein = "1.0.3"

[profile.release]
opt-level = 3
Expand Down
31 changes: 6 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,31 +35,12 @@ or production environment (option #1 above)

## What's New

Here's whats new in 0.0.5:

* Added the following new module and functions to the test_data_generation::shared module
> - string_to_static_str(s: String) -> &'static str
* The following macros have been modified with 'returns', instead of 'sets'
> - random_percentage
> - random_between
* Added the following macros data_test_generation::profile
> - symbolize_char -> char
> - factualize_entity -> (String, Vec<Fact>)
* The following test_data_generation::data_sample_parser::DataSampleParser functions takes _&String_ instead of _&'static str_ as the path parameter.
> - analyze_csv_file
> - from_file
> - generate_csv
> - with_new
> - save
* The following test_data_generation::configs::Configs functions takes _&String_ instead of _&'static str_ as the path parameter.
> - new
* Added the test_data_generation::data_sample_parser::DataSampleParserfunction _analyze_csv_data_ function so that the csv data doesn't need to 'land' in order to be analyzed.
This is helpful when wrapping the test data generation library in a REST service for instance.
* Added the test_data_generation::profile::profile::Profile _factualize_ function so that the processing of building Facts can be multi-threaded in the future
* Added the test_data_generation::profile::pattern::Pattern _factualize_ function so that the processing of building Facts can be multi-threaded in the future.
* Refactored the following items
> - test_data_generation::profile::Profile function apply_facts renamed to generate_from_pattern
* Improved documentation
Here's what's new in 0.0.6:

* Removed obsolete module test_data_generation::data
* Added functionality to determine how realistic the generated test data is compared to the sample data.
> - test_data_generation::data_sample_parser::DataSampleParser::levenshtein_distance()
> - test_data_generation::data_sample_parser::DataSampleParser::realistic_test()
## About

Expand Down
152 changes: 77 additions & 75 deletions src/data_sample_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,7 @@ use csv;
use std::error::Error;
use csv::WriterBuilder;
use serde_json;
use oozie::similarity;
use std::collections::HashMap;
use levenshtein;

type ProfilesMap = BTreeMap<String, Profile>;

Expand Down Expand Up @@ -445,6 +444,34 @@ impl DataSampleParser {
profil.generate()
}

/// Returns the header (field) names known to this parser, one entry per stored profile.
///
/// Every name is copied into a freshly allocated `String`, so the returned vector does
/// not borrow from the parser.
///
/// # Example
///
/// ```
/// extern crate test_data_generation;
///
/// use test_data_generation::data_sample_parser::DataSampleParser;
///
/// fn main() {
///     // initialize a new DataSampleParser
///     let mut dsp = DataSampleParser::new();
///
///     dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv")).unwrap();
///     let headers = dsp.extract_headers();
///
///     assert_eq!(headers.len(), 2);
/// }
/// ```
pub fn extract_headers(&mut self) -> Vec<String> {
    // The first element of each entry is the field name; the profile itself is not needed.
    self.profiles
        .iter_mut()
        .map(|entry| entry.0.to_string())
        .collect()
}

/// This function generates test data for the specified field name.
///
/// # Arguments
Expand Down Expand Up @@ -557,34 +584,65 @@ impl DataSampleParser {
Ok(())
}

/// This function returns a vector of header names
/// This function calculates the levenshtein distance between 2 strings.
/// See: https://crates.io/crates/levenshtein
///
/// # Example
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.</br>
/// * `experiment: &String` - The string to compare. This would be the generated data for which you want to find the distance.</br>
///
/// # Example
///
/// ```
/// extern crate test_data_generation;
///
/// use test_data_generation::data_sample_parser::DataSampleParser;
///
/// fn main() {
/// // initalize a new DataSampelParser
/// let mut dsp = DataSampleParser::new();
///
/// dsp.analyze_csv_file(&String::from("./tests/samples/sample-01.csv")).unwrap();
/// let headers = dsp.extract_headers();
///
/// assert_eq!(headers.len(), 2);
/// // analyze the dataset
/// let mut dsp = DataSampleParser::new();
///
/// assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
/// }
pub fn extract_headers(&mut self) -> Vec<String> {
let mut headers = vec!();

for profile in self.profiles.iter_mut() {
headers.push(profile.0.to_string());
}

headers
///
pub fn levenshtein_distance(&mut self, control: &String, experiment: &String) -> usize {
    // Thin wrapper: the edit distance itself is computed by the `levenshtein` crate.
    // https://docs.rs/levenshtein/1.0.3/levenshtein/fn.levenshtein.html
    let distance: usize = levenshtein::levenshtein(control.as_str(), experiment.as_str());

    distance
}

/// Calculates how similar two strings are, expressed as a percentage.
///
/// The score is `(1 - d / t) * 100`, where `d` is the Levenshtein distance between the
/// two strings and `t` is their combined length in characters. Identical strings
/// (including two empty strings) score `100.0`; the more edits are needed to turn one
/// string into the other, the lower the score.
///
/// # Arguments
///
/// * `control: &String` - The string to compare against. This would be the real data from the data sample.
/// * `experiment: &String` - The string to compare. This would be the generated data whose realism you want to measure.
///
/// # Example
///
/// ```
/// extern crate test_data_generation;
///
/// use test_data_generation::data_sample_parser::DataSampleParser;
///
/// fn main() {
///     let mut dsp = DataSampleParser::new();
///
///     assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
/// }
/// ```
pub fn realistic_test(&mut self, control: &String, experiment: &String) -> f64 {
    // Count characters rather than bytes: the `levenshtein` crate measures the distance
    // in `char`s, so a byte length would skew the ratio for non-ASCII text.
    let total = (control.chars().count() + experiment.chars().count()) as f64;

    // Two empty strings are identical; without this guard the division below would be
    // 0.0 / 0.0 and the function would return NaN.
    if total == 0.0 {
        return 100.0;
    }

    let distance = levenshtein::levenshtein(control, experiment) as f64;

    // Share of the combined length that did NOT need editing, as a percentage.
    (1.0 - distance / total) * 100.0
}

/// This function returns a boolean that indicates if the data sample parsing had issues
///
/// # Example
Expand Down Expand Up @@ -660,60 +718,4 @@ impl DataSampleParser {

Ok(true)
}

pub fn string_to_vector(&mut self, text: String) -> Vec<f64>{
    // Widen each UTF-8 byte of the text to an f64, keeping the original byte order.
    text.into_bytes()
        .into_iter()
        .map(|byte| byte as f64)
        .collect()
}

/// Experimental comparison of generated data against sample data.
///
/// Pads the shorter input with spaces until both have the same byte length, converts
/// both to `Vec<f64>` via `string_to_vector`, loads the values into two maps, passes
/// them to `similarity::cosine`, and prints the outcome to stdout.
///
/// NOTE(review): the computed value is only printed; the function unconditionally
/// returns `Ok(1.0)`, whatever the inputs are.
pub fn realistic_test(&mut self, generated_data: &'static str, sample_data: &'static str) -> Result<f64, Box<Error>> {
// Research links collected while choosing a similarity measure:
//https://docs.rs/GSL/0.4.31/rgsl/statistics/fn.correlation.html
//http://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
// pearson's chi square test
// cosine similarity - http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/

// Owned copies, so the inputs can be padded.
let mut str_gen = String::from(generated_data);
let mut str_smpl = String::from(sample_data);

// Pad whichever string is shorter (by byte length) with spaces until both are equal.
while str_gen.len() < str_smpl.len() {
str_gen.push(' ');
}

while str_smpl.len() < str_gen.len() {
str_smpl.push(' ');
}

// One f64 per byte of each padded string.
let gen_data = self.string_to_vector(str_gen);
let smpl_data = self.string_to_vector(str_smpl);

// NOTE(review): every insert uses the same key (the vector length), so each insert
// overwrites the previous one and the map ends up holding at most one entry (the last
// value). Presumably a per-element index was intended as the key -- confirm.
let mut gen_map: HashMap<usize, f64> = HashMap::new();
let gen_sz = gen_data.len();
for gd in gen_data {
gen_map.insert(gen_sz, gd);
}

// NOTE(review): same single-key pattern as for gen_map above.
let mut smpl_map: HashMap<usize, f64> = HashMap::new();
let smpl_sz = smpl_data.len();
for sd in smpl_data {
smpl_map.insert(smpl_sz, sd);
}


// The result is printed for inspection only; it does not influence the return value.
let cos = similarity::cosine(&gen_map, &smpl_map, gen_sz);
println!("cosine simularity {:?}", cos);
//let v = vec!(111 as f64, 101 as f64);
//let avg_gen_data = statistical::mean(&gen_data);

//println!("{}",avg_gen_data);
//let corr = statistical::correlation(gen_data, 1 as usize, sam_data, 1 as usize, sam_data.len());
//println!("the Correlation Coefficient is {}",avg_gen_data);

// Placeholder result: always reports 1.0.
Ok(1 as f64)
}
}
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ extern crate regex;
extern crate rand;
extern crate crossbeam;
extern crate csv;
extern crate oozie;
extern crate levenshtein;

#[macro_use]
pub mod macros;
Expand Down
19 changes: 9 additions & 10 deletions tests/data_sample_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,20 @@ mod tests {
}

#[test]
// ensure the DataSampleParser object can convert a string to a vector of numbers for each char
fn string_to_vector(){
// ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
fn levenshtein_test(){
let mut dsp = DataSampleParser::new();

assert_eq!(dsp.string_to_vector(String::from("hello")), [104 as f64, 101 as f64, 108 as f64, 108 as f64, 111 as f64]);
assert_eq!(dsp.levenshtein_distance(&"kitten".to_string(), &"sitting".to_string()), 3 as usize);
}

#[test]
// ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
fn realistic_data_test(){
let mut dsp = DataSampleParser::new();

#[test]
// ensure the DataSampleParser object can recognize the difference between realistic data and unrealistic generated data
fn realistic_data_test(){
let mut dsp = DataSampleParser::new();

assert_eq!(dsp.realistic_test("Hello", "Hello").unwrap(),1 as f64);
}
assert_eq!(dsp.realistic_test(&"kitten".to_string(), &"sitting".to_string()), 76.92307692307692 as f64);
}

#[test]
// demo test
Expand Down

0 comments on commit fe173ed

Please sign in to comment.