From 4ceb7da87a81390f3c714cd7305e15cab890d0b1 Mon Sep 17 00:00:00 2001 From: xschen Date: Mon, 5 Jun 2017 17:34:41 +0800 Subject: [PATCH] two group proportions unit testing --- pysie/dsl/one_group.py | 4 +- pysie/dsl/two_groups.py | 44 +++++++--- tests/dsl/two_group_unit_test.py | 138 +++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 13 deletions(-) create mode 100644 tests/dsl/two_group_unit_test.py diff --git a/pysie/dsl/one_group.py b/pysie/dsl/one_group.py index a77a472..f46407f 100644 --- a/pysie/dsl/one_group.py +++ b/pysie/dsl/one_group.py @@ -75,8 +75,8 @@ def __init__(self, sampling_distribution, p_null, significance_level=None): else: simulated_proportions = self.simulate() - self.p_value_one_tail = sum(x for x in simulated_proportions if x > sampling_distribution.point_estimate) / 1000.0 - self.p_value_two_tail = self.p_value_one_tail * 2 + self.p_value_one_tail = sum(1.0 for x in simulated_proportions if x > sampling_distribution.point_estimate) / 1000.0 + self.p_value_two_tail = self.p_value_one_tail if significance_level is not None: self.reject_mean_null = (self.p_value_one_tail < significance_level, diff --git a/pysie/dsl/two_groups.py b/pysie/dsl/two_groups.py index 30ef614..b671320 100644 --- a/pysie/dsl/two_groups.py +++ b/pysie/dsl/two_groups.py @@ -1,3 +1,5 @@ +import random + from pysie.stats.distributions import DistributionFamily from scipy.stats import norm, t import math @@ -37,12 +39,16 @@ def __init__(self, sampling_distribution, significance_level=None): self.reject_mean_same = (self.p_value_one_tail < significance_level, self.p_value_two_tail < significance_level) + def will_reject(self, significance_level): + + return self.p_value_one_tail < significance_level, self.p_value_two_tail < significance_level + class ProportionDiffTesting(object): sampling_distribution = None p_value_one_tail = None p_value_two_tail = None - mean_null = None + p_null = None test_statistic = None significance_level = None reject_proportion_same = None @@ -55,7 +61,7 @@ def __init__(self, sampling_distribution, significance_level=None): self.significance_level = significance_level if self.sampling_distribution.distribution_family == DistributionFamily.normal: - standard_error_null = math.sqrt(p_null * (1 - p_null) / sampling_distribution.sample_size) + standard_error_null = math.sqrt(p_null * (1 - p_null) / sampling_distribution.grp1_sample_size + p_null * (1-p_null) / sampling_distribution.grp2_sample_size) Z = sampling_distribution.point_estimate / standard_error_null self.test_statistic = Z pf = norm.cdf(Z) @@ -64,15 +70,31 @@ def __init__(self, sampling_distribution, significance_level=None): self.p_value_one_tail = 1 - pf self.p_value_two_tail = self.p_value_one_tail * 2 else: - standard_error_null = math.sqrt(p_null * (1 - p_null) / sampling_distribution.sample_size) - td_df = sampling_distribution.point_estimate / standard_error_null - self.test_statistic = td_df - pf = t.cdf(td_df, sampling_distribution.df) - if td_df < 0: - pf = 1 - pf - self.p_value_one_tail = 1 - pf - self.p_value_two_tail = self.p_value_one_tail * 2 + simulated_proportions = self.simulate() + diff = sampling_distribution.grp1_point_estimate - sampling_distribution.grp2_point_estimate + pf = sum(1.0 for x in simulated_proportions if x > diff) / 1000.0 + self.p_value_one_tail = pf + self.p_value_two_tail = sum(1.0 for x in simulated_proportions if x > diff or x < -diff) / 1000.0 if significance_level is not None: self.reject_proportion_same = (self.p_value_one_tail < significance_level, - self.p_value_two_tail < significance_level) \ No newline at end of file + self.p_value_two_tail < significance_level) + + def simulate(self): + simulated_proportions = [0] * 1000 + for i in range(1000): + count1 = 0 + for trials in range(self.sampling_distribution.grp1_sample_size): + if random.random() <= self.p_null: + count1 += 1 + count2 = 0 + for trials in range(self.sampling_distribution.grp2_sample_size): + if random.random() <= self.p_null: + count2 += 1 + + simulated_proportions[i] = float(count1) / self.sampling_distribution.grp1_sample_size - float(count2) / self.sampling_distribution.grp2_sample_size + return sorted(simulated_proportions) + + def will_reject(self, significance_level): + + return self.p_value_one_tail < significance_level, self.p_value_two_tail < significance_level \ No newline at end of file diff --git a/tests/dsl/two_group_unit_test.py b/tests/dsl/two_group_unit_test.py new file mode 100644 index 0000000..a980599 --- /dev/null +++ b/tests/dsl/two_group_unit_test.py @@ -0,0 +1,138 @@ +import unittest +from random import random + +from numpy.random.mtrand import normal + +from pysie.dsl.two_groups import MeanDiffTesting, ProportionDiffTesting +from pysie.stats.distributions import MeanDiffSamplingDistribution, DistributionFamily, \ + ProportionDiffSamplingDistribution +from pysie.stats.samples import Sample, SampleDistribution + + +class MeanDiffTestingUnitTest(unittest.TestCase): + + def test_normal(self): + grp1_mu = 0.0 + grp1_sigma = 1.0 + grp1_sample_size = 31 + grp1_sample = Sample() + + grp2_mu = 0.09 + grp2_sigma = 2.0 + grp2_sample_size = 36 + grp2_sample = Sample() + + for i in range(grp1_sample_size): + grp1_sample.add_numeric(normal(grp1_mu, grp1_sigma)) + + for i in range(grp2_sample_size): + grp2_sample.add_numeric(normal(grp2_mu, grp2_sigma)) + + sampling_distribution = MeanDiffSamplingDistribution(grp1_sample_distribution=SampleDistribution(grp1_sample), + grp2_sample_distribution=SampleDistribution(grp2_sample)) + self.assertEqual(sampling_distribution.distribution_family, DistributionFamily.normal) + testing = MeanDiffTesting(sampling_distribution=sampling_distribution) + print('one tail p-value: ' + str(testing.p_value_one_tail)) + print('two tail p-value: ' + str(testing.p_value_two_tail)) + reject_one_tail, reject_two_tail = testing.will_reject(0.01) + print('will reject mean_1 == mean_2 (one-tail) ? ' + str(reject_one_tail)) + print('will reject mean_1 == mean_2 (two-tail) ? ' + str(reject_two_tail)) + self.assertFalse(reject_one_tail) + self.assertFalse(reject_two_tail) + + def test_student(self): + grp1_mu = 0.0 + grp1_sigma = 1.0 + grp1_sample_size = 29 + grp1_sample = Sample() + + grp2_mu = 0.09 + grp2_sigma = 2.0 + grp2_sample_size = 28 + grp2_sample = Sample() + + for i in range(grp1_sample_size): + grp1_sample.add_numeric(normal(grp1_mu, grp1_sigma)) + + for i in range(grp2_sample_size): + grp2_sample.add_numeric(normal(grp2_mu, grp2_sigma)) + + sampling_distribution = MeanDiffSamplingDistribution(grp1_sample_distribution=SampleDistribution(grp1_sample), + grp2_sample_distribution=SampleDistribution(grp2_sample)) + self.assertEqual(sampling_distribution.distribution_family, DistributionFamily.student_t) + testing = MeanDiffTesting(sampling_distribution=sampling_distribution) + print('one tail p-value: ' + str(testing.p_value_one_tail)) + print('two tail p-value: ' + str(testing.p_value_two_tail)) + reject_one_tail, reject_two_tail = testing.will_reject(0.01) + print('will reject mean_1 == mean_2 (one-tail) ? ' + str(reject_one_tail)) + print('will reject mean_1 == mean_2 (two-tail) ? ' + str(reject_two_tail)) + self.assertFalse(reject_one_tail) + self.assertFalse(reject_two_tail) + + +class ProportionDiffTestingUnitTest(unittest.TestCase): + + def test_normal(self): + grp1_sample = Sample() + grp2_sample = Sample() + + for i in range(100): + if random() <= 0.6: + grp1_sample.add_category("OK") + else: + grp1_sample.add_category("CANCEL") + + for i in range(100): + if random() <= 0.61: + grp2_sample.add_category("OK") + else: + grp2_sample.add_category("CANCEL") + + sampling_distribution = ProportionDiffSamplingDistribution(grp1_sample_distribution=SampleDistribution( + grp1_sample, categorical_value="OK"), + grp2_sample_distribution=SampleDistribution( + grp2_sample, categorical_value="OK")) + self.assertEqual(sampling_distribution.distribution_family, DistributionFamily.normal) + + testing = ProportionDiffTesting(sampling_distribution=sampling_distribution) + print('one tail p-value: ' + str(testing.p_value_one_tail)) + print('two tail p-value: ' + str(testing.p_value_two_tail)) + reject_one_tail, reject_two_tail = testing.will_reject(0.01) + print('will reject p_1 == p_2 (one-tail) ? ' + str(reject_one_tail)) + print('will reject p_1 == p_2 (two-tail) ? ' + str(reject_two_tail)) + self.assertFalse(reject_one_tail) + self.assertFalse(reject_two_tail) + + def test_student(self): + grp1_sample = Sample() + grp2_sample = Sample() + + for i in range(20): + if random() <= 0.6: + grp1_sample.add_category("OK") + else: + grp1_sample.add_category("CANCEL") + + for i in range(20): + if random() <= 0.61: + grp2_sample.add_category("OK") + else: + grp2_sample.add_category("CANCEL") + + sampling_distribution = ProportionDiffSamplingDistribution(grp1_sample_distribution=SampleDistribution( + grp1_sample, categorical_value="OK"), + grp2_sample_distribution=SampleDistribution( + grp2_sample, categorical_value="OK")) + self.assertEqual(sampling_distribution.distribution_family, DistributionFamily.simulation) + + testing = ProportionDiffTesting(sampling_distribution=sampling_distribution) + print('one tail p-value: ' + str(testing.p_value_one_tail)) + print('two tail p-value: ' + str(testing.p_value_two_tail)) + reject_one_tail, reject_two_tail = testing.will_reject(0.01) + print('will reject p_1 == p_2 (one-tail) ? ' + str(reject_one_tail)) + print('will reject p_1 == p_2 (two-tail) ? ' + str(reject_two_tail)) + self.assertFalse(reject_one_tail) + self.assertFalse(reject_two_tail) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file