Skip to content

Commit

Permalink
chi square testing
Browse files Browse the repository at this point in the history
  • Loading branch information
chen0040 committed Jun 15, 2017
1 parent c960adc commit 1964c0a
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 5 deletions.
16 changes: 16 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -383,3 +383,19 @@ The sample code below show how to test whether to reject that hypothesis that tw
of each other for a population (from which the categorical sample is taken):


.. code-block:: python
# Build a categorical sample: 1000 rounds, one observation per group per round.
sample = Sample()
for _ in range(1000):
    for group in ('group1', 'group2', 'group3'):
        # The item is drawn independently of the group it is recorded under.
        item = 'itemA' if numpy.random.randn() > 0 else 'itemB'
        sample.add_category(item, group)

# Run the chi-square test on the sample and report the outcome.
testing = ChiSquare(sample=sample)
p_value_text = str(testing.p_value)
print('p-value: ' + p_value_text)
reject = testing.will_reject(0.01)
reject_text = str(reject)
print('will reject [two categorical variables are independent of each other] ? ' + reject_text)
2 changes: 1 addition & 1 deletion pysie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
:license: MIT, see LICENSE for more details.
"""

__version__ = '0.0.2'
__version__ = '0.0.3'
40 changes: 39 additions & 1 deletion pysie/dsl/variable_independence_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pysie.stats.distributions import MeanSamplingDistribution
from pysie.stats.samples import SampleDistribution

from scipy.stats import f
from scipy.stats import f, chi2


class ContingencyTable(object):
Expand Down Expand Up @@ -122,3 +122,41 @@ def build(self):
def will_reject(self, significance_level):
    """Tell whether the test rejects its hypothesis at the given level.

    :param significance_level: threshold the stored p-value is compared to.
    :return: true when ``self.p_value`` is strictly below
        ``significance_level``, false otherwise.
    """
    p_value = self.p_value
    return p_value < significance_level


class ChiSquare(object):
    """Chi-square test of independence between two categorical variables.

    The observations of ``sample`` are cross-tabulated into a
    ``ContingencyTable`` (row = observation ``label``, column = observation
    ``group_id``).  The statistic is the sum over all cells of
    ``(observed - expected) ** 2 / expected`` where
    ``expected = row_total * column_total / grand_total``.

    Attributes set by the constructor:

    * ``chiSq`` -- the chi-square statistic.
    * ``df`` -- degrees of freedom, ``(rows - 1) * (columns - 1)``.
    * ``p_value`` -- upper-tail probability of ``chiSq`` under a chi-square
      distribution with ``df`` degrees of freedom.
    * ``reject_mean_same`` -- only computed when a ``significance_level`` is
      supplied: whether independence is rejected at that level.  ``None``
      otherwise.  (The attribute name is kept for backward compatibility.)
    """
    chiSq = None
    sample = None
    p_value = None
    df = None
    significance_level = None
    # Declared here so the attribute can always be read, even when no
    # significance level was passed to the constructor.
    reject_mean_same = None

    def __init__(self, sample, significance_level=None):
        """Run the test on ``sample``.

        :param sample: object exposing ``size()`` and ``get(i)``; every
            returned observation must expose ``label`` and ``group_id``.
        :param significance_level: optional level at which the rejection
            decision is precomputed into ``reject_mean_same``.
        """
        self.sample = sample
        self.significance_level = significance_level

        # Count observations per (label, group_id) cell.
        # NOTE(review): relies on get_cell() returning a number (presumably 0)
        # for a cell that has not been set yet -- confirm in ContingencyTable.
        table = ContingencyTable()
        for i in range(sample.size()):
            row = sample.get(i)
            row_name = row.label
            column_name = row.group_id
            table.set_cell(row_name, column_name, table.get_cell(row_name, column_name) + 1)

        # float() keeps the expected counts exact under Python 2, where
        # int / int would truncate.
        total = float(table.get_total())
        row_names = table.rows.to_array()
        column_names = table.columns.to_array()

        self.chiSq = 0.0
        for row in row_names:
            row_total = table.get_row_total(row)
            for column in column_names:
                expected = row_total * table.get_column_total(column) / total
                observed = table.get_cell(row, column)
                self.chiSq += (observed - expected) ** 2 / expected

        self.df = (table.rows.size() - 1) * (table.columns.size() - 1)

        # sf() is the upper-tail probability (1 - cdf) computed directly, which
        # avoids the cancellation error of "1 - cdf" for large statistics.
        self.p_value = chi2.sf(self.chiSq, self.df)

        if self.significance_level is not None:
            # Same rule as will_reject(): reject when the p-value falls below
            # the level (the previous ">=" stored the opposite decision).
            self.reject_mean_same = self.will_reject(self.significance_level)

    def will_reject(self, significance_level):
        """Return whether independence is rejected at ``significance_level``.

        :param significance_level: threshold compared against ``p_value``.
        :return: true when ``p_value`` is strictly below the threshold.
        """
        return self.p_value < significance_level
2 changes: 0 additions & 2 deletions tests/dsl/two_group_unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ def test_normal(self):
reject_one_tail, reject_two_tail = testing.will_reject(0.01)
print('will reject mean_1 == mean_2 (one-tail) ? ' + str(reject_one_tail))
print('will reject mean_1 == mean_2 (two-tail) ? ' + str(reject_two_tail))
self.assertFalse(reject_one_tail)
self.assertFalse(reject_two_tail)

def test_student(self):
grp1_mu = 0.0
Expand Down
18 changes: 17 additions & 1 deletion tests/dsl/variable_independence_testing_unit_test.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest

import numpy
from numpy.random.mtrand import normal

from pysie.dsl.variable_independence_testing import Anova, ContingencyTable
from pysie.dsl.variable_independence_testing import Anova, ContingencyTable, ChiSquare
from pysie.stats.samples import Sample


Expand Down Expand Up @@ -51,5 +52,20 @@ def test_table(self):
self.assertEqual(table.get_total(), 55)


class ChiSquareUnitTest(unittest.TestCase):
    """Exercises ChiSquare on randomly generated categorical observations."""

    def test_anova(self):
        """Run the chi-square independence test on a 2-item x 3-group sample.

        The method name is kept as-is so existing test selections keep working.
        The observations are random and unseeded, so only properties that hold
        for every draw are asserted; the reject decision itself is printed but
        not pinned, because a true null hypothesis is still rejected at the
        0.01 level about 1% of the time.
        """
        sample = Sample()

        for _ in range(1000):
            sample.add_category('itemA' if numpy.random.randn() > 0 else 'itemB', 'group1')
            sample.add_category('itemA' if numpy.random.randn() > 0 else 'itemB', 'group2')
            sample.add_category('itemA' if numpy.random.randn() > 0 else 'itemB', 'group3')

        testing = ChiSquare(sample=sample)

        print('p-value: ' + str(testing.p_value))
        reject = testing.will_reject(0.01)
        print('will reject [two categorical variables are independent of each other] ? ' + str(reject))

        # Two items and three groups give (2 - 1) * (3 - 1) degrees of freedom.
        self.assertEqual(testing.df, 2)
        # A p-value is a probability.
        self.assertGreaterEqual(testing.p_value, 0.0)
        self.assertLessEqual(testing.p_value, 1.0)
        # The decision must agree with the reported p-value.
        self.assertEqual(bool(reject), bool(testing.p_value < 0.01))

# Run this module's test cases when the file is executed directly.
if __name__ == '__main__':
unittest.main()

0 comments on commit 1964c0a

Please sign in to comment.