In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

!mkdir -p '/content/gdrive/My Drive/colab-materials-data8-notebooks/'
!git clone https://github.com/data-8/materials-sp22-colab '/content/gdrive/My Drive/colab-materials-data8-notebooks/materials-sp22-colab/'

%cd /content/gdrive/MyDrive/colab-materials-data8-notebooks/materials-sp22-colab/lectures/
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

## The Broste Thesis

In [2]:
# Counts for every (age bracket, condition) pair: five 'Diet' rows followed
# by the five matching 'Control' rows.
summary = Table(['Age', 'Condition', 'Total', 'Deaths', 'CHD Deaths']).with_rows([
    ('0-34',  'Diet',    1367,   3,  0),
    ('35-44', 'Diet',     728,   3,  0),
    ('45-54', 'Diet',     767,  14,  4),
    ('55-64', 'Diet',     870,  35,  7),
    ('65+',   'Diet',     953, 190, 42),
    ('0-34',  'Control', 1337,   7,  1),
    ('35-44', 'Control',  731,   4,  1),
    ('45-54', 'Control',  816,  16,  4),
    ('55-64', 'Control',  896,  33, 12),
    ('65+',   'Control',  958, 162, 34),
])
summary

Age,Condition,Total,Deaths,CHD Deaths
0-34,Diet,1367,3,0
35-44,Diet,728,3,0
45-54,Diet,767,14,4
55-64,Diet,870,35,7
65+,Diet,953,190,42
0-34,Control,1337,7,1
35-44,Control,731,4,1
45-54,Control,816,16,4
55-64,Control,896,33,12
65+,Control,958,162,34


In [3]:
# Comparing an array with a number compares every element and yields an array
# of booleans: here only the first 3 of the 12 entries are True.
np.arange(12) < 3

array([ True,  True,  True, False, False, False, False, False, False,
       False, False, False])

In [4]:
# Expand the per-group counts in `summary` into one row per participant.
subjects = Table(['Age', 'Condition', 'Participated', 'Died'])
for group in summary.rows:
    group_size = group.item('Total')
    # Positions 0 .. Deaths-1 compare True, so exactly `Deaths` rows of the
    # group are marked as died.
    died = np.arange(group_size) < group.item('Deaths')
    members = Table().with_columns(
        'Died', died,
        # The scalar values below are repeated for every row of the group.
        'Age', group.item('Age'),
        'Condition', group.item('Condition'),
        'Participated', True,
    )
    subjects.append(members)
subjects

Age,Condition,Participated,Died
0-34,Diet,True,True
0-34,Diet,True,True
0-34,Diet,True,True
0-34,Diet,True,False
0-34,Diet,True,False
0-34,Diet,True,False
0-34,Diet,True,False
0-34,Diet,True,False
0-34,Diet,True,False
0-34,Diet,True,False


In [6]:
# Sanity check of the expansion: summing the boolean columns within each
# (Age, Condition) group counts their True values, which should reproduce the
# 'Total' and 'Deaths' counts listed in `summary`.
subjects.group(['Age', 'Condition'], sum)

Age,Condition,Participated sum,Died sum
0-34,Control,1337,7
0-34,Diet,1367,3
35-44,Control,731,4
35-44,Diet,728,3
45-54,Control,816,16
45-54,Diet,767,14
55-64,Control,896,33
55-64,Diet,870,35
65+,Control,958,162
65+,Diet,953,190


In [7]:
def hazard_rate(counts):
    """Return the fraction of participants who died.

    Args:
        counts: a row exposing ``item(label)`` with the labels 'Died sum'
            and 'Participated sum'.

    Returns:
        'Died sum' divided by 'Participated sum'.
    """
    deaths = counts.item('Died sum')
    participants = counts.item('Participated sum')
    return deaths / participants

def rate_difference(t):
    """Return the absolute difference in hazard rate between the two conditions.

    Args:
        t: table of subjects with 'Age', 'Condition', 'Participated' and
            'Died' columns.

    Returns:
        The absolute difference between the hazard rates of the first two
        rows obtained after grouping by 'Condition'.
    """
    # 'Age' is dropped so that only the boolean columns are summed per condition.
    by_condition = t.drop('Age').group('Condition', sum)
    second_rate = hazard_rate(by_condition.row(1))
    first_rate = hazard_rate(by_condition.row(0))
    return abs(second_rate - first_rate)

# Absolute difference in death rate between the two conditions, all ages pooled.
rate_difference(subjects)

0.005439343927004493

In [8]:
# Rate difference among the subjects in the '0-34' age bracket only.
rate_difference(subjects.where('Age', are.equal_to('0-34')))

0.0030410154080667343

In [9]:
# Rate difference among the subjects in the '65+' age bracket only.
rate_difference(subjects.where('Age', are.equal_to('65+')))

0.030268112783058437

In [10]:
def test(t, repetitions=200):
    """Run a permutation test on the hazard-rate difference and print the result.

    The 'Died' outcomes of `t` are shuffled across its rows, which simulates
    the null hypothesis that the outcome is unrelated to 'Condition'. The
    statistic `rate_difference` is recomputed for every shuffle, and the
    empirical P-value is the fraction of simulated statistics that are at
    least as large as the observed one.

    Args:
        t: table of subjects with 'Age', 'Condition', 'Participated' and
            'Died' columns.
        repetitions: number of shuffles to simulate (default 200).

    Returns:
        None. The observed statistic and the P-value are printed.

    Raises:
        ValueError: if `repetitions` is less than 1.
    """
    if repetitions < 1:
        raise ValueError('repetitions must be at least 1')

    observed = rate_difference(t)
    # Loop-invariant: only the row order of this one-column table changes.
    outcomes = t.select('Died')

    # Preallocate instead of growing the array with np.append on every step.
    stats = np.zeros(repetitions)
    for i in range(repetitions):
        # Shuffle WITHOUT replacement so that every simulation keeps the same
        # total number of deaths; a bare sample() draws with replacement by
        # default, which resamples the outcomes instead of permuting them.
        shuffled = outcomes.sample(with_replacement=False).column('Died')
        simulated_outcomes = t.with_column('Died', shuffled)
        stats[i] = rate_difference(simulated_outcomes)

    # Find the empirical P-value:
    p = np.count_nonzero(stats >= observed) / repetitions

    print('Observed absolute difference in hazard rates:', observed)
    print('P-value:', p)

#test(subjects)

In [11]:
# Repeat the test separately within each age bracket; grouping by 'Age'
# yields one row per distinct bracket.
for age in subjects.group('Age').column('Age'):
    print('Ages', age)
    test(subjects.where('Age', are.equal_to(age)))

Ages 0-34
Observed absolute difference in hazard rates: 0.0030410154080667343
P-value: 0.165
Ages 35-44
Observed absolute difference in hazard rates: 0.0013510771034710841
P-value: 0.78
Ages 45-54
Observed absolute difference in hazard rates: 0.0013549096300841078
P-value: 0.84
Ages 55-64
Observed absolute difference in hazard rates: 0.00339952791461412
P-value: 0.67
Ages 65+
Observed absolute difference in hazard rates: 0.030268112783058437
P-value: 0.115
