In [1]:
from keyword import iskeyword
import numpy as np

def show_var(*names):
    """Display each named global variable as ``name = value``.

    Every entry of ``names`` is looked up in this module's ``globals()``;
    the value is passed through ``n(..., digits=3)`` and the resulting line
    is handed to ``show``. Both ``n`` and ``show`` are expected to exist in
    the notebook namespace (presumably the SageMath built-ins).

    Raises ``KeyError`` if a name is not a global variable.
    """
    namespace = globals()
    for var_name in names:
        approx = n(namespace[var_name], digits=3)
        show(f'{var_name} = {approx}')

def is_valid_var_name(name):
    """Return True when ``name`` can be used as a Python variable name.

    A usable name is a syntactically valid identifier that is not a
    reserved keyword.
    """
    if not name.isidentifier():
        return False
    return not iskeyword(name)


class Table(object):
    """Two-dimensional table of values with optional labels and categories.

    The cell values are stored in a NumPy array of dtype ``object``.
    Indexing (``table[key]``) and item assignment are forwarded to that
    array, while calling the table (``table(row_key, column_key)``) looks a
    cell up by its row and column category.
    """

    def __init__(self, data,
                 row_label=None, row_categories=None,
                 column_label=None, column_categories=None):
        """Store the labels/categories and convert ``data`` to an object array."""
        # Row-axis metadata.
        self.row_label = row_label
        self.row_categories = row_categories
        # Column-axis metadata.
        self.column_label = column_label
        self.column_categories = column_categories
        # dtype=object keeps every cell value as the object that was passed in.
        self._data = np.asarray(data, dtype=object)

    def __getitem__(self, key):
        """Return the NumPy indexing result for ``key``."""
        return self._data[key]

    def __setitem__(self, key, item):
        """Assign ``item`` to the position(s) selected by ``key``."""
        self._data[key] = item

    def __call__(self, row_key, column_key):
        """Return the cell addressed by a row category and a column category.

        Both category containers must provide ``.index``; a key that is not
        present makes that ``.index`` call raise.
        """
        selected_row = self._data[self.row_categories.index(row_key)]
        return selected_row[self.column_categories.index(column_key)]
        
        
def draw_table(data,
               row_label=None, row_categories=None,
               column_label=None, column_categories=None,
               name=None, frame=False, **kwargs):
    r"""Build the LaTeX source of an ``array`` environment showing ``data``.

    Parameters
    ----------
    data : Table or nested sequence
        The cell values, one inner sequence per row. When a ``Table`` is
        given, its labels and categories replace the corresponding
        arguments and its underlying array is rendered.
    row_label, column_label : str, optional
        Axis titles shown in the top-left corner cell, joined by
        ``\backslash`` when both are given.
    row_categories, column_categories : sequence, optional
        Per-row / per-column headings, rendered in bold.
    name : str, optional
        If given, a ``Table`` built from the data and labels is stored in
        this module's ``globals()`` under that name. Must be a valid,
        non-keyword identifier.
    frame : bool, optional
        Draw vertical rules between all columns and an ``\hline`` after
        every row.
    **kwargs
        Forwarded to ``e.str(...)`` for cells whose ``parent`` is ``RR``
        (both names are expected from the notebook namespace, presumably
        SageMath). ``skip_zeroes`` and ``truncate`` default to ``True``.

    Returns
    -------
    str
        The LaTeX markup, from ``\begin{array}`` to ``\end{array}``.

    Raises
    ------
    AssertionError
        If ``data`` has no rows, its first row is empty, or ``name`` is not
        a usable variable name.
    """
    if isinstance(data, Table):
        row_label = data.row_label
        row_categories = data.row_categories
        column_label = data.column_label
        column_categories = data.column_categories
        data = data[:]

    assert len(data) >= 1 and len(data[0]) >= 1
    assert name is None or is_valid_var_name(name)

    kwargs.setdefault('skip_zeroes', True)
    kwargs.setdefault('truncate', True)

    if name is not None:
        # Side effect: expose the table as a global so later cells can use it.
        globals()[name] = Table(data,
                                row_label=row_label,
                                row_categories=row_categories,
                                column_label=column_label,
                                column_categories=column_categories)

    # A leading heading column exists as soon as any label/category is given;
    # a header row exists when there is something to put into it.
    has_lead_column = any([row_label, column_label,
                           row_categories, column_categories])
    has_header = any([row_label, column_label, column_categories])

    def bold(text):
        # The space after \bf is required: '{\bfa}' would be an undefined macro.
        return r'{\bf ' + str(text) + '}'

    def render(e):
        # Real-field numbers honour the formatting kwargs; anything else
        # falls back to its plain string form.
        if parent(e) is RR:
            return e.str(**kwargs)
        return str(e)

    s = r'\begin{array}{'
    if frame:
        s += '|'
    if has_lead_column:
        s += 'r|'
    s += ('c|' if frame else 'c ') * len(data[0]) + '} '
    if frame:
        s += r'\hline '

    if has_header:
        # Only the labels that were actually supplied go into the corner cell.
        corner = r' \backslash '.join(
            str(label) for label in (row_label, column_label) if label)
        header_cells = [bold(corner) if corner else '']
        if column_categories:
            header_cells.extend(bold(c) for c in column_categories)
        s += ' & '.join(header_cells) + r' \\ \hline '

    for i, row in enumerate(data):
        cells = [render(e) for e in row]
        if has_lead_column:
            # Keep the data aligned with the 'r|' column even when there are
            # no row categories to print.
            lead = bold(row_categories[i]) if row_categories else ''
            cells.insert(0, lead)
        s += ' & '.join(cells) + r' \\ '
        if frame:
            s += r'\hline '

    s += r'\end{array}'
    return s

##### F21ETSMP Group Assignment 12
# Twin Study - Comparison of two data sets

In a study we test the lifespan of 10 pairs of twins where one has been smoking and the other has not.
### Table 1: 
{{draw_table([[100, 45], [84, 91], [82, 54], [70, 67], [88, 95], [62, 53], [91, 85], [35, 95], [75, 62], [81, 74]], row_label='no.', row_categories=range(1, 11), column_label='twin', column_categories=['non-smoker', 'smoker'], name='table1')}}  
In this assignment, you have to find out what test to perform to compare the two sample sets. You should answer the following questions:

## 1)  Estimate the mean and the variance of the population samples.

The empirical mean and variance can be estimated using the following equations for $n$ observations $\{x_1, \dots, x_n\}$:  
$$ \hat \mu = \bar x = \frac 1 n \sum_{i=1}^n x_i$$  
$$ \hat \sigma^2  = \frac 1 {n-1} \sum_{i=1}^n {{(x_i-\hat\mu)}^2}$$  

In [2]:
def emp_mean_and_var(data):
    """Return the sample mean and the unbiased sample variance of ``data``.

    Parameters
    ----------
    data : sequence of numbers
        The observations. It must support ``len`` and iteration.

    Returns
    -------
    tuple
        ``(mean, var)`` where ``mean = sum(data) / n`` and
        ``var = sum((x - mean)**2) / (n - 1)``.

    Notes
    -----
    At least two observations are needed; with fewer, one of the divisions
    is a division by zero.
    """
    n = len(data)
    mean = sum(data) / n
    # Use ** rather than ^: ^ only means "power" under the Sage preparser and
    # is bitwise XOR in plain Python, whereas ** is a power in both.
    var = sum((e - mean) ** 2 for e in data) / (n - 1)
    return mean, var

# Column 0 of table1 holds the non-smokers, column 1 the smokers.
non_smoker_mean, non_smoker_var = emp_mean_and_var(table1[:, 0])
smoker_mean, smoker_var = emp_mean_and_var(table1[:, 1])
show_var('non_smoker_mean', 'non_smoker_var', 'smoker_mean', 'smoker_var')

## 2) Find the mean of the difference of the population samples

Using the same method as above, the mean and variance of the difference ($\delta$) are calculated:

In [3]:
# Per-pair difference: smoker lifespan minus non-smoker lifespan.
diff_mean, diff_var = emp_mean_and_var(
    [smoker - non_smoker for non_smoker, smoker in table1[:]]
)
show_var('diff_mean', 'diff_var')

## 3) Formulate the null hypothesis to test whether the two samples come from populations with the same mean

To test whether the two populations have the same mean, when looking at the difference $d_i = x_{2i} - x_{1i}$, the null hypothesis is that the difference $\delta$ is zero:  
$\text{H0: } \delta = 0$ 

## 4) Formulate the alternative hypothesis to the NULL hypothesis

This leads to the alternative hypothesis that $\delta$ is different from zero:  
$\text{H1: } \delta \ne 0$ 

## 5) What statistical test would you perform to test the hypothesis?

Since the observations are paired (each pair of twins yields one difference $d_i$) and the true variance of the population is unknown, a paired t-test is needed to test the hypothesis.

## 6)  Calculate the test-statistics

Here the following formulas are used:  
$$ t = \frac {\hat\delta - \delta_0} {\hat\sigma \mathbin/ \sqrt n} \sim \mathcal t(n-1)$$  
$$ \text{pval} = 2 \left(1 - \mathcal{t}_{\text{cdf}}\left(|t|, n-1\right) \right)$$

In [12]:
import scipy.stats
# Number of rows in table1, i.e. the number of paired differences.
n_ = len(table1[:])
# Test statistic for H0: delta = 0 -- the mean difference divided by its
# standard error sqrt(diff_var)/sqrt(n_).
t = diff_mean/(sqrt(diff_var)/sqrt(n_))
# Two-sided p-value from the Student t distribution with n_ - 1 degrees of
# freedom. The float()/int() casts are presumably there to hand SciPy plain
# Python numbers rather than Sage types -- TODO confirm.
pval = 2*(1-scipy.stats.t.cdf(float(abs(t)), int(n_-1)))
show_var('t', 'pval')