In [5]:
from IPython.display import HTML, display

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from scipy import stats

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import ast
import random
import math 
import time
import sys 

np.random.seed(6)

population_ages1 = stats.poisson.rvs(loc=18, mu=35, size=150000)
population_ages2 = stats.poisson.rvs(loc=18, mu=10, size=100000)
population_ages = np.concatenate((population_ages1, population_ages2))

minnesota_ages1 = stats.poisson.rvs(loc=18, mu=30, size=30)
minnesota_ages2 = stats.poisson.rvs(loc=18, mu=10, size=20)
minnesota_ages = np.concatenate((minnesota_ages1, minnesota_ages2))

print( population_ages.mean() )
print( minnesota_ages.mean() )

43.000112
39.26


In [6]:
#One-Sample T-Test

print(stats.ttest_1samp(a= minnesota_ages,               # Sample data
                 popmean= population_ages.mean())) #pop mean


print(stats.t.ppf(q=0.025,  # Quantile to check
            df=49))


print(stats.t.ppf(q=0.975,  # Quantile to check
            df=49))


print("Pvalue:", stats.t.cdf(x= -2.5742,    # T-test statistic
               df= 49) * 2)                 # Mupltiply by two for two tailed test*

sigma = minnesota_ages.std()/math.sqrt(50)  # Sample stdev/sample size

print("CI 95%:", stats.t.interval(0.95,                        # Confidence level
                 df = 49,                     # Degrees of freedom
                 loc = minnesota_ages.mean(), # Sample mean
                 scale= sigma))                # Standard dev estimate


print("CI 99%:",stats.t.interval(0.99,                        # Confidence level
                 df = 49,                     # Degrees of freedom
                 loc = minnesota_ages.mean(), # Sample mean
                 scale= sigma))                # Standard dev estimate

Ttest_1sampResult(statistic=-2.5742714883655027, pvalue=0.013118685425061678)
-2.0095752344892093
2.009575234489209
Pvalue: 0.013121066545690117
CI 95%: (36.369669080722176, 42.15033091927782)
CI 99%: (35.40547994092107, 43.11452005907893)


In [7]:
#Two-Sample T-Test

np.random.seed(12)
wisconsin_ages1 = stats.poisson.rvs(loc=18, mu=33, size=30)
wisconsin_ages2 = stats.poisson.rvs(loc=18, mu=13, size=20)
wisconsin_ages = np.concatenate((wisconsin_ages1, wisconsin_ages2))

print( wisconsin_ages.mean() )


print(stats.ttest_ind(a= minnesota_ages,
                b= wisconsin_ages,
                equal_var=False))    # Assume samples have equal variance?

42.8
Ttest_indResult(statistic=-1.7083870793286842, pvalue=0.09073104343957748)


In [8]:
#Paired t test  weight-loss drug 

np.random.seed(11)

before= stats.norm.rvs(scale=30, loc=250, size=100)

after = before + stats.norm.rvs(scale=5, loc=-0.25, size=100)

weight_df = pd.DataFrame({"weight_before":before,
                          "weight_after":after,
                          "weight_change":after-before})

print(weight_df.describe())             # Check a summary of the data

print(stats.ttest_rel(a = before,
                b = after))

       weight_before  weight_after  weight_change
count     100.000000    100.000000     100.000000
mean      250.345546    250.115171      -0.230375
std        28.132539     28.422183       4.783696
min       170.400443    166.913930     -10.495286
25%       230.421042    230.148236      -3.046211
50%       250.830805    252.134089      -0.413463
75%       270.637145    269.927258       2.738673
max       314.700233    317.720357      10.759282
Ttest_relResult(statistic=0.4815837177147337, pvalue=0.6311645282259187)
