In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# generate two random binomial sample sets
pop1 = np.random.binomial(10, 0.2, 10000)
pop2 = np.random.binomial(10,0.5, 10000) 
# generate sample sets of both binomial populations
sample1 = np.random.choice(pop1, 100, replace=True)
sample2 = np.random.choice(pop2, 100, replace=True)
# calculate and print mean
print("The mean of sample1 is: {}".format(sample1.mean()))
print("The mean of sample2 is: {}".format(sample2.mean()))
# calculate and print stdev
print("The standard deviation of sample1 is: {}".format(sample1.std()))
print("The standard deviation of sample2 is: {}".format(sample2.std()))


The mean of sample1 is: 1.94
The mean of sample2 is: 5.11
The standard deviation of sample1 is: 1.2068139873236472
The standard deviation of sample2 is: 1.4621559424356898


In [None]:
"""PREDICTION: If I increase the sample size the mean and standard deviation will behave more like a normal distribution.
Conversely, if I decrease the sample size the standard deviation will increase."""

In [3]:
# generate sample set with n = 1000
sample1_a = np.random.choice(pop1, 1000, replace=True)
sample2_a = np.random.choice(pop2, 1000, replace=True)
# calculate and print mean
print("The mean of sample1_a is: {}".format(sample1_a.mean()))
print("The mean of sample2_a is: {}".format(sample2_a.mean()))
# calculate and print stdev
print("The standard deviation of sample1_a is: {}".format(sample1_a.std()))
print("The standard deviation of sample2_a is: {}".format(sample2_a.std()))
# generate sample set with n = 20
sample1_b = np.random.choice(pop1, 20, replace=True)
sample2_b = np.random.choice(pop2, 20, replace=True)
# calculate and print mean
print("The mean of sample1_b is: {}".format(sample1_b.mean()))
print("The mean of sample2_b is: {}".format(sample2_b.mean()))
# calculate and print stdev
print("The standard deviation of sample1_b is: {}".format(sample1_b.std()))
print("The standard deviation of sample2_b is: {}".format(sample2_b.std()))


The mean of sample1_a is: 1.935
The mean of sample2_a is: 4.983
The standard deviation of sample1_a is: 1.2077975823787694
The standard deviation of sample2_a is: 1.5945880345719392
The mean of sample1_b is: 1.85
The mean of sample2_b is: 4.9
The standard deviation of sample1_b is: 1.1947803145348521
The standard deviation of sample2_b is: 1.3


In [4]:
"""REFLECTION: Decreasing the number of measurements decreased both the mean and stdev. Increasing the number of 
measurements also decreased the mean and stdev, but not by as much as with the smaller number of measurements. This 
behavior did not match my intuition."""

'REFLECTION: Decreasing the number of measurements decreased both the mean and stdev. Increasing the number of \nmeasurements also decreased the mean and stdev, but not by as much as with the smaller number of measurements. This \nbehavior did not match my intuition.'

In [None]:
"""PREDICTION: Increasing the p-value of pop1 will increase the p-value leading to an increased probability that 
difference in t-value is due to chance."""

In [5]:
# generate two random binomial sample sets, pop1 p-value = 0.3
pop1 = np.random.binomial(10, 0.3, 10000)
pop2 = np.random.binomial(10,0.5, 10000) 
# generate sample sets of both binomial populations
sample1 = np.random.choice(pop1, 100, replace=True)
sample2 = np.random.choice(pop2, 100, replace=True)

from scipy.stats import ttest_ind
print(ttest_ind(sample2, sample1, equal_var=False))

# generate two random binomial sample sets, pop1 p-value = 0.4
pop1 = np.random.binomial(10, 0.4, 10000)
pop2 = np.random.binomial(10,0.5, 10000) 
# generate sample sets of both binomial populations
sample1 = np.random.choice(pop1, 100, replace=True)
sample2 = np.random.choice(pop2, 100, replace=True)

from scipy.stats import ttest_ind
print(ttest_ind(sample2, sample1, equal_var=False))

Ttest_indResult(statistic=9.0088019580939029, pvalue=1.8201435207210435e-16)
Ttest_indResult(statistic=3.6004220719707067, pvalue=0.00040270584398703532)


In [None]:
"""REFLECTION: My insticts were confirmed by the calculations showing an increase in the p-value of several 
degrees of magnitude."""

In [None]:
"""PREDICTION: For another non-normal distribution the p-value will be smaller for larger sample sets."""

In [7]:
# WEIBULL distribution test
# generate two random weibull sample sets, pop1 p-value = 0.3
pop1_w = np.random.weibull(10, 1000)
pop2_w = np.random.weibull(10, 1000) 
# generate sample sets of both binomial populations
sample1_w = np.random.choice(pop1, 100, replace=True)
sample2_w = np.random.choice(pop2, 100, replace=True)
# calculate and print mean
print("The mean of sample1_w is: {}".format(sample1_w.mean()))
print("The mean of sample2_w is: {}".format(sample2_w.mean()))
# calculate and print stdev
print("The standard deviation of sample1_w is: {}".format(sample1_w.std()))
print("The standard deviation of sample2_w is: {}".format(sample2_w.std()))
# ttest_ind
from scipy.stats import ttest_ind
print(ttest_ind(sample2, sample1, equal_var=False))

# generate sample sets of both binomial populations
# increase sample size to observe effect on ttest_ind
sample1_w = np.random.choice(pop1, 1000, replace=True)
sample2_w = np.random.choice(pop2, 1000, replace=True)
# calculate and print mean
print("The mean of sample1_w is: {}".format(sample1_w.mean()))
print("The mean of sample2_w is: {}".format(sample2_w.mean()))
# calculate and print stdev
print("The standard deviation of sample1_w is: {}".format(sample1_w.std()))
print("The standard deviation of sample2_w is: {}".format(sample2_w.std()))
# ttest_ind
from scipy.stats import ttest_ind
print(ttest_ind(sample2, sample1, equal_var=False))

The mean of sample1_w is: 3.99
The mean of sample2_w is: 5.24
The standard deviation of sample1_w is: 1.4730580436629102
The standard deviation of sample2_w is: 1.5173661390712527
Ttest_indResult(statistic=3.6004220719707067, pvalue=0.00040270584398703532)
The mean of sample1_w is: 4.022
The mean of sample2_w is: 5.014
The standard deviation of sample1_w is: 1.5236521912825116
The standard deviation of sample2_w is: 1.5471922957409012
Ttest_indResult(statistic=3.6004220719707067, pvalue=0.00040270584398703532)


In [None]:
"""REFLECTION: Neither the mean, nor the stdev of Weibull distribution sample set seem to be affected much by 
the size of the sample set. The ttest and pvalues are identical."""