# Class 4 - Inference with Continous Variables

In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import numpy as np
from scipy import stats
import random

### Standard Error & 95% Confidence Interval
The Graduate Record Examination (GRE) this year had a mean of 151 and standard deviation of 7 for the verbal reasoning section (10,000 students writing). If we take a subset of students and look at the mean, how close will we be to 151?

In [2]:
# Simulate the test data (population)
test_data = np.random.normal(loc=151, scale=7, size=10000)
test_data = np.round(test_data)

In [3]:
running_mean = []
for i in range(1, 200):
    running_mean.append(test_data[:i].mean())

In [4]:
trace = go.Scatter(
    x = np.arange(1, 200),
    y = running_mean
)

layout = go.Layout(
    xaxis=dict(
        title='Size of Population'
    ),
    yaxis=dict(
        title='Mean'
    )
)

fig = go.Figure(data = [trace], layout = layout)
iplot(fig)

What happens when we change the number of students in our sample? In this example, we're taking the means of samples of 10, 100, and 1000 students. We'll take each of these means 1000 times.

In [5]:
means_of_10 = []
means_of_100 = []
means_of_1000 = []

# test_data = np.random.uniform(low=140, high=160, size=10000).astype(int)

for i in range(1000):
    means_of_10.append(np.mean(np.random.choice(test_data, size=10, replace=False)))
    means_of_100.append(np.mean(np.random.choice(test_data, size=100, replace=False)))
    means_of_1000.append(np.mean(np.random.choice(test_data, size=1000, replace=False)))


In [6]:
trace1 = go.Histogram(
    x=means_of_10,
    opacity=0.75,
    histnorm="probability",
    name="10"
)

trace2 = go.Histogram(
    x=means_of_100,
    opacity=0.75,
    histnorm="probability",
    name="100"
)

trace3 = go.Histogram(
    x=means_of_1000,
    opacity=0.75,
    histnorm="probability",
    name="1000"
)

layout = go.Layout(
    xaxis=dict(title="Mean"),
    yaxis=dict(title="Probability")
)

data = [trace1, trace2, trace3]
fig = go.Figure(data = data, layout = layout)
iplot(fig)

The standard deviation of the sample mean tells us how far the typical estimate is away from the actual population mean. It also describes the typical error of the point estimate, and for this reason we usually call this standard deviation the standard error (SE) of the estimate.

In [7]:
print(np.std(means_of_10))
print(np.std(means_of_100))
print(np.std(means_of_1000))

2.2547538313527715
0.7149548818631847
0.21295119721663913


Standard error can be approximated as the population standard deviation / square root of the sample size

In [8]:
se_10 = 7 / np.sqrt(10)
print(se_10)

se_100 = 7 / np.sqrt(100)
print(se_100)

se_1000 = 7 / np.sqrt(1000)
print(se_1000)

2.2135943621178655
0.7
0.22135943621178655


Often the standard deviation of the population is unknown. In that case, if our sample is sufficiently large (>30), we can use the standard deviation of the sample.

In [9]:
se_100_sampleSD = np.std(np.random.choice(test_data, size=100, replace=False), ddof=1) / np.sqrt(100)
print(se_100_sampleSD)

0.6727562783809676


Since this is normally distributed, 95% of the plot is covered by approximately two standard deviations (1.96 x SE). Therefore, we can say that 95% of the sample means will land within a given range.

Example: Thirty students are graduates of Gabe's GRE training course, their marks are displayed below. Is the course effective?

In [10]:
student_marks = [170, 152, 155, 156, 142, 158, 148, 144, 153, 155, 
                 161, 163, 153, 166, 155, 158, 161, 155, 146, 142,
                 165, 145, 153, 155, 158, 169, 178, 155, 166, 178]

se = np.std(student_marks, ddof=1) / np.sqrt(student_marks.__len__())
print("The standard error is %.4f" % se)

ci = se * 1.96 
print("Our value with confidence interval is %.4f +/- %.4f" % (np.mean(student_marks), ci))

The standard error is 1.7134
Our value with confidence interval is 157.1667 +/- 3.3583


Since the population mean (151) is beyond our confidence interval, we reject the null hypothesis that the class was not effective.

### Understanding the Z-score
What if we don't have a sample distribution, but instead only a single value?

Sophia who took the GRE scored 160 on the Verbal Reasoning section and 157 on the Quantitative Reasoning section. The mean score for Verbal Reasoning section for all test takers was 151 with a standard deviation of 7, and the mean score for the Quantitative Reasoning was 153 with a standard deviation of 7.67. Suppose that both distributions are nearly normal.

In [11]:
# What is Sophia's z-score on the verbal reasoning section?
z_score_v = (160 - 151) / 7
print("Z-score on verbal reasoning is %.4f" % z_score_v)

# What is Sophia's z-score on the quantitative reasoning section?
z_score_q = (157 - 153) / 7.67
print("Z-score on quantitative reasoning is %.4f" % z_score_q)

Z-score on verbal reasoning is 1.2857
Z-score on quantitative reasoning is 0.5215


In [12]:
data = [go.Histogram(x=test_data, histnorm='probability')]

layout = go.Layout(
    shapes = [
        {
            'type': 'line',
            'x0': 160,
            'y0': 0,
            'x1': 160,
            'y1': 0.06,
            'line': {
                'color': 'rgb(155, 0, 0)',
                'width': 3,
            },
        }
    ],
    xaxis=dict(title="Score"),
    yaxis=dict(title="Probability")
)

fig = go.Figure(data = data, layout = layout)
iplot(fig)

In [13]:
# What is the corresponding percentile for these scores?
print("Percentile for verbal reasoning score is %.4f" % stats.norm.cdf(z_score_v))
print("Percentile for quantitative reasoning score is %.4f" % stats.norm.cdf(z_score_q))

Percentile for verbal reasoning score is 0.9007
Percentile for quantitative reasoning score is 0.6990


### Why do we use 0.05 as a threshold for significance?

www.openintro.org/why05

### So what do we do when we don't have 30 samples?
Let's say the class only had 10 students. Now, we will not have a great estimation of the population standard deviation. Instead we use the t-distribution in place of 'z'. 

In [14]:
student_marks_small = [170, 152, 155, 156, 142, 158, 148, 144, 153, 155]
n = student_marks_small.__len__()
degrees_of_freedom = n - 1

sample_mean = np.mean(student_marks_small)
sample_sd = np.std(student_marks_small, ddof=1)
sample_se = sample_sd / (np.sqrt(n))

small_ci = stats.t.ppf(1-0.05, degrees_of_freedom) * sample_se

print("Our value with confidence interval is %.4f +/- %.4f" % (sample_mean, small_ci))

Our value with confidence interval is 153.3000 +/- 4.5648


In this case, we do not have sufficient evidence to reject the null hypothesis. Another way to approach this problem would be to calcualte the t-statistic ourself and look up the associated p-value.

In [15]:
t_statistic = (151 - sample_mean) / (sample_sd / np.sqrt(n))
print("t-statistic is %.4f" % t_statistic)
p_val = stats.t.sf(np.abs(t_statistic), n-1)
print("p-value is %.4f" % p_val)

t-statistic is -0.9236
p-value is 0.1899


### The t-test statistic with two means (paired)
Let's say I allowed that set of students to re-take the test following a second round of my training course. So now we have two sets of values for the same group. Did the second course improve their marks?

In [16]:
original_marks = [170, 152, 155, 156, 142, 158, 148, 144, 153, 155]
revised_marks = [174, 155, 179, 155, 145, 160, 155, 150, 155, 165]

In this case, it might be best to look at the changes for each student as opposed to the overall marks. We'll create a third array that just shows the difference.

In [17]:
mark_diff = []
n = original_marks.__len__()
for i in range(n):
    mark_diff.append(revised_marks[i] - original_marks[i])
diff_mean = np.mean(mark_diff)
diff_sd = np.std(mark_diff, ddof=1)

So, in this case we can again apply our t-statistic test. In this case we're looking at distance from 0

In [18]:
t_statistic = (0 - diff_mean) / (diff_sd / np.sqrt(n))
print("t-statistic is %.4f" % t_statistic)
p_val = stats.t.sf(np.abs(t_statistic), n-1)
print("p-value is %.4f" % p_val)

t-statistic is -2.7014
p-value is 0.0122


In this case the t-statistic is small enough that we can reject the null hypothesis

### The t-test statistic with two means (independent)
The above example worked because we had groups that were paired (same individuals, different measurements). What if the data were unpaired? In this case we can not calculate the t-statistic the same way. In the example below we compare the values of two groups of students.

In [19]:
group_1_marks = [170, 152, 155, 156, 142, 158, 148, 144, 153, 155]
group_2_marks = [171, 159, 164, 175, 155, 181, 154, 158, 160, 170]

n1 = group_1_marks.__len__()
n2 = group_2_marks.__len__()

group_1_mean = np.mean(group_1_marks)
group_2_mean = np.mean(group_2_marks)

group_1_sd = np.std(group_1_marks, ddof=1)
group_2_sd = np.std(group_2_marks, ddof=1)

se = np.sqrt((group_1_sd**2)/(n1) + (group_2_sd**2)/(n2))

In [20]:
t_statistic = (group_1_mean - group_2_mean) / se
print("t-statistic is %.4f" % t_statistic)
p_val = stats.t.sf(np.abs(t_statistic), n-1)
print("p-value is %.4f" % p_val)

t-statistic is -2.9924
p-value is 0.0076


Or we could just use the python function

In [21]:
# Or we could just use the python function
stats.ttest_ind(group_1_marks, group_2_marks, equal_var=False)

Ttest_indResult(statistic=-2.9924111641121973, pvalue=0.007946445505460767)