# <font color=red>Tutorial 8 - Hypothesis testing - Bootstrap</font>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Sri Lanka population height sample

We would like to estimate the mean height of Sri Lanka's population based on a sample provided in the attached CSV file, which we assume is representative of the entire population of Sri Lanka. We will use the bootstrap method to construct a 95% confidence interval for the mean height.

### Read CSV file

In [None]:
# Load the height sample from the CSV file into a DataFrame
sri_lanka_sample_df = pd.read_csv('Tutorial_8_data.csv')
# Preview the first rows to check that the file was parsed as expected
sri_lanka_sample_df.head()

In [None]:
# (number of rows, number of columns) of the sample; the row count is the sample size
sri_lanka_sample_df.shape

### Use the bootstrap to estimate the mean height of Sri Lanka's population

In [None]:
def bootstrap_mean(original_sample, column_name, num_replications, random_state=None):
    '''Return an array of bootstrapped sample means.

    Each replication draws, WITH replacement, a resample of the same size as
    the original sample and records the mean of the variable of interest.

    Parameters
    ----------
    original_sample : pandas.DataFrame
        df containing the original sample.
    column_name : str
        Name of the column containing the variable of interest.
    num_replications : int
        Number of bootstrap samples to draw.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or a ready-made ``RandomState``) for reproducible resampling.
        With the default ``None`` no generator is passed to ``.sample()``,
        exactly as in the unseeded version of this function.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``num_replications``; element ``i`` is the
        mean of the i-th bootstrap sample.
    '''
    original_sample_size = original_sample.shape[0]  # we need to replicate with the same sample size

    # Single brackets select a Series, so `.mean()` below yields a scalar.
    # Double brackets would give a one-column DataFrame whose `.mean()` is a
    # Series; storing that in a float slot relies on an implicit
    # array-to-scalar conversion that NumPy has deprecated.
    original_sample_var_of_interest = original_sample[column_name]

    # Build the generator ONCE and share it across replications: passing the
    # same integer seed to every `.sample()` call would reproduce the very
    # same resample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    bstrap_means = np.empty(num_replications)
    for i in range(num_replications):
        bootstrap_sample = original_sample_var_of_interest.sample(
            original_sample_size, replace=True, random_state=rng)  # note WITH REPLACEMENT!
        bstrap_means[i] = bootstrap_sample.mean()

    return bstrap_means

In [None]:
# run the bootstrap procedure
# Seed NumPy's global generator first: the resampling is random, and pandas'
# `.sample()` draws from NumPy's global state when it is given no random_state,
# so seeding here makes the numbers below reproducible on a fresh kernel run.
np.random.seed(42)
means_bootstrapped = bootstrap_mean(sri_lanka_sample_df, 'Height(cm)', 5000)

In [None]:
# visualize results: histogram of the bootstrapped means
fig, axes = plt.subplots()
# `sns.distplot` is deprecated; `sns.histplot` draws the same count histogram
# (no KDE curve by default)
sns_ax = sns.histplot(means_bootstrapped, bins=10, ax=axes)
sns_ax.set(xlabel='Height mean', ylabel='Number of simulations')
# Take the simulation count from the data so the title cannot go stale if the
# number of replications is changed; the trailing `;` hides the Text repr
fig.suptitle('Bootstrap means distribution ({} simulations)'.format(len(means_bootstrapped)));

In [None]:
# Mean height in the original sample, for comparison with the bootstrap distribution
sri_lanka_sample_df['Height(cm)'].mean()

In [None]:
# Smallest bootstrapped mean across all replications
means_bootstrapped.min()

In [None]:
# Largest bootstrapped mean across all replications
means_bootstrapped.max()

In [None]:
# 95% confidence interval for the mean height, based on our sample:
# the 2.5th and 97.5th percentiles of the bootstrapped means bound the middle 95%.
# `method=` replaces the `interpolation=` keyword, deprecated since NumPy 1.22;
# 'higher' returns an actual bootstrapped value rather than an interpolated one.
left_end = np.percentile(means_bootstrapped, 2.5, method='higher')
right_end = np.percentile(means_bootstrapped, 97.5, method='higher')
print('lower value: ', left_end)
print('upper value: ', right_end)

In [None]:
# visualize results: bootstrap distribution with the 95% confidence interval
fig, ax = plt.subplots()
# `sns.distplot` is deprecated; `sns.histplot` draws the count histogram
sns.histplot(means_bootstrapped, ax=ax)
# show line of values between 2.5 and 97.5 percentiles (thick bar along the x-axis)
ax.hlines(y=0, xmin=left_end, xmax=right_end, colors='orange', linestyles='solid', lw=8)
# label the axes so the figure stands alone; the trailing `;` hides the repr
ax.set(xlabel='Height mean', ylabel='Number of simulations');

## <font color=blue> **Exercise** </font>

Since Sri Lankan males are relatively short, a student claimed that there is no difference between the mean heights of males and females in Sri Lanka. Examine the student's claim.

1. Clearly state the two hypotheses

2. What is your test statistic? 

3. Use the bootstrap method to examine the student's claim