# 3.7.4

## Data

In [18]:
import random
from scipy.stats import norm

In [19]:
def generate_normal_data(seed=42196, size=10000, mean=5000, std_dev=1000):
    """Generate a reproducible list of integer samples shaped like a normal curve.

    The module-level ``random`` generator is re-seeded with ``seed`` on every
    call, so the same arguments always yield the same list (and the global
    generator state is reset as a side effect).

    Args:
        seed: Value passed to ``random.seed``.
        size: Number of samples to produce.
        mean: Centre of the target distribution.
        std_dev: Spread of the target distribution.

    Returns:
        A list of ``size`` ints.
    """
    random.seed(seed)
    samples = []
    for _ in range(size):
        # Inverse-transform sampling: push a uniform draw through the
        # standard normal's percent-point function to get a z value.
        z = norm.ppf(random.random())
        # int() truncates toward zero rather than rounding.
        samples.append(int(mean + z * std_dev))
    return samples

In [20]:
# Build the shared sample (default arguments) that the sections below analyse.
data = generate_normal_data()

## a. Quartiles

In [21]:
def compute_median(data):
    """Return the median of ``data`` without modifying it.

    The values are copied into a new sorted list, so the caller's sequence
    keeps its original order and any iterable with a finite length (list,
    tuple, ...) is accepted.

    Args:
        data: Non-empty collection of mutually comparable numbers.

    Returns:
        The middle element when the count is odd; the mean of the two middle
        elements (always a float) when the count is even.

    Raises:
        ValueError: If ``data`` is empty.
    """
    # sorted() builds a new list; list.sort() would reorder the caller's data.
    ordered = sorted(data)
    count = len(ordered)
    if count == 0:
        raise ValueError("compute_median() requires at least one data point")

    middle = count // 2
    if count % 2:  # odd: a single middle element
        return ordered[middle]
    # even: average the two elements straddling the centre
    return (ordered[middle - 1] + ordered[middle]) / 2.

In [22]:
def compute_quartiles(data):
    """Return the first, second and third quartiles of ``data``.

    The second quartile is the median of the whole sample. The first and third
    quartiles are the medians of the lower and upper halves. When the number
    of points is odd, the overall median element belongs to neither half; when
    it is even, the two central elements go one to each half.

    The input is copied before sorting, so the caller's sequence is not
    reordered.

    Args:
        data: Collection of at least two mutually comparable numbers.

    Returns:
        A ``(first, second, third)`` tuple.

    Raises:
        ValueError: If ``data`` holds fewer than two points, because a half
            would be empty and have no median.
    """
    # Work on a sorted copy; list.sort() would reorder the caller's data.
    ordered = sorted(data)
    count = len(ordered)
    if count < 2:
        raise ValueError("compute_quartiles() requires at least two data points")

    half = count // 2
    # For an odd count, skip the median element itself when slicing the
    # upper half (count % 2 is 1); for an even count nothing is skipped.
    upper_start = half + count % 2

    second = compute_median(ordered)
    first = compute_median(ordered[:half])
    third = compute_median(ordered[upper_start:])
    return (first, second, third)

In [23]:
# Unpack the three quartiles of the sample and report each one.
first, second, third = compute_quartiles(data)
print(f"First quartile: {first}")
print(f"Second quartile: {second}")
print(f"Third quartile: {third}")

First quartile: 4319.0
Second quartile: 5003.0
Third quartile: 5675.0


## b. Empirical Rule

Demonstrate that approximately 68% of the data falls within one standard deviation of the mean, 95% within two, and 99.7% within three.

In [24]:
def compute_mean(data):
    """Return the arithmetic mean of ``data``: the sum divided by the count.

    Raises ``ZeroDivisionError`` when ``data`` is empty.
    """
    total = sum(data)
    count = len(data)
    return total / count

In [25]:
def compute_variance(data):
    """Return the population variance of ``data``.

    This is the average squared deviation from the mean; the divisor is the
    full count ``len(data)``, not ``len(data) - 1``.
    """
    center = compute_mean(data)
    squared_deviations = ((value - center) ** 2 for value in data)
    return sum(squared_deviations) / len(data)

In [26]:
def compute_standard_deviation(data):
    """Return the standard deviation of ``data``: the square root of its variance."""
    variance = compute_variance(data)
    return variance ** 0.5

In [27]:
def demonstrate_empirical_rule(data):
    """Print the share of ``data`` within one, two and three standard deviations.

    For each multiple k in (1, 2, 3), an entry is counted when it lies strictly
    inside (mean - k * std_dev, mean + k * std_dev). The three percentages are
    printed with two decimal places; nothing is returned.
    """
    mean = compute_mean(data)
    std_dev = compute_standard_deviation(data)

    def count_within(multiplier):
        # Entries at or above the mean are checked against the upper bound,
        # entries below the mean against the lower bound.
        spread = multiplier * std_dev
        return sum(
            1
            for entry in data
            if (entry < mean + spread if entry >= mean else entry > mean - spread)
        )

    one_sd, two_sd, three_sd = (count_within(k) for k in (1, 2, 3))

    print(f"{one_sd / len(data) * 100.:.2f}% of the data falls within one standard deviation of the mean.")
    print(f"{two_sd / len(data) * 100.:.2f}% of the data falls within two standard deviations of the mean.")
    print(f"{three_sd / len(data) * 100.:.2f}% of the data falls within three standard deviations of the mean.")

In [28]:
# Print the observed share of the sample within 1, 2 and 3 standard deviations of its mean.
demonstrate_empirical_rule(data)

68.45% of the data falls within one standard deviation of the mean.
95.38% of the data falls within two standard deviations of the mean.
99.74% of the data falls within three standard deviations of the mean.


## c. Z-Scores

In [29]:
def compute_z_score(value, mean, std_dev):
    """Return how many standard deviations ``value`` lies from ``mean``.

    The result is negative when ``value`` is below ``mean``. Raises
    ``ZeroDivisionError`` when ``std_dev`` is zero.
    """
    deviation = value - mean
    return deviation / std_dev

In [32]:
# Sample statistics used as the reference point for every Z-score below.
mean, std_dev = compute_mean(data), compute_standard_deviation(data)

print(f"With mean {mean:.2f} and standard deviation {std_dev:.2f}:")
# Probe evenly spaced values from 0 through 10000 in steps of 1000.
for value in range(0, 10001, 1000):
    print(f"The Z-score for {value} is {compute_z_score(value, mean, std_dev)}.")

With mean 4998.26 and standard deviation 1009.00:
The Z-score for 0 is -4.953670657865341.
The Z-score for 1000 is -3.9625918092471464.
The Z-score for 2000 is -2.971512960628952.
The Z-score for 3000 is -1.9804341120107574.
The Z-score for 4000 is -0.9893552633925629.
The Z-score for 5000 is 0.0017235852256316843.
The Z-score for 6000 is 0.9928024338438262.
The Z-score for 7000 is 1.9838812824620207.
The Z-score for 8000 is 2.9749601310802154.
The Z-score for 9000 is 3.9660389796984097.
The Z-score for 10000 is 4.957117828316604.


## d. Confidence Intervals