# 3.7.2

## Data

In [5]:
import random
from collections import Counter

import numpy as np
from scipy.stats import norm

In [6]:
def generate_normal_data(seed=42196, size=10000, mean=5000, std_dev=1000):
    """Generate a reproducible list of normally distributed integers.

    Seeds Python's global ``random`` generator with ``seed``, then for each
    of the ``size`` samples draws a uniform number, maps it through the
    standard normal inverse CDF (``norm.ppf``), scales it by ``std_dev`` and
    shifts it by ``mean``. Each result is converted with ``int``, which
    truncates toward zero.

    Returns:
        A list of ``size`` ints.
    """
    random.seed(seed)
    samples = []
    for _ in range(size):
        z_score = norm.ppf(random.random())
        samples.append(int(mean + z_score * std_dev))
    return samples

In [7]:
# Draw the dataset using the generator's default seed, size, mean and standard deviation.
data = generate_normal_data()

## a. Mean

In [8]:
def compute_mean(data):
    """Return the arithmetic mean of ``data``: the sum of its values divided by their count."""
    total = sum(data)
    count = len(data)
    return total / count

In [9]:
# Mean of the generated sample, shown to two decimal places.
mean = compute_mean(data)
print(f"Mean: {mean:.2f}")

Mean: 4998.26


## b. Median

In [10]:
def compute_median(data):
    """Return the median of ``data`` without modifying the caller's sequence.

    The values are sorted into a new list, so the input keeps its original
    order and any iterable (list, tuple, array, ...) is accepted.

    Args:
        data: A non-empty iterable of mutually comparable numbers.

    Returns:
        The middle element when the number of values is odd; otherwise the
        average of the two middle elements (always a float in that case).

    Raises:
        ValueError: If ``data`` contains no values.
    """
    # sorted() builds a copy; an in-place data.sort() would silently reorder
    # the caller's list and change the results of order-sensitive code.
    ordered = sorted(data)
    count = len(ordered)
    if count == 0:
        raise ValueError("compute_median() requires at least one data point")

    middle = count // 2
    if count % 2:  # odd
        return ordered[middle]
    # even: average the two values on either side of the midpoint
    return (ordered[middle - 1] + ordered[middle]) / 2.

In [11]:
# Median of the generated sample, shown to two decimal places.
median = compute_median(data)
print(f"Median: {median:.2f}")

Median: 5003.00


## c. Mode

In [12]:
def compute_mode(data):
    """Return every value that occurs most often in ``data``.

    Args:
        data: An iterable of hashable values.

    Returns:
        A list of the value(s) sharing the highest occurrence count, in the
        order in which they first appear in ``data``. The list has more than
        one entry when several values tie, and is empty when ``data`` is
        empty.
    """
    # map values to counts (Counter keeps first-seen order, like a dict)
    counts = Counter(data)
    if not counts:
        # max() of an empty sequence would raise; no data means no mode
        return []

    # max occurrences
    high_count = max(counts.values())

    return [value for value, count in counts.items() if count == high_count]

In [13]:
# Most frequent value(s) in the generated sample.
modes = compute_mode(data)
print(f"Modes: {modes}")

Modes: [5532]


## d. Quartiles

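If the data length is odd, exclude the median itself from both halves before computing the lower and upper quartiles. If the length is even, split the data into two equal halves, so that each of the two middle values (which are averaged to give the median) stays with its own half.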

In [14]:
def compute_quartiles(data):
    """Return the first, second and third quartiles of ``data``.

    The second quartile is the median of all values. The first and third
    quartiles are the medians of the lower and upper halves of the sorted
    values. When the number of values is odd, the middle element is left out
    of both halves; when it is even, the data is split into two equal halves.

    The input is not modified: the work is done on a sorted copy.

    Args:
        data: An iterable of at least two mutually comparable numbers (with
            fewer, a half is empty and ``compute_median`` fails on it).

    Returns:
        A tuple ``(first, second, third)``.
    """
    # Sort a copy; an in-place data.sort() would reorder the caller's list.
    ordered = sorted(data)
    length = len(ordered)
    half = length // 2
    # length % 2 is 1 for odd lengths, which skips the median element itself
    upper_start = half + length % 2

    second = compute_median(ordered)
    first = compute_median(ordered[:half])
    third = compute_median(ordered[upper_start:])
    return (first, second, third)

In [15]:
# Unpack the three quartiles and report each one on its own line.
first, second, third = compute_quartiles(data)
print(f"First quartile: {first}")
print(f"Second quartile: {second}")
print(f"Third quartile: {third}")

First quartile: 4319.0
Second quartile: 5003.0
Third quartile: 5675.0


## e. Standard Deviation

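The standard deviation is the square root of the variance (here the population variance, which divides by the number of values $N$).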

In [16]:
def compute_variance(data):
    """Return the population variance of ``data``.

    This is the average of the squared differences from the mean, dividing
    by the number of values (not by one less than it).
    """
    center = compute_mean(data)
    squared_deviations = [(value - center) ** 2 for value in data]
    return sum(squared_deviations) / len(data)

In [17]:
def compute_standard_deviation(data):
    """Return the standard deviation of ``data``: the square root of ``compute_variance(data)``."""
    variance = compute_variance(data)
    return variance ** 0.5

In [18]:
# Standard deviation of the generated sample, shown to two decimal places.
std_dev = compute_standard_deviation(data)
print(f"Standard deviation: {std_dev:.2f}")

Standard deviation: 1009.00


## f. Variance

The (population) variance is the average of the squared differences from the mean.

In [19]:
# Variance of the generated sample, shown to two decimal places.
variance = compute_variance(data)
print(f"Variance: {variance:.2f}")

Variance: 1018083.94


## g. Covariance

In [20]:
# sample data
# sample data: 5 rows (observations) x 2 columns (variables) of uniform random numbers
# NOTE(review): this rebinds `data`, which the earlier cells use for the list of
# normal samples; re-running those cells after this one will operate on this
# array instead. A distinct name would avoid the hidden-state hazard.
np.random.seed(42196)
data = np.random.rand(5, 2)

# rowvar=False tells np.cov that each column is a variable and each row an observation
covariance_matrix = np.cov(data, rowvar=False)
print(covariance_matrix)

[[0.09269736 0.00407952]
 [0.00407952 0.09826989]]


Now let's compute the covariance manually.

In [21]:
def compute_covariance(data):
    """Compute the sample covariance between the first two columns of ``data``.

    Prints the result and also returns it, so callers can use the value in
    the same way as the other ``compute_*`` functions.

    Args:
        data: A 2-D array supporting ``data[:, k]`` column indexing, with one
            observation per row. Only columns 0 and 1 are used. At least two
            rows are needed, because the denominator is ``len(data) - 1``.

    Returns:
        The sample covariance of column 0 and column 1.
    """
    x_values = data[:, 0]
    y_values = data[:, 1]

    # step 1: compute means of each variable
    mean_x = compute_mean(x_values)
    mean_y = compute_mean(y_values)

    # step 2: compute deviations from mean for each value
    dev_x = x_values - mean_x
    dev_y = y_values - mean_y

    # step 3: sum the products of the paired deviations and divide by n - 1
    # (sample covariance). np.cov normalises by n - 1 by default as well, so
    # this value corresponds to the off-diagonal entry of
    # np.cov(data, rowvar=False).
    cov = sum(dev_x * dev_y) / (len(data) - 1)

    print("Covariance:", cov)
    return cov

In [22]:
# Expected to match the off-diagonal entries of covariance_matrix printed above.
compute_covariance(data)

Covariance: 0.004079516436756522
