BerkeleyX: Data8.3x

Foundations of Data Science: Prediction and Machine Learning

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lec 1.1 Introduction

In [None]:
# The four sample values that the first lectures keep coming back to.
values = np.array([2, 3, 3, 9])

# Three routes to the same mean: the by-hand formula, np.average (which can
# also take weights) and np.mean (the plain unweighted mean).
by_hand = sum(values) / len(values)
by_hand, np.average(values), np.mean(values)

In [None]:
# Mean the plain way: add everything up, then divide by the count.
total, count = 2 + 3 + 3 + 9, 4
print(total / count)

# Mean as a weighted sum: each distinct value times the proportion of the
# entries equal to it (2 appears once, 3 twice, 9 once, out of 4 entries).
distinct = (2, 3, 9)
print(sum(v * p for v, p in zip(distinct, (1/4, 2/4, 1/4))))
# The same proportions written as decimals.
print(sum(v * p for v, p in zip(distinct, (0.25, 0.5, 0.25))))

In [None]:
# Put the values into a one-column table and print it.
values_table = Table().with_columns('Value', values)
print(values_table)

# Integer bin edges 2, 3, ..., 10, reused by the histograms of this lecture.
# (Alternative: np.arange(0.5, 10.6, 1) places the edges on half-integers.)
bins_for_display = np.arange(2, 11)

# NOTE(review): normed=None combined with density=True looks like a workaround
# for matplotlib's normed -> density rename; confirm it is still needed with
# the installed datascience/matplotlib versions.
hist_options = dict(bins=bins_for_display, ec='w', normed=None, density=True)
values_table.hist(**hist_options)

In [None]:
# Scale the sample up tenfold: ten 2s, twenty 3s and ten 9s (all int8).
twos = np.ones(10, dtype=np.int8) * 2
threes = np.ones(20, dtype=np.int8) * 3
nines = np.ones(10, dtype=np.int8) * 9

# Join the three runs end to end into a single array.
new_values = np.concatenate([twos, threes, nines])
print(new_values)

# The proportions of 2s, 3s and 9s are the same as in `values`,
# so the two averages printed here coincide.
print(np.average(new_values), np.average(values))

new_values_table = Table().with_column('Value', new_values)
new_values_table.hist(
    bins=bins_for_display,
    ec='w',
    normed=None,
    density=True,
)

Lec 1.2 Average and Median

In [None]:
# A small symmetric distribution: one 1, two 2s, three 3s, two 4s, one 5.
data = np.array([1, 2, 2, 3, 3, 3, 4, 4, 5])
print(data)

# One-column table of the data, drawn with unit-wide bins on edges 1..6.
table = Table().with_columns('Data', data)
table.hist(
    bins=range(1, 7),
    ec='w',
    normed=None,
    density=True,
)

# Median first, then mean.
print(np.median(data), np.mean(data))

In [None]:
# A variation on the distribution above: overwrite the entry at index 8
# (originally 5) with 10 to see how a single outlier moves the median and
# the mean.
table['Data'][8] = 10

# Bin edges run 1, 2, ..., 11 so that the new value 10 gets its own [10, 11)
# bin. With edges stopping at 10 the value would sit exactly on the rightmost
# edge and be drawn in the closing [9, 10] bin, one unit to the left of where
# every other value is drawn relative to its bin.
table.hist(bins=range(1, 12), ec='w', normed=None, density=True)
print(np.median(table['Data']), np.mean(table['Data']))

In [None]:
# Read nba2013.csv into a table and print it.
nba_csv = '../../data/nba2013.csv'
nba = Table.read_table(nba_csv)
print(nba)

# Histogram of the 'Height' column on half-integer edges 65.5, 66.5, ..., 89.5.
height_bins = np.arange(65.5, 90.5)
nba.hist('Height', bins=height_bins, ec='w', normed=None, density=True)

In [None]:
# Three measures of centre for the 'Height' column: the 50th percentile via
# the percentile helper, numpy's median, and the average.
heights = nba.column('Height')
fiftieth_percentile = percentile(50, heights)
print(fiftieth_percentile, np.median(heights), np.average(heights))

Lec 1.3 Standard Deviation

In [None]:
# Work through the standard deviation of `values` one step at a time,
# growing a table that shows each intermediate column.
sd_table = Table().with_columns('Value', values)
print(values)

# Step 1: the average of the values.
average = np.average(values)
print('Average:', average)

# Step 2: how far each value lies from that average.
deviations = values - average
sd_table = sd_table.with_column('Deviation', deviations)
print(sd_table)

# The signed deviations are added up here to show why they cannot simply
# be averaged as they stand.
print('Deviations added up are useless:', sum(deviations))

# Step 3: square the deviations.
squared_deviations = deviations ** 2
sd_table = sd_table.with_column('Squared Deviation', squared_deviations)
print(sd_table)

# Step 4: the variance is the average of the squared deviations.
variance = np.average(squared_deviations)
print('Variance:', variance)

# Step 5: the standard deviation (SD) is the square root of the variance;
# np.std(values) is printed next to it for comparison.
sd = variance ** 0.5
print('Standard Deviation:', sd, np.std(values))

Lec 1.4 Chebyshev's Bounds

In [None]:
# Read baby.csv into a table and print it.
baby_csv = '../../data/baby.csv'
births = Table.read_table(baby_csv)
print(births)

In [None]:
# Histogram of the 'Maternal Pregnancy Weight' column of the births table.
births.hist(
    'Maternal Pregnancy Weight',
    normed=None,
    density=True,
)

In [None]:
# Average and SD of the 'Maternal Pregnancy Weight' column.
mpw = births.column('Maternal Pregnancy Weight')
average = np.average(mpw)
sd = np.std(mpw)
print(average, sd)

# Rows whose weight falls in the range "average plus or minus 3 SDs".
within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(average - 3*sd, average + 3*sd))
# The printed value is a fraction of the rows (between 0 and 1), not a
# percentage, so it is labelled as a proportion.
print('Proportion in +- 3SD:', within_3_SDs.num_rows / births.num_rows)

# Chebyshev's bound: whatever the shape of the distribution, the proportion
# in the range "average plus or minus z SDs" is at least 1 - 1/z**2.
print("Chebyshev's inequality:", 1 - 1/3**2)
# z = 2: at least 3/4   = .75
# z = 3: at least 8/9   ~ .889
# z = 4: at least 15/16 = .9375
# z = 5: at least 24/25 = .96

In [None]:
# Histograms for the columns of the births table; overlay=False asks for a
# separate chart per column instead of one chart with everything overlaid.
births.hist(
    overlay=False,
    normed=None,
    density=True,
)

In [None]:
def print_sd_coverage(table, z_values=range(2, 6)):
    """Print, per column, the percent of rows within z SDs of the average.

    For every label in ``table.labels`` the column's average and SD are
    computed with numpy. Then, for each z in ``z_values``, the rows selected
    by ``are.between(average - z*sd, average + z*sd)`` are counted and
    reported as a percentage of ``table.num_rows``, rounded to two decimals.

    The per-column working values are local to this function, so running it
    does not overwrite the notebook-level ``values``, ``average`` and ``sd``
    that other cells read.

    Parameters:
        table: a table exposing ``labels``, ``column``, ``where`` and
            ``num_rows`` (here, the births table).
        z_values: the SD multiples to report; defaults to 2, 3, 4, 5.
    """
    for label in table.labels:
        column = table.column(label)
        column_average = np.average(column)
        column_sd = np.std(column)
        # Blank line, then the column name, ahead of that column's results.
        print()
        print(label)
        for z in z_values:
            in_range = table.where(
                label,
                are.between(column_average - z*column_sd, column_average + z*column_sd),
            )
            percent = round(in_range.num_rows / table.num_rows * 100, 2)
            print('Average +/- {} SDs: {}%'.format(z, percent))


# See if Chebyshev's bounds work for different shapes of distributions:
# report z = 2..5 for every column of the births table.
print_sd_coverage(births)