# Code for Chapter 1.  

Code for Chapter 2. Showcases tools for exploratory data analysis.

## Snippet 1

In [35]:
import pandas as pd

df = pd.read_csv('data/01_heights_weights_genders.csv')

df['Height'].describe()

count    10000.000000
mean        66.367560
std          3.847528
min         54.263133
25%         63.505620
50%         66.318070
75%         69.174262
max         78.998742
Name: Height, dtype: float64

## Snippet 2
Define our own mean and median functions.

In [36]:
def mean(x):
    return sum(x) / len(x)


def median(x):
    x = sorted(x)
    listlength = len(x) 
    num = listlength//2
    if listlength%2==0:
        middlenum = (x[num]+x[num-1])/2
    else:
        middlenum = x[num]
    return middlenum



## Snippet 3
Compare means and medians on toy examples.

In [37]:
myVector = range(1, 100)

myVector

range(1, 100)

In [38]:
mean(myVector)

50.0

In [39]:
median(myVector)

50

## Snippet 4
Confirm that our mean and median functions produce the correct answer.

In [40]:
import statistics 

mean(myVector) - statistics.mean(myVector)

0.0

In [41]:
median(myVector) - statistics.median(myVector)

0

## Snippet 5

Experiment with functions for assessing the range of a data set.

In [42]:
df['Height'].min()

54.2631333250971

## Snippet 6

In [43]:
df['Height'].max()

78.99874234638959

## Snippet 7

In [44]:
[df['Height'].min(), df['Height'].max()]

[54.2631333250971, 78.99874234638959]

## Snippet 8
Try out the quantile function for computing arbitrary quantiles.

In [45]:
df['Height'].describe(percentiles=[0, 0.25, 0.50, 0.75, 1])

count    10000.000000
mean        66.367560
std          3.847528
min         54.263133
0%          54.263133
25%         63.505620
50%         66.318070
75%         69.174262
100%        78.998742
max         78.998742
Name: Height, dtype: float64

## Snippet 9

In [46]:
import numpy as np
df['Height'].describe(percentiles=numpy.arange(0, 1, 0.2))

count    10000.000000
mean        66.367560
std          3.847528
min         54.263133
0%          54.263133
20%         62.859007
40%         65.194221
50%         66.318070
60.0%       67.435374
80%         69.811620
max         78.998742
Name: Height, dtype: float64

## Snippet 10

In [47]:
np.arange(0, 1, 0.2)

array([0. , 0.2, 0.4, 0.6, 0.8])

## Snippet 11
Define a variance function to assess the spread of data.

In [48]:
def var(x):
    m = median(x)
    sum = 0
    for x_i in x:
        sum += (x_i - m)**2
    return sum / len(x)

## Snippet 12
Test our variance function for correctness.

In [49]:
np.var(df['Height']) - var(df['Height'])

-0.002449227737226778

## Snippet 13
Update the variance function to make it unbiased.

In [50]:
def var(x):
    m = median(x)
    sum = 0
    for x_i in x:
        sum += (x_i - m)**2
    return sum / (len(x) - 1)

In [51]:
# Test our variance function again for correctness.
np.var(df['Height']) - var(df['Height'])


-0.003929819948510271

## Snipet 14
Check the range predicted by the variance function.

In [52]:
heights = df['Height']
[mean(heights) - var(heights), mean(heights) + var(heights)]

[51.56163764199576, 81.17348186764636]

## Snippet 15

In [53]:
[mean(heights) - var(heights), mean(heights) + var(heights)]

[51.56163764199576, 81.17348186764636]

In [54]:
[heights.min(), heights.max()]

[54.2631333250971, 78.99874234638959]

## Snippet 16
Switch to standard deviations instead for thinking about ranges.

In [55]:
def sd(x):
    return np.sqrt(var(x))

## Snippet 17
Test our standard deviation function for correctness.

In [56]:
sd(heights) - np.std(heights)

0.0005106857595196246

## Snippet 18

In [57]:
[mean(heights) - sd(heights), mean(heights) + sd(heights)]

[62.51971332950399, 70.21540618013812]

In [58]:
[heights.min(), heights.max()]

[54.2631333250971, 78.99874234638959]

## Snippet 19

In [59]:
[mean(heights) - sd(heights), mean(heights) + sd(heights)]

[62.51971332950399, 70.21540618013812]

In [62]:
[np.percentile(heights, 25), np.percentile(heights, 75)]

[63.505620481218955, 69.1742617268347]