In [None]:
from datascience import *
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(arr):
    """Convert a collection of numbers to standard units.

    Each value is expressed as its distance from the average of ``arr``,
    divided by the standard deviation of ``arr`` as computed by ``np.std``.
    """
    center = np.average(arr)
    spread = np.std(arr)
    deviations = arr - center
    return deviations / spread

def correlation(t, x, y):
    """Return the correlation between columns ``x`` and ``y`` of table ``t``.

    Both columns are converted to standard units; the result is the average
    of the element-wise products of the two standardized columns.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for predicting ``y`` from ``x``.

    The slope is the correlation of the two columns multiplied by the
    standard deviation of ``y`` and divided by the standard deviation of ``x``.
    """
    spread_y = np.std(t.column(y))
    spread_x = np.std(t.column(x))
    # Keep the left-to-right order (r * sd_y) / sd_x.
    return correlation(t, x, y) * spread_y / spread_x

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting ``y`` from ``x``.

    Computed as the mean of ``y`` minus the regression slope times the mean
    of ``x``.
    """
    line_slope = slope(t, x, y)
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - line_slope * mean_x

def fitted_values(t, x, y):
    """Return the regression estimates of ``y`` at every value in column ``x``.

    The estimates are ``slope * x + intercept``, evaluated element-wise on
    the ``x`` column of ``t``.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

def residuals(t, x, y):
    """Return the observed ``y`` values minus their regression estimates."""
    observed = t.column(y)
    return observed - fitted_values(t, x, y)

In [None]:
def last_two_characters(s):
    """Return the final two items of ``s`` (all of ``s`` if it has fewer than two)."""
    tail = slice(-2, None)
    return s[tail]

## Decisions and Subjective Prior Probabilities

In [None]:
# Bayes' rule: chance of disease given a positive test, using a prior of .001,
# a positive result for everyone with the disease (the "* 1"), and a .05 chance
# of a positive result for someone without the disease.
.001 * 1 / (.999 * .05 + .001 * 1)

In [None]:
def create_population(prior_disease_prob, n, false_positive_rate=0.05):
    """Build a population of ``n`` people and tabulate status against test result.

    Everyone with the disease tests positive. Among those without it, a
    fraction ``false_positive_rate`` tests positive and the rest test negative.

    Parameters:
        prior_disease_prob: proportion of the population with the disease (0 to 1).
        n: size of the population.
        false_positive_rate: proportion of disease-free people who test
            positive (0 to 1); defaults to 0.05.

    Returns:
        The table obtained by pivoting 'Test Result' against 'Status'.

    Raises:
        ValueError: if either proportion lies outside [0, 1].
    """
    if not 0 <= prior_disease_prob <= 1:
        raise ValueError('prior_disease_prob must be between 0 and 1')
    if not 0 <= false_positive_rate <= 1:
        raise ValueError('false_positive_rate must be between 0 and 1')

    # Derive each complementary count by subtraction instead of rounding it
    # independently: round() rounds halves to the nearest even integer, so two
    # separately rounded halves (e.g. n=5, prior=.5 -> 2 and 2) would not add
    # back up to the whole.
    disease = round(n * prior_disease_prob)
    no_disease = round(n) - disease
    false_positives = round(no_disease * false_positive_rate)
    true_negatives = no_disease - false_positives

    # Rows are ordered: diseased people first, then the disease-free ones,
    # with the positive results listed before the negative results.
    status = np.array(['Disease'] * disease + ['No disease'] * no_disease)
    result = np.array(['Test +'] * (disease + false_positives) +
                      ['Test -'] * true_negatives)

    t = Table().with_columns(
        'Status', status,
        'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

In [None]:
# Population of 10,000 with a prior disease probability of .001
create_population(.001, 10000)

In [None]:
# Same population size, with the prior disease probability raised to .1
create_population(.1, 10000)

In [None]:
# Same population size, with a prior disease probability of .5
create_population(.5, 10000)

In [None]:
# Bayes' rule again with a prior of .5: chance of disease given a positive test,
# still with a .05 chance of a positive result for someone without the disease.
.5 * 1 / (.5 * .05 + .5 * 1)

## Case study and review

In [None]:
# Notes on the columns of the urban areas data:
# Size is in square miles
# Motor vehicle/walk are % of commuters who commute that way
# Tech workers is the number of people who work in a computer or math occupation
# Insured is % who have health insurance
# Broadband is % who have a computer with broadband internet access in their home
# Gini index measures income inequality (0-1, larger values mean higher inequality)
# College / High school / No high school are % of adults with that level of education

# Read the full table from the CSV file and display it
cities_full = Table.read_table('urban_areas_2017.csv')
cities_full

In [None]:
# Narrow the full table down to a subset of its columns and display the result
cities = cities_full.select('Urban area', 'Motor vehicle', 'Tech workers', 
                            'Broadband', 'Median rent', 'Gini index', 
                            'Population', 'College', 'Has rapid transit')
cities

In [None]:
# Histogram of the 'Broadband' column
cities.hist('Broadband')

In [None]:
# Rows whose 'Broadband' value is below 0.5
cities.where('Broadband', are.below(0.5))

In [None]:
# Rows ordered from the highest 'Median rent' to the lowest
cities.sort('Median rent', descending=True)

In [None]:
# Add a 'State' column holding the last two characters of each 'Urban area' value
# (presumably a state abbreviation -- confirm against the data).
# Note: this rebinds `cities` to the extended table.
cities = cities.with_column(
    'State', 
    cities.apply(last_two_characters, 'Urban area'))
cities

In [None]:
# Group the rows by 'State', sort the grouped table by its second column
# (index 1) in descending order, and draw a horizontal bar chart per state
cities.group('State').sort(1, descending=True).barh('State')

In [None]:
# Rows ordered from the most 'Tech workers' to the fewest
cities.sort('Tech workers', descending=True)

In [None]:
# Histograms of 'Tech workers', one per value of 'Has rapid transit',
# with bin edges every 5,000 starting at 0 and stopping before 300,000
cities.hist('Tech workers', group='Has rapid transit', bins=np.arange(0, 300000, 5000))

In [None]:
# Histograms of 'College', one per value of 'Has rapid transit'
cities.hist('College', group='Has rapid transit')

In [None]:
# Scatter plot of 'Gini index' (vertical axis) against 'Broadband' (horizontal axis)
cities.scatter('Broadband', 'Gini index')

In [None]:
# Same scatter plot restricted to rows with 'Broadband' above 0.5,
# with a fitted line drawn through the points
cities.where('Broadband', are.above(0.5)).scatter('Broadband', 'Gini index', fit_line=True)