In [None]:
import numpy as np
from datascience import *  # NOTE(review): wildcard import; presumably the source of Table, are and PercentFormatter used in the cells below

# Configure notebook (happens automatically on data8.berkeley.edu)
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Configure for presentation
# threshold=50 and linewidth=50 are handed to NumPy's print options so that
# printed arrays stay short and narrow.
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
# Use a font size of 16 for matplotlib text.
mpl.rc('font', size=16)

## Where

In [None]:
# Read top_movies.csv into a table; the bare name on the last line displays it.
t = Table.read_table('top_movies.csv')
t

In [None]:
# where() with a plain value as the second argument: select rows by 'Year' and 2015
# (presumably an equality match — confirm in the datascience docs).
t.where('Year', 2015)

In [None]:
# where() with a predicate: rows whose 'Year' satisfies are.above(2013).
t.where('Year', are.above(2013))

In [None]:
# Rows whose 'Year' satisfies are.between(1980, 1990).
# NOTE(review): datascience documents between(x, y) as x <= value < y, so 1990
# itself is presumably excluded — confirm against the installed version.
t.where('Year', are.between(1980, 1990))

## Multinomial Distribution

In [None]:
# Joint distribution of two dice: one row for every (First, Second) pair with
# faces 1..6, each assigned chance 1/36. The rows are generated in the same
# order as a nested loop (First outer, Second inner).
two_dice = Table(['First', 'Second', 'Chance']).with_rows([
    [first, second, 1/36]
    for first in np.arange(1, 7)
    for second in np.arange(1, 7)
])
# Show 'Chance' as a percentage; the returned table is the cell's output.
two_dice.set_format('Chance', PercentFormatter(1))

In [None]:
# Distribution of the sum of two dice. A total s can be rolled in
# 6 - |s - 7| of the 36 equally likely ways (1 way for 2 and 12, up to 6 ways
# for 7). Totals are generated from 2 to 12, so the rows are already ordered
# by 'Sum'.
two_dice_sums = Table(['Sum', 'Chance']).with_rows([
    [total, (6 - abs(total - 7)) / 36] for total in range(2, 13)
])
two_dice_sums.set_format('Chance', PercentFormatter(1))

In [None]:
# Element-wise total of the two dice, one entry per row of two_dice.
dice_sums = two_dice.column('First') + two_dice.column('Second')
# Keep only the rows where that total equals 5 (where() is given a boolean array here).
sum_of_5 = two_dice.where(dice_sums == 5)
sum_of_5

In [None]:
# Add up the 'Chance' values of the rows in sum_of_5.
sum(sum_of_5.column('Chance'))

In [None]:
def P(event):
    """Return the total chance of an event.

    `event` is any object with a `column('Chance')` method (in this notebook,
    a table whose rows are outcomes); the result is the sum of that column.
    """
    chances = event.column('Chance')
    return sum(chances)

# Same quantity as summing the 'Chance' column of sum_of_5 directly, now via P.
P(sum_of_5)

In [None]:
# Attach the per-row dice total as a 'Sum' column alongside the joint table.
with_sums = two_dice.with_column('Sum', dice_sums)
with_sums

In [None]:
# Group the rows by 'Sum' with no aggregation function given
# (presumably this reports a row count per sum — confirm in the datascience docs).
with_sums.group('Sum')

In [None]:
# Collapse the joint table to one row per 'Sum', combining the chances in each
# group with the built-in sum.
grouped = (
    with_sums
    .select(['Sum', 'Chance'])
    .group('Sum', sum)
)
# Display a relabeled view whose second column is named 'Chance' and shown as
# a percentage.
grouped.relabeled(1, 'Chance').set_format('Chance', PercentFormatter(1))

In [None]:
# Chance of a total of 8, computed from the 36-row joint table with its 'Sum' column.
P(with_sums.where('Sum', 8))

In [None]:
# The same query against the separately built table of sums.
P(two_dice_sums.where('Sum', 8))

### U.S. Birth Times

In [None]:
# Load birth_time.csv and keep only the 'Time', 'Hour' and 'Chance' columns.
birth = Table.read_table('birth_time.csv').select(['Time', 'Hour', 'Chance'])
# Format 'Chance' as a percentage and call show() on the result to display it.
birth.set_format('Chance', PercentFormatter(1)).show()

In [None]:
# Rows whose 'Hour' satisfies are.between(8, 18).
# NOTE(review): between(x, y) is documented as x <= value < y, so hour 18 is
# presumably excluded — confirm.
business_hours = birth.where('Hour', are.between(8, 18))
business_hours

In [None]:
# Total chance of the rows selected into business_hours.
P(business_hours)

In [None]:
# Total chance of the rows whose 'Hour' satisfies are.between(0, 6).
P(birth.where('Hour', are.between(0, 6)))

## Conditional Distributions



In [None]:
# Restrict the distribution of sums to the rows with 'Sum' above 8 ...
above_8 = two_dice_sums.where('Sum', are.above(8))
# ... then divide each remaining chance by their total so the column sums to 1.
# Note: despite its name, given_8 is conditioned on the sum being above 8.
given_8 = above_8.with_column('Chance', above_8.column('Chance') / P(above_8))
given_8

In [None]:
def given(event):
    """Return `event` with its 'Chance' column rescaled by the event's total chance.

    Every chance is divided by P(event), giving the distribution conditioned
    on the event. The input is not modified; the result comes from
    `with_column`.
    """
    total = P(event)
    rescaled = event.column('Chance') / total
    return event.with_column('Chance', rescaled)

# Same conditioning on 'Sum' above 8 as before, now through the reusable helper.
given(two_dice_sums.where('Sum', are.above(8)))

In [None]:
# Birth-hour chances rescaled by P(business_hours).
given(business_hours)

In [None]:
# Condition on business_hours first, then total the rows with 'Hour' below 12.
P(given(business_hours).where('Hour', are.below(12)))

In [None]:
# Rows of the unconditioned table whose 'Hour' satisfies are.between(8, 12),
# followed by their total chance.
morning = birth.where('Hour', are.between(8, 12))
P(morning)

In [None]:
# Total chance of every row with 'Hour' below 12, with no lower bound on the hour.
P(birth.where('Hour', are.below(12)))

In [None]:
# Ratio of two unconditioned totals: the morning rows over the business_hours rows.
P(morning) / P(business_hours)

In [None]:
# The business_hours rows that also have 'Hour' below 12.
business_hours.where('Hour', are.below(12))

In [None]:
# Total chance of those rows divided by the total chance of business_hours.
P(business_hours.where('Hour', are.below(12))) / P(business_hours)

## Discussion Question

In [None]:
# Condition on the rows with 'Hour' above 11, then total the conditioned
# chances of the rows with 'Hour' below 18.
after_noon = birth.where('Hour', are.above(11))
P(given(after_noon).where('Hour', are.below(18)))

## Joint Distributions

In [None]:
# Redisplay the table of (First, Second, Chance) rows built earlier.
two_dice

In [None]:
# Reload birth_time.csv, this time dropping the 'Chance' column.
birth_day = Table.read_table('birth_time.csv').drop('Chance')
# Percent-format the columns at indices 2 and 3
# (presumably 'Weekday' and 'Weekend' — TODO confirm against the CSV header).
birth_day.set_format([2, 3], PercentFormatter(1))

In [None]:
# One two-column table per kind of day. The second column is relabeled to
# 'Chance', which is the column name that P and given read.
weekday = birth_day.select(['Hour', 'Weekday']).relabeled(1, 'Chance')
weekend = birth_day.select(['Hour', 'Weekend']).relabeled(1, 'Chance')

In [None]:
# Joint table over (Day, Hour): each hourly chance is scaled by a per-day
# weight, 0.7825 for 'Weekday' rows and 0.2175 for 'Weekend' rows
# (presumably the share of births on each kind of day — confirm the source of
# these figures). All weekday rows come first, then all weekend rows.
birth_joint = Table(['Day', 'Hour', 'Chance']).with_rows([
    [day, row.item('Hour'), row.item('Chance') * weight]
    for day, hourly, weight in [
        ('Weekday', weekday, 0.7825),
        ('Weekend', weekend, 0.2175),
    ]
    for row in hourly.rows
])
birth_joint.set_format('Chance', PercentFormatter(1))

In [None]:
# Total chance over every row of the joint table.
P(birth_joint)

In [None]:
# Total chance of the 'Weekday' rows whose 'Hour' satisfies are.between(8, 12).
P(birth_joint.where('Day', 'Weekday').where('Hour', are.between(8, 12)))

In [None]:
# The joint-table rows selected by 'Hour' and the value 5.
early_morning = birth_joint.where('Hour', 5)
early_morning

In [None]:
# Condition on early_morning, then total the conditioned chance of the 'Weekend' rows.
P(given(early_morning).where('Day', 'Weekend'))

### Bayes' Rule: Diagnostic Example

In a population, there is a rare disease. Researchers have developed a medical test for the disease. Mostly, the test correctly identifies whether or not the tested person has the disease. But sometimes, the test is wrong. Here are the relevant proportions.

- 1% of the population has the disease
- If a person has the disease, the test returns the correct result with chance 99%.
- If a person does not have the disease, the test returns the correct result with chance 99.5%.

**One person is picked at random from the population.** Given that the person tests positive, what is the chance that the person has the disease?

We begin by partitioning the population into four categories in the tree diagram below.

<img src="disease1.png" />

By Bayes' Rule, the chance that the person has the disease given that he or she has tested positive is the chance of the top "Test Positive" branch relative to the total chance of the two "Test Positive" branches. The answer is
$$
\frac{0.01 \times 0.99}{0.01 \times 0.99 ~+~ 0.99 \times 0.005} ~\approx~ 0.667
$$

In [None]:
# The person is picked at random from the population, so the prior chance of
# disease is the population rate.
prior_random_pick = 0.01     # P(disease): 1% of the population
sensitivity = 0.99           # P(test + | disease)
false_positive_rate = 0.005  # P(test + | no disease), i.e. 1 - 99.5%

# By Bayes' Rule:
# Chance that the person has the disease, given that test was +
# Numerator: diseased and positive. Denominator: every way to test positive.
# The chance of "no disease" is derived from the prior so it cannot be
# confused with the sensitivity, which happens to have the same value (0.99).
posterior_random_pick = (prior_random_pick * sensitivity) / (
    prior_random_pick * sensitivity
    + (1 - prior_random_pick) * false_positive_rate
)
posterior_random_pick

In [None]:
# Joint distribution of health status and test result. Each chance is written
# as a product of two factors; in the tree described above these are the
# chance of the health status and the chance of the test result given it.
rare = Table(['Health', 'Test', 'Chance']).with_rows([
        ['Diseased', 'Positive', 0.01 * 0.99],         # diseased, test correct
        ['Diseased', 'Negative', 0.01 * 0.01],         # diseased, test wrong
        ['Not Diseased', 'Positive', 0.99 * 0.005],    # not diseased, test wrong
        ['Not Diseased', 'Negative', 0.99 * 0.995]     # not diseased, test correct
    ])
rare

In [None]:
# Keep the two 'Positive' rows, condition on them, and read off the chance of
# 'Diseased' among them.
positive = rare.where('Test', 'Positive')
P(given(positive).where('Health', 'Diseased'))

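If the prior probability that the person has the disease were 10% instead of 1% (for example, because the person was not picked at random from the population), then the probabilities on the left side of the tree diagram would change accordingly, with the 0.1 and 0.9 now interpreted as subjective probabilities: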

<img src="disease10.png" />

The change has a noticeable effect on the answer, as you can see by running the cell below.

In [None]:
# Subjective prior probability of 10% that the person has the disease
prior_10pct = 0.1            # P(disease) before seeing the test result
sensitivity = 0.99           # P(test + | disease)
false_positive_rate = 0.005  # P(test + | no disease), i.e. 1 - 99.5%

# By Bayes' Rule:
# Chance that the person has the disease, given that test was +
# Numerator: diseased and positive. Denominator: every way to test positive.
posterior_prior_10pct = (prior_10pct * sensitivity) / (
    prior_10pct * sensitivity
    + (1 - prior_10pct) * false_positive_rate
)
posterior_prior_10pct

If the patient's prior probability of having the disease is 50%, then the result changes yet again.

<img src="disease50.png" />

In [None]:
# Subjective prior probability of 50% that the person has the disease
prior_50pct = 0.5            # P(disease) before seeing the test result
sensitivity = 0.99           # P(test + | disease)
false_positive_rate = 0.005  # P(test + | no disease), i.e. 1 - 99.5%

# By Bayes' Rule:
# Chance that the person has the disease, given that test was +
# Numerator: diseased and positive. Denominator: every way to test positive.
posterior_prior_50pct = (prior_50pct * sensitivity) / (
    prior_50pct * sensitivity
    + (1 - prior_50pct) * false_positive_rate
)
posterior_prior_50pct