In [1]:
import numpy as np
from datascience import *

# Configure notebook (happens automatically on data8.berkeley.edu)
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Configure for presentation
# Summarize printed arrays once they exceed 50 elements and wrap array
# output at 50 characters per line.
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
# Set the default font size for figure text to 16.
mpl.rc('font', size=16)

## Where

In [2]:
# Load the movies table from top_movies.csv and display it. The displayed
# columns are Title, Studio, Gross, Gross (Adjusted) and Year.
t = Table.read_table('top_movies.csv')
t

Title,Studio,Gross,Gross (Adjusted),Year
Star Wars: The Force Awakens,Buena Vista (Disney),906723418,906723400,2015
Avatar,Fox,760507625,846120800,2009
Titanic,Paramount,658672302,1178627900,1997
Jurassic World,Universal,652270625,687728000,2015
Marvel's The Avengers,Buena Vista (Disney),623357910,668866600,2012
The Dark Knight,Warner Bros.,534858444,647761600,2008
Star Wars: Episode I - The Phantom Menace,Fox,474544677,785715000,1999
Star Wars,Fox,460998007,1549640500,1977
Avengers: Age of Ultron,Buena Vista (Disney),459005868,465684200,2015
The Dark Knight Rises,Warner Bros.,448139099,500961700,2012


In [3]:
# A bare value as the second argument keeps the rows whose Year equals it.
t.where('Year', 2015)

Title,Studio,Gross,Gross (Adjusted),Year
Star Wars: The Force Awakens,Buena Vista (Disney),906723418,906723400,2015
Jurassic World,Universal,652270625,687728000,2015
Avengers: Age of Ultron,Buena Vista (Disney),459005868,465684200,2015
Inside Out,Buena Vista (Disney),356461711,375723400,2015
Furious 7,Universal,353007020,356907000,2015
Minions,Universal,336045770,354213900,2015


In [4]:
# A predicate from `are` selects by condition: here, Year greater than 2013.
t.where('Year', are.above(2013))

Title,Studio,Gross,Gross (Adjusted),Year
Star Wars: The Force Awakens,Buena Vista (Disney),906723418,906723400,2015
Jurassic World,Universal,652270625,687728000,2015
Avengers: Age of Ultron,Buena Vista (Disney),459005868,465684200,2015
Inside Out,Buena Vista (Disney),356461711,375723400,2015
Furious 7,Universal,353007020,356907000,2015
American Sniper,Warner Bros.,350126372,374796000,2014
The Hunger Games: Mockingjay - Part 1,Lionsgate,337135885,354324000,2014
Minions,Universal,336045770,354213900,2015
Guardians of the Galaxy,Buena Vista (Disney),333176600,358244800,2014


In [5]:
# are.between(x, y) includes x but excludes y (per the datascience docs),
# so this keeps the years 1980 through 1989.
t.where('Year', are.between(1980, 1990))

Title,Studio,Gross,Gross (Adjusted),Year
E.T.: The Extra-Terrestrial,Universal,435110554,1234132700,1982
Return of the Jedi,Fox,309306177,818316900,1983
The Empire Strikes Back,Fox,290475067,854171500,1980
Batman,Warner Bros.,251188924,547705200,1989
Raiders of the Lost Ark,Paramount,248159971,770183000,1981
Ghostbusters,Columbia,242212467,619211400,1984
Beverly Hills Cop,Paramount,234760478,584205200,1984
Back to the Future,Universal,210609762,513740700,1985
Indiana Jones and the Last Crusade,Paramount,197171806,429923500,1989
Indiana Jones and the Temple of Doom,Paramount,179870271,465735500,1984


## Multinomial Distribution

In [6]:
# Sample space for rolling two dice: one row per ordered (First, Second)
# pair of faces, each with the same chance of 1/36.
faces = np.arange(1, 7)
two_dice = Table(['First', 'Second', 'Chance'])
for first in faces:
    for second in faces:
        outcome = [first, second, 1/36]
        two_dice.append(outcome)
# Show the chances as percentages with one decimal place.
two_dice.set_format('Chance', PercentFormatter(1))

First,Second,Chance
1,1,2.8%
1,2,2.8%
1,3,2.8%
1,4,2.8%
1,5,2.8%
1,6,2.8%
2,1,2.8%
2,2,2.8%
2,3,2.8%
2,4,2.8%


In [7]:
# Element-wise total of the two dice: one entry per row of two_dice.
dice_sums = two_dice.column('First') + two_dice.column('Second')
# Passing a boolean array to `where` keeps the rows where it is True.
sum_of_5 = two_dice.where(dice_sums == 5)
sum_of_5

First,Second,Chance
1,4,2.8%
2,3,2.8%
3,2,2.8%
4,1,2.8%


In [8]:
# The chance of an event is the sum of the chances of its outcomes.
sum(sum_of_5.column('Chance'))

0.1111111111111111

In [9]:
def P(event):
    """Return the total chance of an event.

    `event` is a table of outcomes with a 'Chance' column; the result is the
    sum of the values in that column.
    """
    chances = event.column('Chance')
    return sum(chances)

# Chance that the two dice sum to 5, computed with the helper.
P(sum_of_5)

0.1111111111111111

In [10]:
# Attach the per-outcome total as a 'Sum' column so outcomes can be
# selected and grouped by their sum.
with_sums = two_dice.with_column('Sum', dice_sums)
with_sums

First,Second,Chance,Sum
1,1,2.8%,2
1,2,2.8%,3
1,3,2.8%,4
1,4,2.8%,5
1,5,2.8%,6
1,6,2.8%,7
2,1,2.8%,3
2,2,2.8%,4
2,3,2.8%,5
2,4,2.8%,6


In [11]:
# With no aggregation function, group counts the outcomes for each Sum.
with_sums.group('Sum')

Sum,count
2,1
3,2
4,3
5,4
6,5
7,6
8,5
9,4
10,3
11,2


In [15]:
# Distribution of the sum: add up the chances of the outcomes that share
# each Sum (the built-in `sum` is the aggregation function).
grouped = with_sums.select(['Sum', 'Chance']).group('Sum', sum)
# Rename the aggregated column (index 1) back to 'Chance' and show it as a
# percentage.
two_dice_sums = grouped.relabeled(1, 'Chance').set_format('Chance', PercentFormatter(1))
two_dice_sums

Sum,Chance
2,2.8%
3,5.6%
4,8.3%
5,11.1%
6,13.9%
7,16.7%
8,13.9%
9,11.1%
10,8.3%
11,5.6%


In [16]:
# Chance of a sum of 8, computed from the individual outcomes.
P(with_sums.where('Sum', 8))

0.1388888888888889

In [17]:
# Same chance, read from the grouped distribution of sums.
P(two_dice_sums.where('Sum', 8))

0.1388888888888889

### U.S. Birth Times

In [18]:
# Load the birth-time data, keeping only the Time, Hour and Chance columns.
birth = Table.read_table('birth_time.csv').select(['Time', 'Hour', 'Chance'])
# Format the chances as one-decimal percentages and display the table.
birth.set_format('Chance', PercentFormatter(1)).show()

Time,Hour,Chance
6 a.m.,6,2.9%
7 a.m.,7,4.5%
8 a.m.,8,6.3%
9 a.m.,9,5.0%
10 a.m.,10,5.0%
11 a.m.,11,5.0%
Noon,12,6.0%
1 p.m.,13,5.7%
2 p.m.,14,5.1%
3 p.m.,15,4.8%


In [19]:
# Births in the hours 8 through 17 (8 a.m. up to, but not including, 6 p.m.).
business_hours = birth.where('Hour', are.between(8, 18))
business_hours

Time,Hour,Chance
8 a.m.,8,6.3%
9 a.m.,9,5.0%
10 a.m.,10,5.0%
11 a.m.,11,5.0%
Noon,12,6.0%
1 p.m.,13,5.7%
2 p.m.,14,5.1%
3 p.m.,15,4.8%
4 p.m.,16,4.9%
5 p.m.,17,5.0%


In [20]:
# Total chance of a birth during business hours.
P(business_hours)

0.52800000000000002

In [21]:
# Total chance of a birth in the hours 0 through 5 (before 6 a.m.).
P(birth.where('Hour', are.between(0, 6)))

0.16799999999999998

## Conditional Distributions



In [22]:
# Conditional distribution of birth hour given a business-hours birth:
# divide each chance by the total chance of business hours so the rescaled
# chances sum to 1.
given_business_hours = business_hours.with_column('Chance', 
                                                  business_hours.column('Chance') / P(business_hours))
given_business_hours

Time,Hour,Chance
8 a.m.,8,11.9%
9 a.m.,9,9.5%
10 a.m.,10,9.5%
11 a.m.,11,9.5%
Noon,12,11.4%
1 p.m.,13,10.8%
2 p.m.,14,9.7%
3 p.m.,15,9.1%
4 p.m.,16,9.3%
5 p.m.,17,9.5%


In [23]:
def given(event):
    """Return the conditional distribution given that `event` occurred.

    Each value in the 'Chance' column of `event` is divided by the event's
    total chance, P(event). The rescaled values are placed in the 'Chance'
    column of the table returned by `with_column`.
    """
    chances = event.column('Chance')
    rescaled = chances / P(event)
    return event.with_column('Chance', rescaled)

In [24]:
# The helper reproduces the table built by hand for business hours.
given(business_hours)

Time,Hour,Chance
8 a.m.,8,11.9%
9 a.m.,9,9.5%
10 a.m.,10,9.5%
11 a.m.,11,9.5%
Noon,12,11.4%
1 p.m.,13,10.8%
2 p.m.,14,9.7%
3 p.m.,15,9.1%
4 p.m.,16,9.3%
5 p.m.,17,9.5%


In [25]:
# Chance of a birth before noon, given that it is during business hours.
P(given(business_hours).where('Hour', are.below(12)))

0.40340909090909094

In [26]:
# Unconditional chance of a birth in the hours 8 through 11: the part of
# business hours that falls before noon.
morning = birth.where('Hour', are.between(8, 12))
P(morning)

0.21300000000000002

In [27]:
# Unconditional chance of any birth before noon. This also includes hours
# earlier than 8, so it is larger than P(morning).
P(birth.where('Hour', are.below(12)))

0.45500000000000007

In [28]:
# Conditional chance as a ratio:
# P(morning and business hours) / P(business hours).
P(morning) / P(business_hours)

0.40340909090909094

In [29]:
# Business-hours outcomes before noon. The chances are still the original,
# unconditioned values.
business_hours.where('Hour', are.below(12))

Time,Hour,Chance
8 a.m.,8,6.3%
9 a.m.,9,5.0%
10 a.m.,10,5.0%
11 a.m.,11,5.0%


In [30]:
# Same conditional chance, computed by restricting business_hours directly.
P(business_hours.where('Hour', are.below(12))) / P(business_hours)

0.40340909090909094

## Discussion Question

In [31]:
# Condition on births at noon or later (Hour above 11), then find the chance
# that such a birth happens before 6 p.m. (Hour below 18).
after_noon = birth.where('Hour', are.above(11))
P(given(after_noon).where('Hour', are.below(18)))

0.57798165137614688

## Joint Distributions

In [32]:
# Revisit the two-dice table: each row gives the chance of a (First, Second) pair.
two_dice

First,Second,Chance
1,1,2.8%
1,2,2.8%
1,3,2.8%
1,4,2.8%
1,5,2.8%
1,6,2.8%
2,1,2.8%
2,2,2.8%
2,3,2.8%
2,4,2.8%


In [33]:
# Reload the birth-time data, dropping the overall 'Chance' column so the
# per-day-type columns (Weekday, Weekend) remain.
birth_day = Table.read_table('birth_time.csv').drop('Chance')
# Columns at index 2 and 3 (Weekday, Weekend) are shown as percentages.
birth_day.set_format([2, 3], PercentFormatter(1))

Time,Hour,Weekday,Weekend
6 a.m.,6,2.7%,3.6%
7 a.m.,7,4.7%,3.8%
8 a.m.,8,6.7%,4.6%
9 a.m.,9,5.1%,5.0%
10 a.m.,10,5.0%,5.0%
11 a.m.,11,5.0%,4.9%
Noon,12,6.3%,5.0%
1 p.m.,13,5.9%,4.7%
2 p.m.,14,5.3%,4.6%
3 p.m.,15,4.9%,4.6%


In [34]:
# Split into one (Hour, Chance) table per day type, renaming the selected
# column (index 1) to 'Chance'.
# NOTE(review): presumably each is the distribution of birth hour within that
# day type -- confirm against the data source.
weekday = birth_day.select(['Hour', 'Weekday']).relabeled(1, 'Chance')
weekend = birth_day.select(['Hour', 'Weekend']).relabeled(1, 'Chance')

In [35]:
# Joint distribution over (Day, Hour): scale each day type's hourly chances
# by that day type's weight. The two weights add up to 1.
# NOTE(review): the weights look like the overall share of births falling on
# weekdays vs. weekends -- confirm their source.
birth_joint = Table(['Day', 'Hour', 'Chance'])
for day, hourly, weight in [('Weekday', weekday, 0.7825),
                            ('Weekend', weekend, 0.2175)]:
    for row in hourly.rows:
        birth_joint.append([day, row.item('Hour'), row.item('Chance') * weight])
birth_joint.set_format('Chance', PercentFormatter(1))

Day,Hour,Chance
Weekday,6,2.1%
Weekday,7,3.7%
Weekday,8,5.2%
Weekday,9,4.0%
Weekday,10,3.9%
Weekday,11,3.9%
Weekday,12,4.9%
Weekday,13,4.6%
Weekday,14,4.1%
Weekday,15,3.8%


In [36]:
# Sanity check: the chances of a joint distribution should total 1.
P(birth_joint)

1.0

In [37]:
# Chaining `where` intersects the conditions: a weekday birth in hours 8 through 11.
P(birth_joint.where('Day', 'Weekday').where('Hour', are.between(8, 12)))

0.17058499999999999

In [38]:
# Joint-distribution rows for births in hour 5, one per day type.
early_morning = birth_joint.where('Hour', 5)
early_morning

Day,Hour,Chance
Weekday,5,2.0%
Weekend,5,0.8%


In [39]:
# Chance that the day is a weekend, given that the birth was in hour 5.
P(given(early_morning).where('Day', 'Weekend'))

0.28584466551063248

### Bayes' Rule: Diagnostic Example

In a population, there is a rare disease. Researchers have developed a medical test for the disease. Mostly, the test correctly identifies whether or not the tested person has the disease. But sometimes, the test is wrong. Here are the relevant proportions.

- 1% of the population has the disease
- If a person has the disease, the test returns the correct result with chance 99%.
- If a person does not have the disease, the test returns the correct result with chance 99.5%.

**One person is picked at random from the population.** Given that the person tests positive, what is the chance that the person has the disease?

We begin by partitioning the population into four categories in the tree diagram below.

<img src="disease1.png" />

By Bayes' Rule, the chance that the person has the disease given that he or she has tested positive is the chance of the top "Test Positive" branch relative to the total chance of the two "Test Positive" branches. The answer is
$$
\frac{0.01 \times 0.99}{0.01 \times 0.99 ~+~ 0.99 \times 0.005} ~\approx~ 0.667
$$

In [40]:
# The person is picked at random from the population.

# By Bayes' Rule:
# Chance that the person has the disease, given that test was +
#   numerator:   P(disease) * P(+ | disease)        = 0.01 * 0.99
#   denominator: numerator + P(no disease) * P(+ | no disease)
#                where the second term is 0.99 * 0.005

(0.01*0.99)/(0.01*0.99 + 0.99*0.005)

0.6666666666666666

In [41]:
# Joint distribution of health status and test result. Each chance is
# P(health status) * P(test result | health status).
rare = Table(['Health', 'Test', 'Chance']).with_rows([
        ['Diseased', 'Positive', 0.01 * 0.99],        # diseased, test correct
        ['Diseased', 'Negative', 0.01 * 0.01],        # diseased, test wrong
        ['Not Diseased', 'Positive', 0.99 * 0.005],   # not diseased, test wrong
        ['Not Diseased', 'Negative', 0.99 * 0.995]    # not diseased, test correct
    ])
rare

Health,Test,Chance
Diseased,Positive,0.0099
Diseased,Negative,0.0001
Not Diseased,Positive,0.00495
Not Diseased,Negative,0.98505


In [42]:
# Condition on a positive test, then read off the chance of being diseased.
# This matches the Bayes' Rule calculation done by hand.
positive = rare.where('Test', 'Positive')
P(given(positive).where('Health', 'Diseased'))

0.66666666666666663

If the prior probability that the person has the disease were 10% instead of 1%, then the probabilities on the left side of the tree diagram would change accordingly, with the 0.1 and 0.9 now interpreted as subjective probabilities:

<img src="disease10.png" />

The change has a noticeable effect on the answer, as you can see by running the cell below.

In [43]:
# Subjective prior probability of 10% that the person has the disease

# By Bayes' Rule:
# Chance that the person has the disease, given that test was +
#   Only the prior changes (0.1 diseased, 0.9 not diseased); the test's
#   chances (0.99 and 0.005) are the same as before.

(0.1*0.99)/(0.1*0.99 + 0.9*0.005)

0.9565217391304347

If the patient's prior probability of having the disease is 50%, then the result changes yet again.

<img src="disease50.png" />

In [44]:
# Subjective prior probability of 50% that the person has the disease

# By Bayes' Rule: 
# Chance that the person has the disease, given that test was +
#   Prior is now 0.5 diseased and 0.5 not diseased; the test's chances
#   (0.99 and 0.005) are unchanged.

(0.5*0.99)/(0.5*0.99 + 0.5*0.005)

0.9949748743718593