In [143]:
import numpy as np
import pandas as pd

from empiricaldist import Pmf

import src.helpers as hlp
from src.helpers import prob, conditional, empty_df

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Chapter 1 -- Probability

## Problem 1

In [84]:
# Download and read data
hlp.download('https://github.com/AllenDowney/ThinkBayes2/raw/master/data/gss_bayes.csv')
gss = pd.read_csv('data/gss_bayes.csv', index_col=0)
gss.head(3)

Unnamed: 0_level_0,year,age,sex,polviews,partyid,indus10
caseid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1974,21.0,1,4.0,2.0,4970.0
2,1974,41.0,1,5.0,0.0,9160.0
5,1974,58.0,2,6.0,1.0,2670.0


In [85]:
# Create required variables
female = gss['sex'] == 2
banker = gss['indus10'] == 6870
liberal = gss['polviews'] <= 3
democrat = gss['partyid'] <= 1
is31 = gss['age'] == 31

In [86]:
p = prob(is31 & female & banker)
print(f"P(31 and female and banker) = {p:.5f}")

P(31 and female and banker) = 0.00026


In [77]:
p = prob(is31 & female & banker & democrat & liberal)
print(f"P(31 and female and banker and liberal and democrat) = {p:.4f}")

P(31 and female and banker and liberal and democrat) = 0.0001


## Problem 2

In [79]:
p = liberal[democrat].mean()
assert p == conditional(liberal, democrat)
print(f"P(liberal|democrat) = {p:.3f}")

P(liberal|democrat) = 0.389


In [80]:
p = democrat[liberal].mean()
print(f"P(demo|liberal) = {p:.3f}")

P(demo|liberal) = 0.521


## Problem 3

In [88]:
# Create required variables
young = gss.age < 30
old = gss.age >= 65
conservative = gss.polviews >= 5


In [97]:
p = prob(young & liberal)
print('P(young & liberal) =', p)

P(young & liberal) = 0.06579427875836884


In [93]:
p = prob(liberal[young])
print('P(liberal|young) =', p)

P(liberal|young) = 0.338517745302714


In [96]:
p = prob(conservative & old)
print('p(conservative & old) =', p)

p(conservative & old) = 0.06701156421180766


In [99]:
p = prob(old[conservative])
print('P(old|conservative) =', p)

P(old|conservative) = 0.19597721609113564


## Chapter 2 -- Bayes's Theorem

### Problem 1

In [131]:
df = pd.DataFrame(index=['Regular coin', 'Trick coin'])
df

Regular coin
Trick coin


In [132]:
df['prior'] = 0.5
df

Unnamed: 0,prior
Regular coin,0.5
Trick coin,0.5


In [127]:
df['likelihood'] = 0.5, 1
df

Unnamed: 0,prior,likelihood
Regular coin,0.5,0.5
Trick coin,0.5,1.0


In [128]:
df['normalisation'] = (df['prior'] * df['likelihood']).sum()
df

Unnamed: 0,prior,likelihood,normalisation
Regular coin,0.5,0.5,0.75
Trick coin,0.5,1.0,0.75


In [129]:
df['posterior'] = df['likelihood'] * df['prior'] / df['normalisation']
df

Unnamed: 0,prior,likelihood,normalisation,posterior
Regular coin,0.5,0.5,0.75,0.333333
Trick coin,0.5,1.0,0.75,0.666667


## Exercise 2

In [140]:
df = pd.DataFrame(index=['1 girl only', '2 Girls'])
df['prior'] = 1/2, 1/4
df['likelihood'] = 1/2
df['unnorm'] = df['prior'] * df['likelihood']
df['posterior'] = df['unnorm'] / df['unnorm'].sum()
df

Unnamed: 0,prior,likelihood,unnorm,posterior
1 girl only,0.5,0.5,0.25,0.666667
2 Girls,0.25,0.5,0.125,0.333333


- Intuitively, I first thought that the answer must be 1/2, since that's the probability of any child being a girl.
- However, the crucial bit here is that we only know that one of the children is a girl and not which one.
- The outcome space is `{BB, BG, GB, GG}`.
- If we knew that the first one was a girl, then the outcome space would be reduced to `{GB, GG}`, and `P(GG) = 1/2`, and similarly for the second child being a girl.
- If we only know that one child is a girl, the outcome space is reduced to `{BG, GB, GG}`, and `P(GG) = 1/3`.
- A more transparent way to get there is to start with the full outcome space and use appropriate likelihoods, as shown below.


In [156]:
df = pd.DataFrame(index=['BB', 'BG', 'GB', 'GG'])
df['prior'] = 1/4
df['likelihood'] = [0, 1/3, 1/3, 1/3]
df['unnorm'] = df['prior'] * df['likelihood']
df['posterior'] = df['unnorm'] / df['unnorm'].sum()
df

Unnamed: 0,prior,likelihood,unnorm,posterior
BB,0.25,0.0,0.0,0.0
BG,0.25,0.333333,0.083333,0.333333
GB,0.25,0.333333,0.083333,0.333333
GG,0.25,0.333333,0.083333,0.333333


## Problem 3

In [169]:
# Classic Monty Hall problem
# I choose door 1, Monty opens door 2, should I switch to door 3?

# If the car is behind door 1, Monty has to open door 2 or 3
# If the car is behind door 2, Monty has to open door 3
# If the car is behind door 3, Monty has to open door 2

df = pd.DataFrame(index=['Door 1', 'Door 2', 'Door 3'])
df['Prior'] = 1/3
df['likelihood'] = 1/2, 0, 1
df['unnorm'] = df['Prior'] * df['likelihood']
df['posterior'] = df['unnorm'] / df['unnorm'].sum()
df

Unnamed: 0,Prior,likelihood,unnorm,posterior
Door 1,0.333333,0.5,0.166667,0.333333
Door 2,0.333333,0.0,0.0,0.0
Door 3,0.333333,1.0,0.333333,0.666667


I got tripped up here initially! The proper way to assign likelihoods is by simply asking: "If the care were behind door x, what would be the probability of Monty opening the door he actually opens". More generically, it makes clear why thinking in terms of hypotheses is useful, because we can simply ask "If the hypothesis were true, what would be the probability of the data we observe".

In [167]:
# Modified Monty Hall problem
# Monty opens door 2 whenever possible, else door 3 
# I choose door 1, Monty opens door 2, should I switch to door 3?

# If the car is behind door 1, Monty opens door 2
# If the car is behind door 2, Monty opens door 3
# If the car is behind door 3, Monty opens door 2

df = empty_df(index=['Door 1', 'Door 2', 'Door 3'])
df['Prior'] = 1/3
df['likelihood'] = 1, 0, 1
df['unnorm'] = df['Prior'] * df['likelihood']
df['Posterior'] = df['unnorm'] / df['unnorm'].sum()
df

Unnamed: 0,Prior,likelihood,unnorm,Posterior
Door 1,0.333333,1,0.333333,0.5
Door 2,0.333333,0,0.0,0.0
Door 3,0.333333,1,0.333333,0.5


In [170]:
# Modified Monty Hall problem
# Monty opens door 2 whenever possible, else door 3 
# I choose door 1, Monty opens door 3, should I switch to door 2?

# If the care is behind door 1, Monty opens door 2
# If the care is behind door 2, Monty opens door 3
# If the care is behind door 3, Monty opens door 2

df = empty_df(index=['Door 1', 'Door 2', 'Door 3'])
df['Prior'] = 1/3
df['likelihood'] = 0, 1, 0
df['unnorm'] = df['Prior'] * df['likelihood']
df['posterior'] = df['unnorm'] / df['unnorm'].sum()
df

Unnamed: 0,Prior,likelihood,unnorm,posterior
Door 1,0.333333,0,0.0,0.0
Door 2,0.333333,1,0.333333,1.0
Door 3,0.333333,0,0.0,0.0


# Problem 4