In [2]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

import warnings
warnings.simplefilter("ignore")

In [3]:
def create_population(prior_disease_prob, n, false_positive_rate=0.05, sensitivity=1.0):
    """Build a synthetic patient population and cross-tabulate status vs. test result.

    Parameters
    ----------
    prior_disease_prob : float
        Fraction of the population that has the disease.
    n : int
        Target population size. The diseased and non-diseased group sizes are
        rounded separately, so the realised total can differ slightly from n.
    false_positive_rate : float, optional
        P(Test + | No disease). Defaults to 0.05.
    sensitivity : float, optional
        P(Test + | Disease). Defaults to 1.0, i.e. every diseased patient
        tests positive.

    Returns
    -------
    The pivot of the generated table: one row per 'Status', one column per
    'Test Result', with counts in the cells.
    """
    disease = round(n * prior_disease_prob)
    no_disease = round(n * (1 - prior_disease_prob))

    # Round only one side of each split and derive the other by subtraction,
    # so 'Status' and 'Test Result' always have the same number of rows.
    true_positives = round(disease * sensitivity)
    false_negatives = disease - true_positives
    false_positives = round(no_disease * false_positive_rate)
    true_negatives = no_disease - false_positives

    # Row order must match between the two columns: diseased rows first,
    # then non-diseased rows.
    status = np.array(['Disease'] * disease + ['No disease'] * no_disease)
    result = np.array(
        ['Test +'] * true_positives + ['Test -'] * false_negatives
        + ['Test +'] * false_positives + ['Test -'] * true_negatives
    )

    t = Table().with_columns(
        'Status', status,
        'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

## New material

### Here's a scenario (college course)

Here is the data for this example. Verify that the data matches the question statement.

In [7]:
# Class of 100 students: 60% second-years, 40% third-years.
n = 100
second = round(0.6 * n)
third = round(0.4 * n)

# One 'Year' label per student: all second-years first, then all third-years.
year = np.repeat(['Second', 'Third'], [second, third])

# 'Major' labels in the same row order as `year`:
# second-years are split 50/50, third-years 80/20 (declared/undeclared).
major = np.repeat(
    ['Declared', 'Undeclared', 'Declared', 'Undeclared'],
    [
        round(second * 0.5),
        round(second * 0.5),
        round(third * 0.8),
        round(third * 0.2),
    ],
)

students = Table().with_columns('Year', year, 'Major', major)

In [8]:
# Display the first 3 rows of the table.
students.show(3)

Year,Major
Second,Declared
Second,Declared
Second,Declared


In [9]:
# Count students for every (Year, Major) pair: one row per Year, one column per Major.
students.pivot('Major', 'Year')

Year,Declared,Undeclared
Second,30,30
Third,32,8


Given that a randomly chosen student has declared a major, which is more likely: that the student is a second-year or a third-year?

$\mathbb{P}(\text{Third|Declared}) = \frac{32}{62}$

In [13]:
# P(Third | Declared): 32 declared third-years out of 30 + 32 = 62 declared students.
32/62

0.5161290322580645

$\mathbb{P}({\text{Second|Declared}}) = \frac{30}{62}$

In [14]:
# P(Second | Declared): 30 declared second-years out of 30 + 32 = 62 declared students.
30/62

0.4838709677419355

### Here's a second scenario (Doctors and clinical tests)

Create a population of size 10,000 in which the prevalence of the disease is $\frac{1}{1000}$. The function builds the dataset and then creates the pivot table for us.

**Challenge Question**: What are the dimensions (rows and columns) of the dataset from which the pivot table was created?

In [11]:
# Prior disease probability 1/1000, population size 10000.
create_population(1/1000, 10000)

Status,Test +,Test -
Disease,10,0
No disease,500,9490


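The probability we calculated, $\mathbb{P}(\text{Disease|Test +})$, is the number of diseased patients who test positive divided by the number of all patients who test positive: $\frac{10}{10 + 500} = \frac{10}{510}$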

In [12]:
# P(Disease | Test +) = diseased positives / all positives = 10 / (10 + 500).
10/510

0.0196078431372549

### Changing the prior can change our classification

$$\mathbb{P}(\text{Disease|Test +}) = \frac{\mathbb{P}(\text{Test +|Disease})\,\mathbb{P}(\text{Disease})}{\mathbb{P}(\text{Test +|Disease})\,\mathbb{P}(\text{Disease}) + \mathbb{P}(\text{Test +|No Disease})\,\mathbb{P}(\text{No Disease})}$$

#### "Assume a patient is selected at random"

In [34]:
# Bayes' rule with prior 0.001, P(Test + | Disease) = 1 and P(Test + | No disease) = 0.05.
(0.001 * 1) / (0.001*1 + 0.999*0.05)

0.019627085377821395

In [17]:
# Prior probability of disease for a patient selected at random.
random_selection_prior = 1/1000

In [21]:
# Posterior P(Disease | Test +) by Bayes' rule; 1 and 0.05 are the positive-test rates for diseased and non-diseased patients.
(random_selection_prior * 1) / (random_selection_prior*1 + (1-random_selection_prior)*0.05)

0.019627085377821395

In [25]:
# Build a population of 10000 with this prior to check the calculation by counting.
create_population(random_selection_prior, 10000)

Status,Test +,Test -
Disease,10,0
No disease,500,9490


In [24]:
# P(Disease | Test +) = diseased positives / all positives = 10 / (10 + 500).
10/510

0.0196078431372549

#### One doctor's prior

In [26]:
# This doctor's prior probability of disease: 100 in 1000.
one_doctors_prior = 100/1000

In [27]:
# Posterior P(Disease | Test +) by Bayes' rule under this doctor's prior.
(one_doctors_prior * 1) / (one_doctors_prior*1 + (1-one_doctors_prior)*0.05)

0.689655172413793

In [28]:
# Build a population of 10000 with this prior to check the calculation by counting.
create_population(one_doctors_prior, 10000)

Status,Test +,Test -
Disease,1000,0
No disease,450,8550


In [29]:
# From the pivot table: 1000 diseased positives out of 1000 + 450 = 1450 positives.
1000/1450

0.6896551724137931

#### Another doctor's prior

In [30]:
# Another doctor's prior probability of disease: 500 in 1000.
another_doctors_prior = 500/1000

In [31]:
# Posterior P(Disease | Test +) by Bayes' rule under this doctor's prior.
(another_doctors_prior * 1) / (another_doctors_prior*1 + (1-another_doctors_prior)*0.05)

0.9523809523809523

In [32]:
# Pass the named prior (equal to 0.5) so this check stays in sync with the Bayes' rule cell above.
create_population(another_doctors_prior, 10000)

Status,Test +,Test -
Disease,5000,0
No disease,250,4750


In [33]:
# From the pivot table: 5000 diseased positives out of 5000 + 250 = 5250 positives.
5000/5250

0.9523809523809523