In [None]:
%matplotlib inline
import pymc3 as pm
import numpy as np
import pandas as pd
import theano

from scipy import stats
from sklearn.metrics import mean_squared_error
import math

import matplotlib.pyplot as plt
import arviz as az

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the semicolon-delimited NWO grants table and, in the same chain, append
# the helper columns used further down:
#   gid  - integer code per distinct `gender` value (order of first appearance)
#   apps - plain copy of `applications` under a shorter name
#   disc - integer code per distinct `discipline` value (order of first appearance)
d = (
    pd.read_csv('./data/NWOGrants.csv', sep=';', header=0)
    .assign(
        gid=lambda frame: pd.factorize(frame['gender'])[0],
        apps=lambda frame: frame['applications'],
        disc=lambda frame: pd.factorize(frame['discipline'])[0],
    )
)
d

Unnamed: 0,discipline,gender,applications,awards,gid,apps,disc
0,Chemical sciences,m,83,22,0,83,0
1,Chemical sciences,f,39,10,1,39,0
2,Physical sciences,m,135,26,0,135,1
3,Physical sciences,f,39,9,1,39,1
4,Physics,m,67,18,0,67,2
5,Physics,f,9,2,1,9,2
6,Humanities,m,230,33,0,230,3
7,Humanities,f,166,32,1,166,3
8,Technical sciences,m,189,30,0,189,4
9,Technical sciences,f,62,13,1,62,4


## Exercise 1

##### DAG: [ G -> D -> A ] [ G -> A ], where G = gender, D = discipline, A = award

In [None]:
# Integer lookup arrays (one entry per table row) used to pick the gender /
# discipline parameter that applies to each row of `d`.
gidx, didx = (pd.Categorical(d[column]).codes for column in ("gid", "disc"))
# Show both lengths as a quick sanity check.
gidx.shape[0], didx.shape[0]

(18, 18)

In [None]:
# Fixed seed: every sampling call below draws random numbers, so without it the
# summaries printed further down change on each Restart & Run All.
RANDOM_SEED = 8927

# m1: gender is the only predictor of the award probability.
with pm.Model() as m1:
    # Prior: one intercept per gender code, on the log-odds scale
    ag = pm.Normal("ag", 0, 1.5, shape=d.gid.nunique())

    # Per-row award probability, looked up through the row's gender index
    p = pm.Deterministic("p", pm.math.invlogit(ag[gidx]))

    # Likelihood: number of awards out of `apps` applications in each row
    award = pm.Binomial("award", n=d.apps, p=p, observed=d.awards)

    # Sample: prior predictive, posterior draws, posterior predictive.
    # trace_m1 is left in pm.sample's default return format because later
    # cells index it as trace_m1["ag"].
    prior_m1 = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    trace_m1 = pm.sample(random_seed=RANDOM_SEED)
    posterior_m1 = pm.sample_posterior_predictive(trace_m1, random_seed=RANDOM_SEED)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [ag]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1 seconds.


In [None]:
# Fixed seed: every sampling call below draws random numbers, so without it the
# summaries printed further down change on each Restart & Run All.
RANDOM_SEED = 8927

# m2: award probability depends on gender AND discipline (additive on the
# log-odds scale).
with pm.Model() as m2:
    # Priors: one intercept per gender code, one offset per discipline code
    ag = pm.Normal("ag", 0, 1.5, shape=d.gid.nunique())
    bd = pm.Normal("bd", 0, 1.5, shape=d.disc.nunique())

    # Per-row award probability. Only the sum ag + bd enters the likelihood, so
    # shifting every ag up and every bd down by the same constant leaves p
    # unchanged; the two vectors are separated by their priors alone. This is
    # the likely reason the sampler reports a low effective sample size here.
    p = pm.Deterministic("p", pm.math.invlogit(ag[gidx] + bd[didx]))

    # Likelihood: number of awards out of `apps` applications in each row
    award = pm.Binomial("award", n=d.apps, p=p, observed=d.awards)

    # Sample: prior predictive, posterior draws, posterior predictive.
    # trace_m2 is left in pm.sample's default return format because later
    # cells index it as trace_m2["ag"].
    prior_m2 = pm.sample_prior_predictive(random_seed=RANDOM_SEED)
    trace_m2 = pm.sample(random_seed=RANDOM_SEED)
    posterior_m2 = pm.sample_posterior_predictive(trace_m2, random_seed=RANDOM_SEED)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [bd, ag]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 3 seconds.
The number of effective samples is smaller than 10% for some parameters.


In [None]:
# Posterior summary of the gender intercepts from m1 (log-odds scale).
az.summary(
    data=trace_m1,
    var_names=["ag"],
)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
ag[0],-1.531,0.066,-1.663,-1.417,0.001,0.001,3242.0,3239.0,3257.0,2709.0,1.0
ag[1],-1.74,0.082,-1.89,-1.586,0.001,0.001,3308.0,3260.0,3328.0,2788.0,1.0


In [None]:
# Posterior summary of the gender intercepts from m2 (log-odds scale).
az.summary(
    data=trace_m2,
    var_names=["ag"],
)

Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_mean,ess_sd,ess_bulk,ess_tail,r_hat
ag[0],-1.111,0.478,-2.012,-0.211,0.026,0.018,343.0,343.0,340.0,378.0,1.01
ag[1],-1.25,0.481,-2.165,-0.371,0.026,0.018,352.0,352.0,350.0,398.0,1.01


In [None]:
from scipy.special import expit as logistic

In [None]:
# Gender contrast for m1: intercept of gender code 0 ('m' in the table printed
# after loading) minus intercept of gender code 1 ('f').
#   diff_a - difference on the log-odds scale
#   diff_p - difference after mapping each intercept to a probability
ag_m1 = trace_m1["ag"]
diff_a = ag_m1[:, 0] - ag_m1[:, 1]
diff_p = logistic(ag_m1[:, 0]) - logistic(ag_m1[:, 1])
az.summary(dict(diff_a=diff_a, diff_p=diff_p), kind="stats")

Unnamed: 0,mean,sd,hdi_3%,hdi_97%
diff_a,0.209,0.105,0.008,0.4
diff_p,0.028,0.014,0.003,0.055


In [None]:
# Gender contrast for m2: intercept of gender code 0 ('m' in the table printed
# after loading) minus intercept of gender code 1 ('f').
#   diff_a - difference on the log-odds scale
#   diff_p - difference after mapping each intercept to a probability; the
#            discipline offsets `bd` are not added, so this is the contrast at
#            a discipline offset of zero
ag_m2 = trace_m2["ag"]
diff_a = ag_m2[:, 0] - ag_m2[:, 1]
diff_p = logistic(ag_m2[:, 0]) - logistic(ag_m2[:, 1])
az.summary(dict(diff_a=diff_a, diff_p=diff_p), kind="stats")

Unnamed: 0,mean,sd,hdi_3%,hdi_97%
diff_a,0.139,0.107,-0.061,0.338
diff_p,0.025,0.02,-0.012,0.063
