In [59]:
import pandas as pd
from math import log
import numpy as np
from scipy.stats import beta

In [3]:
df = pd.read_csv('framingham_heart_disease.csv')

# Part A

# Hypothesis question:
## Third Experimental question - Does BMI (X) change between gender (Y)

## Question 1

In [18]:
df_observed = df.sample(n=100, random_state=1)
df_old_observation = df.drop(df_observed.index).sample(n=1000, random_state=1)

## Question 2 a

Let's define $\tau$ as the median of the bmi, and j=1 as male and j=2 as female

In [21]:
tau = df_old_observation["BMI"].median()

As we seen in class, the MLE estimator of $\psi$ is $log(\frac{(n - {X^{2}_{\tau}}){X^{1}_{\tau}}}{(m - {X^{1}_{\tau}}){X^{2}_{\tau}}})$ when:<br> m is number of male <br> n is number of female <br> $X^{1}_{\tau}=\sum_{i=1}^{m}I\{x_{i}^1>\tau\}$<br> $X^{2}_{\tau}=\sum_{i=1}^{n}I\{x_{i}^{2}>\tau\}$.

In [36]:
def calcualte_phi_mle(temp_df, tau):
    x_1 = len(temp_df[(temp_df["male"] == 1) & (temp_df["BMI"] > tau)]["BMI"])
    x_2 = len(temp_df[(temp_df["male"] == 0) & (temp_df["BMI"] > tau)]["BMI"])
    m = len(temp_df[temp_df["male"] == 1])
    n = len(temp_df[temp_df["male"] == 0])
    phi = log((x_1*(n - x_2))/(x_2)*(m- x_1))
    return phi, n, m

In [37]:
phi, n, m = calcualte_phi_mle(df, tau)

In [38]:
print(f"number of male: {n}, number of female: {m}")

number of male: 2419, number of female: 1819


In [39]:
phi

13.96508385570071

We can see that log Odd ratio is bigger then zero, from that can can infer that the males BMI are more likely to be bigger then $\tau$ then females BMI

Let's find the confident interval of $\psi$ using (quantile) Bootsrap

In [54]:
B = 500
alpha = 0.05
quantile_list = []
for _ in range(B):
    temp_df = df.sample(n=len(df), replace=True)
    phi, _, _ = calcualte_phi_mle(temp_df, tau)
    quantile_list.append(phi)
a = np.quantile(quantile_list, q=(alpha/2))
b = np.quantile(quantile_list, q=(1-(alpha/2)))
print(f"The confident interval is: [{a}, {b}]")

The confident interval is: [13.855689547347927, 14.069286716811986]


And indeed, the MLE estimator that we just calculate fall into the interval. More over we can see that the confident interval is short and informative, it suggest that the BMI of male is more likely to be above $\tau$ then BMI of female

## Question 2 b

We will assume that: <br> $\pi(p_1, p_2)=1I\{1\ge{p_1}\ge0\}$, <br> $\pi(p_2)=1I\{1\ge{p_2}\ge0\}$ <br> ${\mathcal{L}}_m(p_1)={p_1}^{X^{1}_{\tau}}{(1-p_1)}^{m-X^{1}_{\tau}}$ <br> ${\mathcal{L}}_n(p_2)={p_2}^{X^{2}_{\tau}}{(1-p_2)}^{n-X^{2}_{\tau}}$ <br> and as we seen in lecture the postriors are: <br> $p1|Z\sim{Beta(X^{1}_{\tau} + 1, m - X^{1}_{\tau} +1)}$ <br> $p2|Z\sim{Beta(X^{2}_{\tau} + 1, n - X^{2}_{\tau} +1)}$<br> We will compute the estimator using the excpectation on the postirior i.e : <br> $E[p_1|Z]=\frac{X^{1}_{\tau} + 1}{m + 2}$ <br> $E[p_2|Z]=\frac{X^{2}_{\tau} + 1}{n + 2}$

In [77]:
def simulate_p_beta(x_1, x_2, m, n, jeffery=False):
    a1 = x_1 if jeffery else x_1 + 1
    b1 = m - x_1 if jeffery else m - x_1 + 3
    a2 = x_2 if jeffery else x_2 + 1 
    b2 = m - x_2 if jeffery else  m - x_2 + 3
    return beta.rvs(a1, b1), beta.rvs(a2, b2)

In [78]:
B = 10000
psi_list = []
x_1 = len(df_observed[(df_observed["male"] == 1) & (df_observed["BMI"] > tau)]["BMI"])
x_2 = len(df_observed[(df_observed["male"] ==0) & (df_observed["BMI"] > tau)]["BMI"])
m = len(df_observed[df_observed["male"] == 1])
n = len(df_observed[df_observed["male"] == 0])
for _ in range(B):
    p1, p2 = simulate_p_beta(x_1, x_2, m, n)
    psi = p1/p2
    psi_list.append(psi)
psi_e = sum(psi_list)/len(psi_list)
a = np.quantile(psi_list, q=(alpha/2))
b = np.quantile(psi_list, q=(1-(alpha/2)))
print(f"The estimator of OR is: {psi_e}")
print(f"The credible interval is: [{a, b}]")

The estimator of OR is: 1.164513910597534
The credible interval is: [(0.772565503543092, 1.7144119606153525)]


As we excpect, the probabilty that men BMI will be above tau is bigger then the porobabilty that women will. 

In [75]:
for _ in range(B):
    p1, p2 = simulate_p_beta(x_1, x_2, m, n, True)
    psi = log((p1/(1-p1))/(p2/(1-p2)))
    psi_list.append(psi)
psi_e = sum(psi_list)/len(psi_list)
a = np.quantile(psi_list, q=(alpha/2))
b = np.quantile(psi_list, q=(1-(alpha/2)))
print(f"The estimator of OR is: {psi_e}")
print(f"The credible interval is: [{a, b}]")

The estimator of OR is: 0.32493221290499275
The credible interval is: [(-0.5886033273365838, 1.252496800508029)]


In [58]:
log((p2/(1-p2)))

-0.6466271649250525