# Step 1: The preliminaries
## 1(a) Import the libraries 

In [None]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.anova import AnovaRM
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from load_data_common_Andy import df_features

## 1(b) Load all the data - this time we remove those non-beating cells

In [None]:
# Drop every row without a measured beating frequency (per the section
# heading, these are the non-beating cells).
df_features = df_features.dropna(subset=['voltage_beating_frequencies'])

# One subset per drug; Verapamil leaves out its '10uM' and '100uM' doses.
Control = df_features[df_features['drug'] == 'Control']
BayK = df_features[df_features['drug'] == 'Bay_K']
Dofetilide = df_features[df_features['drug'] == 'Dofetilide']
Verapamil = df_features[
    (df_features['drug'] == 'Verapamil')
    & ~df_features['dose'].isin(['10uM', '100uM'])
]

# Stack the four groups into the frame used by the plots below.
df_reduced = pd.concat([Control, BayK, Verapamil, Dofetilide])

## Now we are going to make some inferences among those 4 groups:  
### (A) Control,  (B) Bay K8644 (C) Verapamil, (D) Dofetilide

### $APD_{80}$

In [None]:
# Violin plot of voltage_apd80 for each drug in the reduced data set.
sns.catplot(
    data=df_reduced[['drug', 'voltage_apd80']],
    x='drug',
    y='voltage_apd80',
    kind='violin',
    dodge=True,
    aspect=3,
)

In [None]:
# Bar chart of the mean voltage_apd80 per drug; error bars use 'se'.
sns.barplot(
    data=df_reduced[['drug', 'voltage_apd80']],
    x='drug',
    y='voltage_apd80',
    estimator='mean',
    errorbar='se',
)

## Starting with the simplest question: is there a difference in mean $APD_{80}$ across drugs?

## (1) One-way (single factor) ANOVA
## Our model for the data:
$$\Large
APD_{80_{ij}}=\mu_{0} + \alpha_{i} + \epsilon_{ij}
$$

In [None]:
# Columns needed for the APD80 analyses.
apd80_columns = ["drug", "dose", "voltage_apd80", "tissue", "well"]

# Rebuild one narrow frame per drug straight from df_features (this rebinds
# the earlier Control and Verapamil variables).
# NOTE(review): Verapamil here keeps every dose, whereas the Verapamil subset
# built when the data was loaded excluded '10uM' and '100uM' -- confirm which
# one the ANOVA below is meant to use.
Control, Bay_K, Verapamil, Dofetilide = (
    df_features.loc[df_features["drug"] == drug_name, apd80_columns]
    for drug_name in ("Control", "Bay_K", "Verapamil", "Dofetilide")
)

Control

In [None]:
# One-way ANOVA on voltage_apd80 with the four drugs as the groups.
apd80_by_drug = [
    group["voltage_apd80"]
    for group in (Control, Bay_K, Verapamil, Dofetilide)
]
F, p = stats.f_oneway(*apd80_by_drug)

print('F statistic value:', F)
print('p-value:', p)

## (2) Equivalent General Linear Model formulation

## Our simple one factor linear model:   

$$\Large
APD_{80_{ij}}=\beta_{0} + \beta_{1}drug_{i} + \epsilon_{ij}
$$

### Need to rearrange the data a little first

In [None]:
# Long-format frame for the formula API: the four per-drug frames stacked
# row-wise, with the original index labels kept.
df_1way = pd.concat(
    [
        Control,
        Bay_K,
        Verapamil,
        Dofetilide,
    ]
)

df_1way

In [None]:
# Ordinary least squares with drug as the single predictor.
one_way_glm_model = smf.ols("voltage_apd80 ~ drug", data=df_1way)
one_way_glm_fit = one_way_glm_model.fit()
print(one_way_glm_fit.summary())

# ANOVA table (typ=2) computed from the fitted linear model.
anova_table = sm.stats.anova_lm(one_way_glm_fit, typ=2)
print(anova_table)

## What about the effect of dose for a drug?

## Control ("dose" refers to [DMSO]): Let's first look at the distributions again

In [None]:
# Kernel density estimate of voltage_apd80 for each Control dose, all drawn on
# the same axes.
# Each curve is labelled with its own group key instead of a hand-written list
# applied by position, so the legend cannot silently mislabel curves when a
# dose level is absent from the data (rows without a beating frequency were
# dropped earlier) or is spelled differently.
ax = plt.gca()
dose = []  # dose labels in plotting order (groupby sorts the keys)
for dose_label, apd80 in Control.groupby('dose')['voltage_apd80']:
    apd80.plot(
        kind='kde',
        xlim=[0, 3000],
        ylim=[0, 0.02],
        label=str(dose_label),
        ax=ax,
    )
    dose.append(dose_label)
plt.legend(title='DMSO dose')

In [None]:
# Give every (well, tissue) combination in the Control data its own integer
# subject ID, stored in a new "tissue_RM" column.
Control_RM = Control.copy()

# Placeholder column; object dtype lets it hold the ' ' filler as well as the
# integer IDs written below.
Control_RM["tissue_RM"] = pd.Series(' ', index=Control_RM.index, dtype=object)

wells_unique = Control_RM["well"].unique()
tissue_num = 1  # next ID to hand out; IDs start at 1
for well in wells_unique:
    in_well = Control_RM["well"] == well
    # Tissue labels are only distinguished within a well, so number them well by well.
    for tissue in Control_RM.loc[in_well, "tissue"].unique():
        is_subject = in_well & (Control_RM["tissue"] == tissue)
        Control_RM.loc[is_subject, "tissue_RM"] = tissue_num
        tissue_num += 1
Control_RM

## Running the Repeated Measures design using conventional ANOVA  
## The model:
$$\Large
APD_{80_{ij}}=\mu_{0} + \alpha_{i} + \lambda_{j} + \epsilon_{ij}
$$
$$
\text{where } \alpha_{i} = \text{doses}
$$
$$
\lambda_{j} = \text{tissues}
$$
$$
\epsilon_{ij} = \text{residuals}
$$

In [None]:
# Repeated-measures ANOVA on the Control data: dose is the within-subject
# factor and tissue_RM identifies the subject.
Control_RM_anova_table = AnovaRM(
    data=Control_RM,
    depvar='voltage_apd80',
    subject='tissue_RM',
    within=['dose'],
).fit()
print(Control_RM_anova_table)

## Now let's try Bay K8644 

### Those distributions again:

In [None]:
# Kernel density estimate of voltage_apd80 for each Bay K8644 dose, all drawn
# on the same axes.
# Each curve is labelled with its own group key instead of a hand-written list
# applied by position, so the legend cannot silently mislabel curves when a
# dose level is absent from the data (rows without a beating frequency were
# dropped earlier) or is spelled differently.
ax = plt.gca()
dose = []  # dose labels in plotting order (groupby sorts the keys)
for dose_label, apd80 in BayK.groupby('dose')['voltage_apd80']:
    apd80.plot(
        kind='kde',
        xlim=[0, 3000],
        ylim=[0, 0.02],
        label=str(dose_label),
        ax=ax,
    )
    dose.append(dose_label)
plt.legend(title='Bay K8644 dose')

### Mmm, mmm. Looking good. There must be a drug effect here.

### Reconfiguring the Bay K data for a "within subjects" tissue design for RMANOVA

In [None]:
# Give every (well, tissue) combination in the Bay K data its own integer
# subject ID, stored in a new "tissue_RM" column.
Bay_K_RM = Bay_K.copy()

# Placeholder column; object dtype lets it hold the ' ' filler as well as the
# integer IDs written below.
Bay_K_RM["tissue_RM"] = pd.Series(' ', index=Bay_K_RM.index, dtype=object)

# Bay_K_RM is a copy of Bay_K, so masks built from it select the same rows.
wells_unique = Bay_K_RM["well"].unique()
tissue_num = 1  # next ID to hand out; IDs start at 1
for well in wells_unique:
    in_well = Bay_K_RM["well"] == well
    # Tissue labels are only distinguished within a well, so number them well by well.
    for tissue in Bay_K_RM.loc[in_well, "tissue"].unique():
        is_subject = in_well & (Bay_K_RM["tissue"] == tissue)
        Bay_K_RM.loc[is_subject, "tissue_RM"] = tissue_num
        tissue_num += 1
Bay_K_RM

In [None]:
# Same repeated-measures ANOVA as for Control, now on the Bay K data.
# NOTE(review): the markdown that follows suggests this call is expected to
# fail on this data set -- left as is on purpose.
AnovaRM(
    data=Bay_K_RM,
    depvar='voltage_apd80',
    subject='tissue_RM',
    within=['dose'],
).fit()

## What? Does this mean I can't use a "within subjects" design?

### Linear mixed models offer robustness and flexibility

### The linear mixed model for this 1-factor Repeated Measures design:
$$\Large
APD_{80_{ij}}=(\beta_{0}+b_{0j}) + (\beta_{1}+b_{1j})dose_{i} + \epsilon_{ij}
$$
$$
i = \text{doses}
$$
$$
j = \text{tissues}
$$
$$
\epsilon_{ij} = \text{residuals}
$$

In [None]:
# Linear mixed model for Bay K: dose as the fixed effect, observations grouped
# by tissue_RM.
# NOTE(review): no re_formula is passed, which per the statsmodels docs means
# a random intercept per group only (no random dose slope) -- confirm this is
# the intended model.
model = smf.mixedlm(
    "voltage_apd80 ~ dose",
    data=Bay_K_RM,
    groups=Bay_K_RM["tissue_RM"],
)
mdf = model.fit()
print(mdf.summary())

In [None]:
# Linear mixed model for Control: dose as the fixed effect, observations
# grouped by tissue_RM.
# NOTE(review): no re_formula is passed, which per the statsmodels docs means
# a random intercept per group only (no random dose slope) -- confirm this is
# the intended model.
model_Control = smf.mixedlm(
    "voltage_apd80 ~ dose",
    data=Control_RM,
    groups=Control_RM["tissue_RM"],
)
mdf_Control = model_Control.fit()
print(mdf_Control.summary())