In [58]:
import pandas as pd
from scipy.stats import fisher_exact

# Load the processed input table; the path is relative, so it resolves against
# the kernel's working directory.
df = pd.read_csv("../data/processed/00_covid_hiv_data.csv")

In [63]:
# Keep the subset with both `suppressed` and `rna` recorded under its own name
# instead of rebinding `df`. The Fisher-test cell further down reads `df` and
# its saved output reports N = 34760 for "late", far more rows than survive
# this filter, so overwriting `df` here made the results depend on the order
# in which cells were executed.
# NOTE(review): -1 is presumably the missing-value sentinel -- confirm.
df_viral = df[(df["suppressed"] != -1) & (df["rna"] != -1)]

# Single .loc selection instead of chained indexing (df[mask]["col"]).
df_viral.loc[df_viral["rna"] < 1000, "suppressed"]

4        1
12       1
19       0
22       1
29       1
        ..
38283    1
38306    1
38316    0
38352    1
38357    1
Name: suppressed, Length: 1960, dtype: int64

In [57]:
def fisher_test(df, col, cutoff):
    """Run a one-sided Fisher's exact test of ``covid_wave`` against ``col``.

    Rows where ``covid_wave`` or ``col`` equals -1 are dropped before testing
    (-1 is presumably the missing-value sentinel -- TODO confirm). A 2x2
    contingency table is then built with ``covid_wave`` on the rows and, on
    the columns, either the raw ``col`` values (``"suppressed"``) or the
    indicator ``col >= cutoff`` (``"rna"``, ``"cd4"``, ``"late"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``covid_wave`` and ``col``. It is not modified.
    col : str
        Column to test. Supported: ``"suppressed"`` (alternative ``"less"``),
        ``"rna"``, ``"cd4"`` and ``"late"`` (alternative ``"greater"``).
    cutoff : number
        Threshold used to binarise ``col``. For ``"suppressed"`` it is only
        used by the range guard, not to build the table.

    Returns
    -------
    tuple or None
        ``(n_rows, odds_ratio, p_value, alternative)`` where ``n_rows`` is the
        number of rows left after dropping -1 values. ``None`` is returned
        when no remaining row is ``<= cutoff`` or none is ``>= cutoff``, when
        ``col`` is not one of the supported columns, or when the contingency
        table is not 2x2.

    Side effects
    ------------
    Prints a one-line summary when a test is performed.
    """
    # column -> (alternative hypothesis, whether the column is thresholded)
    settings = {
        "suppressed": ("less", False),
        "rna": ("greater", True),
        "cd4": ("greater", True),
        "late": ("greater", True),
    }

    # Work on a local name so the caller's frame (and the parameter) stay untouched.
    valid = df[df["covid_wave"] != -1]
    valid = valid[valid[col] != -1]
    values = valid[col]

    # Range guard: skip cutoffs that lie outside the observed values. Both
    # comparisons are inclusive, so a value equal to the cutoff satisfies both.
    if not (values <= cutoff).any() or not (values >= cutoff).any():
        return None

    if col not in settings:
        return None
    alternative, thresholded = settings[col]

    # Plain arrays keep crosstab from aligning on the frame's index labels.
    exposure = valid["covid_wave"].to_numpy()
    if thresholded:
        outcome = (values >= cutoff).to_numpy().astype(int)
    else:
        outcome = values.to_numpy()

    tab = pd.crosstab(exposure, outcome)
    # Fisher's exact test as used here needs exactly two levels on each axis.
    if tab.shape != (2, 2):
        return None

    odds_ratio, p_value = fisher_exact(tab, alternative=alternative)
    print("{0} (cut = {1}) Fisher's exact (alternative = {2}): N = {3}, OR = {4:.2f}, P-value = {5:.2g}".format(col.upper(), cutoff, alternative, len(valid), odds_ratio, p_value))
    return len(valid), odds_ratio, p_value, alternative

# Run the test for every (variable, cutoff) pair and keep one row per test
# that was actually performed (fisher_test returns None when it skips one).
R = []
for col in ["rna", "cd4", "suppressed", "late"]:
    for cut in range(1, 7):
        results = fisher_test(df, col, cut)
        if results is not None:
            num_samples, odds_ratio, p_value, alternative = results
            R.append([col, cut, alternative, num_samples, odds_ratio, p_value])

# Collect the rows into a table and write it next to the other processed files.
dr = pd.DataFrame(
    R,
    columns=["variable", "cutoff", "alternative", "num_samples", "odds_ratio", "p_value"],
)
dr.to_csv("../data/processed/01_fisher_test_results.csv", index=False)

RNA (cut = 2) Fisher's exact (alternative = greater): N = 1960, OR = 0.73, P-value = 0.99
RNA (cut = 3) Fisher's exact (alternative = greater): N = 1960, OR = 0.89, P-value = 0.8
RNA (cut = 4) Fisher's exact (alternative = greater): N = 1960, OR = 0.88, P-value = 0.8
CD4 (cut = 2) Fisher's exact (alternative = greater): N = 694, OR = 1.54, P-value = 0.55
CD4 (cut = 3) Fisher's exact (alternative = greater): N = 694, OR = 2.10, P-value = 0.1
CD4 (cut = 4) Fisher's exact (alternative = greater): N = 694, OR = 1.97, P-value = 0.083
SUPPRESSED (cut = 1) Fisher's exact (alternative = less): N = 1960, OR = 1.14, P-value = 0.8
LATE (cut = 2) Fisher's exact (alternative = greater): N = 34760, OR = 1.02, P-value = 0.46
LATE (cut = 3) Fisher's exact (alternative = greater): N = 34760, OR = 1.05, P-value = 0.36
LATE (cut = 4) Fisher's exact (alternative = greater): N = 34760, OR = 0.93, P-value = 0.79
LATE (cut = 5) Fisher's exact (alternative = greater): N = 34760, OR = 0.87, P-value = 0.96
LATE

In [13]:
# Show the frame currently bound to `df`; the notebook elides the middle rows.
df

Unnamed: 0,id,sex_female,late,rna,cd4,suppressed,vaccine,covid_wave
0,20,1,5,-1,-1,-1,0,0
1,106,1,6,-1,-1,-1,0,0
2,109,1,6,-1,-1,-1,0,0
3,128,0,6,-1,-1,-1,0,0
4,144,1,5,1,-1,1,0,0
...,...,...,...,...,...,...,...,...
38380,442243,1,6,-1,-1,-1,0,0
38381,442332,1,5,-1,-1,-1,0,0
38382,442358,0,6,-1,-1,-1,0,0
38383,443780,1,5,-1,-1,-1,0,0
