# Chi-squared values 

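This notebook computes chi-squared test statistics and p-values (via `scipy.stats.chi2_contingency`) for the associations between label, center, and sex in each subset of the thesis data.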

In [1]:
import numpy as np 
import pandas as pd
from os.path import join 
import glob 
from scipy.stats import chi2_contingency 
import sys

DATA_DIR = "/ritter/share/data/IMAGEN"

# Local imports
sys.path.insert(1, "../analysis")
from mri_learn_quick import MRILearn, shuffle_y, pbcc
from confounds import *

In [2]:
def chisq(df, col0, col1):
    """Run a chi-squared test of independence between two columns of ``df``.

    The two columns are cross-tabulated with ``pd.crosstab`` and the resulting
    contingency table is passed to ``scipy.stats.chi2_contingency`` with its
    default settings (SciPy's default applies Yates' continuity correction to
    2x2 tables).

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both columns.
    col0, col1 : str
        Names of the columns to cross-tabulate; ``col0`` forms the rows and
        ``col1`` the columns of the contingency table.

    Returns
    -------
    tuple
        ``(statistic, p_value)``. The degrees of freedom and the expected
        frequencies that SciPy also computes are discarded.
    """
    contingency = pd.crosstab(df[col0], df[col1])
    statistic, p_value, *_ = chi2_contingency(contingency)
    return (statistic, p_value)

# Identification 

In [3]:
# Identification setting: read the fs-stats table and tag every row with setting="id".
dfid = (
    pd.read_csv(join(DATA_DIR, "h5files/ESPAD19a_01_56/fs-stats/fs-stats_FU2-FU2_n789.csv"))
    .assign(setting="id")
)

In [4]:
# Is `label` independent of `center` in the full identification sample?
chisq(dfid, "center", "label")

(155.6776979401473, 2.6026959369199047e-30)

In [5]:
# Is `label` independent of `sex` in the full identification sample?
chisq(dfid, "label", "sex")

(32.94329365577108, 9.488649742065925e-09)

In [6]:
# Split the identification sample by sex code (0 / 1) so each subgroup can be tested separately.
dfid0, dfid1 = (dfid.query(f"sex == {code}") for code in (0, 1))

In [7]:
# Label vs. center within the sex == 0 subgroup of the identification sample.
chisq(dfid0, "label", "center")

(70.25015819471867, 1.3150642489913371e-12)

In [8]:
# NOTE(review): dfid1 only contains rows with sex == 1, so the label-by-sex table has a
# single sex category and the result below is trivially (0.0, 1.0). Possibly a leftover
# sanity check -- confirm whether this cell is still needed.
chisq(dfid1, "label", "sex")

(0.0, 1.0)

In [9]:
# Label vs. center within the sex == 1 subgroup of the identification sample.
chisq(dfid1, "label", "center")

(94.85031768751665, 1.2444175029929092e-17)

# Prediction

In [10]:
# Prediction setting: read the fs-stats table and tag every row with setting="pr".
dfpr = (
    pd.read_csv(join(DATA_DIR, "h5files/ESPAD19a_01_56/fs-stats/fs-stats_BL-FU2_n507.csv"))
    .assign(setting="pr")
)

# Is `label` independent of `center` in the full prediction sample?
chisq(dfpr, "center", "label")

(94.07483251723178, 1.79737097180415e-17)

In [11]:
# Is `label` independent of `sex` in the full prediction sample?
chisq(dfpr, "sex", "label")

(27.296319716735503, 1.7454330890452667e-07)

In [12]:
# Split the prediction sample by sex code (0 / 1) so each subgroup can be tested separately.
dfpr0, dfpr1 = (dfpr.query(f"sex == {code}") for code in (0, 1))

In [13]:
# Label vs. center within the sex == 0 subgroup of the prediction sample.
chisq(dfpr0, "label", "center")

(39.55430178768702, 1.5317675660727557e-06)

In [14]:
# Label vs. center within the sex == 1 subgroup of the prediction sample.
chisq(dfpr1, "label", "center")

(65.8312055842272, 1.0234630416391106e-11)

# Counterbalancing (CB)

In [15]:
# First entry of the stored random states; passed to CounterBalance as its random state.
random_state = np.load("../analysis/random_states.npy")[0]

# The BL-FU2 (n507) file is loaded here. To run this cell on the FU2-FU2 data instead,
# point "data_dir" at
# join(DATA_DIR, "h5files/ESPAD19a_01_56/fs-stats/fs-stats_FU2-FU2_n789.h5").
params = dict(
    data_dir=join(DATA_DIR, "h5files/ESPAD19a_01_56/fs-stats/fs-stats_BL-FU2_n507.h5"),
    verbose=3,
    n_jobs=10,
    conf_list=["c", "s", "group"],
)

m = MRILearn(params)
m.load_data()

# CounterBalance is built from the "s" confound and the random state. After fitting on
# (X, y), the same transform is applied to the labels and to every confound --
# presumably so all arrays keep matching rows; CounterBalance comes from `confounds`
# and is not shown here (TODO confirm).
cb = CounterBalance(m.conf_dict["s"], random_state)
m.X = cb.fit_transform(m.X, m.y)
m.y = cb.transform(m.y)
for conf_name in ("c", "s", "group"):
    m.conf_dict[conf_name] = cb.transform(m.conf_dict[conf_name])

GROUPS [] ONLY HAVE ONE MEMBER, DELETING THEM, TOO.


In [18]:
# Gather the labels and the "c" / "s" confounds held on `m` into one frame for chisq().
df = pd.DataFrame(
    {"label": m.y, "center": m.conf_dict["c"], "sex": m.conf_dict["s"]}
)

# Label vs. center on the data held in `m`.
chisq(df, "label", "center")

(96.23097926837636, 6.464248373959242e-18)

In [19]:
# Label vs. sex on the frame built from `m` (the counterbalancing cell rewrites m.y and m.conf_dict).
chisq(df, "label", "sex")

(0.0, 1.0)