# Design

For each SPADE pattern we need four counts over the matched case/control pairs:
 c1 = case +, control + for pattern
 c2 = case +, control - 
 c3 = case -, control + 
 c4 = case -, control -
 Once we have these four counts for a pattern, we can calculate its McNemar score.

We want a dictionary mcnemar_counts = {pattern_key: [c1, c2, c3, c4]}

## setup
 1. create a dictionary case2ctrl = {case pid : control pid}
 2. create a dictionary case_edc_info = {case pid : [(EDC code pre), (EDC code idx), (EDC code post)]}
 3. create a dictionary ctrl_edc_info = {control pid : [(EDC code pre), (EDC code idx), (EDC code post)]}
 4. create list of patterns

 mcnemar_counts = {}
 
 ```
 for pattern in patterns:
     mcnemar_counts[pattern] = [0, 0, 0, 0]
     pattern_edcs = [_.strip() for _ in pattern.split(",")]

     for case_pid, ctrl_pid in case2ctrl.items():
         b_case_has_pattern = True
         b_ctrl_has_pattern = True
         case_visit_info = case_edc_info[case_pid]
         ctrl_visit_info = ctrl_edc_info[ctrl_pid]
         for edc in pattern_edcs:
             # leading digit of the EDC is the 1-based timing class (1=pre, 2=idx, 3=post)
             tpl_idx = int(edc[0]) - 1
             b_case_has_pattern = b_case_has_pattern and (edc in case_visit_info[tpl_idx])
             b_ctrl_has_pattern = b_ctrl_has_pattern and (edc in ctrl_visit_info[tpl_idx])
         if b_case_has_pattern and b_ctrl_has_pattern:
             jdx = 0
         elif b_case_has_pattern and not b_ctrl_has_pattern:
             jdx = 1
         elif not b_case_has_pattern and b_ctrl_has_pattern:
             jdx = 2
         else:
             jdx = 3
         mcnemar_counts[pattern][jdx] = mcnemar_counts[pattern][jdx] + 1
 ```

In [13]:
import csv

In [14]:
# Read the matched case/control pairs into case2ctrl = {case pid: control pid}.
# The first row is treated as a header; in every other row column 0 is used
# as the case pid and column 1 as the control pid (both kept as strings).
case2ctrl = {}
with open('final_pairs_IDs.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; the default tolerates an empty file
    for line in reader:
        if not line:
            # csv.reader yields [] for a blank line; skip it instead of
            # failing with IndexError on line[0].
            continue
        case2ctrl[line[0]] = line[1]

In [15]:
# Load the SPADE pattern keys: the first column of every data row, in file
# order. The first row is treated as a header.
with open('Sequence_support_values.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; the default tolerates an empty file
    # csv.reader yields [] for a blank line, so filter those out rather than
    # failing with IndexError on row[0].
    patterns = [row[0] for row in reader if row]

In [16]:
# case_edc_info = {case pid : [(EDC code pre), (EDC code idx), (EDC code post)]
def load_patients(file_name):
    """Load per-patient EDC codes from a SPADE input CSV file.

    The first row is treated as a header. Every other non-blank row is
    expected to look like ``person_id,Timing_Class,times,EDCs``, e.g.
    ``4239598,1,2,"1-NUR04, 1-PSY17"``. The ``times`` column is not used.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A dict mapping each person_id (kept as a string) to a list of three
        lists of EDC code strings, indexed by ``Timing_Class - 1``. A timing
        class with no row for that patient stays an empty list; if a patient
        has several rows for the same timing class, the last row wins.

    Raises:
        IndexError: If ``Timing_Class`` is not 1, 2 or 3, or a row has fewer
            than four columns.
        ValueError: If ``Timing_Class`` is not an integer.
    """
    edc_info = {}
    with open(file_name, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; the default tolerates an empty file
        for line in reader:
            if not line:
                # csv.reader yields [] for a blank line.
                continue
            pid = line[0]
            timing = int(line[1]) - 1
            if not 0 <= timing < 3:
                # Without this check a Timing_Class of 0 or less would wrap
                # around as a negative index and silently land in the wrong
                # timing slot.
                raise IndexError(
                    "Timing_Class must be 1, 2 or 3, got {0!r} for person_id {1}".format(
                        line[1], pid))
            edcs = [code.strip() for code in line[3].split(",")]
            edc_info.setdefault(pid, [[], [], []])[timing] = edcs
    return edc_info

In [17]:
# EDC codes for the cases, keyed by case pid: {pid: [pre, idx, post]}.
case_edc_info = load_patients('SPADE_input_cases.csv')

In [18]:
# EDC codes for the controls, keyed by control pid: {pid: [pre, idx, post]}.
ctrl_edc_info = load_patients('SPADE_input_controls.csv')

In [19]:
# Build the McNemar tallies for every SPADE pattern over the matched
# case/control pairs:
#
#   mcnemar_counts = {pattern: [c1, c2, c3, c4]}
#     c1 = case has the pattern,      control has the pattern
#     c2 = case has the pattern,      control does not
#     c3 = case does not have it,     control has the pattern
#     c4 = neither has the pattern

# A paired patient with no rows in the SPADE input is recorded once and given
# empty visit lists, so it counts as "pattern absent" instead of raising
# KeyError below. This does not depend on the pattern, so it is done in a
# single pass up front instead of being re-checked for every pattern.
# Each pid is appended at most once (it is inserted into the dict right
# after), so no de-duplication is needed afterwards.
missing_cases = []
missing_ctrls = []
for case_pid, ctrl_pid in case2ctrl.items():
    if case_pid not in case_edc_info:
        missing_cases.append(case_pid)
        case_edc_info[case_pid] = [[], [], []]
    if ctrl_pid not in ctrl_edc_info:
        missing_ctrls.append(ctrl_pid)
        ctrl_edc_info[ctrl_pid] = [[], [], []]

mcnemar_counts = {}
for pattern in patterns:
    pattern_edcs = [edc.strip() for edc in pattern.split(",")]
    # The first character of each EDC is its 1-based timing class; turn it
    # into a 0-based slot index once per pattern rather than once per pair.
    edc_slots = [(int(edc[0]) - 1, edc) for edc in pattern_edcs]
    tallies = [0, 0, 0, 0]
    for case_pid, ctrl_pid in case2ctrl.items():
        case_visit_info = case_edc_info[case_pid]
        ctrl_visit_info = ctrl_edc_info[ctrl_pid]
        # A patient has the pattern only if every EDC of the pattern is
        # present in the matching timing slot.
        b_case_has_pattern = all(
            edc in case_visit_info[tpl_idx] for tpl_idx, edc in edc_slots)
        b_ctrl_has_pattern = all(
            edc in ctrl_visit_info[tpl_idx] for tpl_idx, edc in edc_slots)
        # (case+, ctrl+) -> 0, (case+, ctrl-) -> 1,
        # (case-, ctrl+) -> 2, (case-, ctrl-) -> 3
        jdx = (0 if b_case_has_pattern else 2) + (0 if b_ctrl_has_pattern else 1)
        tallies[jdx] += 1
    mcnemar_counts[pattern] = tallies

In [20]:
# Echo the per-pattern [c1, c2, c3, c4] tallies for a visual check.
mcnemar_counts

{'1-ADM05': [51, 1690, 1659, 46294],
 '1-ALL03': [215, 2779, 2688, 44012],
 '1-ALL03,1-ALL04': [35, 1112, 979, 47568],
 '1-ALL03,1-ALL04,2-ALL04': [14, 725, 532, 48423],
 '1-ALL03,2-ALL03': [25, 1029, 782, 47858],
 '1-ALL03,2-ALL03,2-ALL04': [6, 544, 353, 48791],
 '1-ALL03,2-ALL04': [25, 933, 672, 48064],
 '1-ALL04': [534, 4798, 3660, 40702],
 '1-ALL04,2-ALL03': [18, 1035, 634, 48007],
 '1-ALL04,2-ALL03,2-ALL04': [14, 907, 526, 48247],
 '1-ALL04,2-ALL04': [177, 2991, 1931, 44595],
 '1-ALL04,2-ALL04,2-NUT03': [0, 575, 0, 49119],
 '1-ALL04,2-ALL04,3-ALL04': [45, 1218, 1130, 47301],
 '1-ALL04,2-NUT03': [0, 794, 1, 48899],
 '1-ALL04,2-SKN02': [6, 566, 347, 48775],
 '1-EAR01': [744, 4656, 4590, 39704],
 '1-EAR01,1-EAR08': [19, 929, 616, 48130],
 '1-EAR01,1-EAR11': [22, 803, 924, 47945],
 '1-EAR01,2-ALL04': [5, 607, 319, 48763],
 '1-EAR01,2-EAR01': [89, 1408, 2117, 46080],
 '1-EAR01,2-EAR01,3-EAR01': [28, 850, 1147, 47669],
 '1-EAR01,2-NUT03': [0, 555, 0, 49139],
 '1-EAR07': [8, 531, 514, 48

In [21]:
# Case pids from the pairing file that had no entry in case_edc_info.
missing_cases

['5445023', '4758869']

In [22]:
# Control pids from the pairing file that had no entry in ctrl_edc_info.
missing_ctrls

['5418135', '4547110', '4657364', '4464419', '4340750', '5933873']

In [23]:
# Number of distinct patterns that received a tally.
len(mcnemar_counts)

189

In [25]:
# Write one row per pattern, with no header row: "pattern",c1,c2,c3,c4
#
# csv.writer replaces the hand-built line so that a pattern containing a
# double quote is escaped correctly. QUOTE_NONNUMERIC keeps the pattern
# quoted and the integer counts bare, and lineterminator='\n' keeps a single
# "\n" per row, so the output for ordinary patterns is unchanged.
# The file is only written, never read back, so plain 'w' is enough.
with open('mc_nemar_counts.csv', 'w') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
    for pattern, counts in mcnemar_counts.items():
        writer.writerow([pattern] + list(counts))