In [4]:
# default_exp odds_ratio

# Calculating Odds Ratio

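> Summarise genotype combinations per cohort and compute case/control odds ratios for single SNPs and SNP combinations.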

In [5]:
#hide
%load_ext autoreload
%autoreload 2


In [6]:
#hide
from nbdev.showdoc import *
from nbdev import *

from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog
from fastcore.test import ExceptionExpected


In [7]:
#export
import pandas as pd
from typing import Any, Dict, List, Optional, Literal, Union
from dataclasses import dataclass
from fastcore.basics import basic_repr
from pydantic import BaseModel
from itertools import product
from ast import literal_eval
import numpy as np

In [8]:
geno_each_sample = test_data_catalog.load("geno_each_sample")
case_geno_each_sample = test_data_catalog.load("case_geno_each_sample")
control_geno_each_sample = test_data_catalog.load("control_geno_each_sample")
sample_file = test_data_catalog.load("sample_file")
all_geno_df = test_data_catalog.load("all_geno_df")

In [9]:
geno_each_sample

Unnamed: 0_level_0,rs77948203,rs1014626,rs9610458,rs5762201,rs1004237,rs134490,rs4821519,rs1003500,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WTCCCT473540,GG,TT,TT,GG,CC,,GG,CC,AG
WTCCCT473530,GG,TT,TT,GG,CC,TT,CG,CC,AA
WTCCCT473555,GG,TT,TT,,CC,TT,GG,CC,
WTCCCT473426,GG,TT,TT,GG,CC,TT,GG,CC,GG
WTCCCT473489,GG,TT,CT,GG,CC,,GG,CC,AA
...,...,...,...,...,...,...,...,...,...
WS574632,GG,TT,CT,GG,CC,TT,GG,CC,GG
WS574661,GG,TT,TT,GG,CC,TT,GG,CC,AA
BLOOD294452,GG,TT,CT,GG,CC,TT,GG,CC,AG
WTCCCT511021,GG,TT,CT,GG,CC,TT,GG,CC,AG


In [10]:
geno_each_sample.columns

Index(['rs77948203', 'rs1014626', 'rs9610458', 'rs5762201', 'rs1004237',
       'rs134490', 'rs4821519', 'rs1003500', 'rs5756405'],
      dtype='object')

---

In [11]:
#export

@dataclass
class RsidComboInfo():
    """Summary of genotype combinations for a list of rsids.

    The properties and methods below expect `df` to hold one column per rsid
    in `rsid_list` plus a `unique_samples_count` column.
    """
    # Summary table: one row per genotype combination.
    df: pd.DataFrame
    # Rsids that may be used as filter keys in `query`.
    rsid_list: List[str]
    # Placeholder string that marks a missing genotype inside `df`.
    NA_val: str
    __repr__ = basic_repr("rsid_list,NA_val")

    def query(self, **rsid_dict) -> pd.DataFrame:
        """Return the rows of `df` whose genotypes match every `rsid=genotype` pair.

        Filtering uses boolean masks rather than a `DataFrame.query` expression
        string, so rsids that are not valid Python identifiers (e.g. purely
        numeric ids) and genotype values containing quotes work as well.

        Raises:
            ValueError: if any keyword is not listed in `rsid_list`.
        """
        if not all(rsid in self.rsid_list for rsid in rsid_dict):
            raise ValueError("Some Rsid are not in the dataframe")
        # Start from a copy so the result is never the stored frame itself,
        # even when no filters are given.
        filtered_df = self.df.copy()
        for rsid, geno in rsid_dict.items():
            filtered_df = filtered_df[filtered_df[rsid] == geno]
        return filtered_df

    def get_all_genos(self, rsid:str):
        """Return the distinct values found in the `rsid` column of `df`."""
        return self.df[rsid].unique()

    @property
    def total_samples_with_NA(self):
        """Sum of `unique_samples_count` over every row of `df`."""
        return self.df["unique_samples_count"].sum()

    @property
    def num_samples_NA(self):
        """Sum of `unique_samples_count` over rows where any rsid equals `NA_val`."""
        has_na = (self.df[self.rsid_list] == self.NA_val).any(axis=1)
        return self.df.loc[has_na, "unique_samples_count"].sum()

    @property
    def total_samples_no_NA(self):
        """`total_samples_with_NA` minus `num_samples_NA`."""
        return self.total_samples_with_NA - self.num_samples_NA
    
def get_geno_combination_df(geno_each_sample_df: pd.DataFrame, rsid_list:List[str], NA_val="NA", as_df= False):
    """Group samples by their genotype combination across `rsid_list`.

    `geno_each_sample_df` must have an index named `sample_id` and one column
    per rsid. Missing genotypes are replaced by `NA_val` before grouping.

    The summary has one row per observed combination, with the rsid columns,
    `unique_samples_id` (the distinct sample ids) and `unique_samples_count`
    (how many distinct sample ids).

    Returns the summary DataFrame when `as_df` is true, otherwise a
    `RsidComboInfo` wrapping it.
    """
    named_aggregations = {"unique_samples_id": "unique", "unique_samples_count": "nunique"}
    genotypes_filled = geno_each_sample_df[rsid_list].fillna(NA_val)
    sample_ids_by_combo = genotypes_filled.reset_index().groupby(rsid_list)["sample_id"]
    summary_df = sample_ids_by_combo.agg(**named_aggregations).reset_index()
    if as_df:
        return summary_df
    return RsidComboInfo(df=summary_df, rsid_list=rsid_list, NA_val=NA_val)
    

In [12]:
test_rsid_info_obj = get_geno_combination_df(geno_each_sample,["rs77948203"])
test_rsid_info_obj

RsidComboInfo(rsid_list=['rs77948203'], NA_val='NA')

In [13]:
test_rsid_info_obj.df

Unnamed: 0,rs77948203,unique_samples_id,unique_samples_count
0,AA,"[WTCCCT474448, WTCCCT474394, WTCCCT474560, WTC...",78
1,AG,"[WTCCCT473522, WTCCCT473497, WTCCCT473524, WTC...",1945
2,GG,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",12924


In [14]:
assert test_rsid_info_obj.df.equals(geno_each_sample["rs77948203"].fillna("NA").reset_index().groupby("rs77948203")["sample_id"].agg(**{"unique_samples_id":"unique", "unique_samples_count": "nunique"}).reset_index())
assert test_rsid_info_obj.rsid_list == ['rs77948203']

---

In [15]:
get_geno_combination_df(geno_each_sample,["rs77948203"]).query(rs77948203='AA')

Unnamed: 0,rs77948203,unique_samples_id,unique_samples_count
0,AA,"[WTCCCT474448, WTCCCT474394, WTCCCT474560, WTC...",78


In [16]:
get_geno_combination_df(geno_each_sample,["rs77948203", "rs1014626"]).df

Unnamed: 0,rs77948203,rs1014626,unique_samples_id,unique_samples_count
0,AA,TT,"[WTCCCT474448, WTCCCT474394, WTCCCT474560, WTC...",78
1,AG,CT,"[WTCCCT451162, WTCCCT476263]",2
2,AG,,[WTCCCT508004],1
3,AG,TT,"[WTCCCT473522, WTCCCT473497, WTCCCT473524, WTC...",1942
4,GG,CT,"[WTCCCT489603, WTCCCT470545, WTCCCT465946, WTC...",39
5,GG,,"[WTCCCT473298, WTCCCT465970, WTCCCT466158, WTC...",10
6,GG,TT,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",12875


In [17]:
get_geno_combination_df(geno_each_sample,["rs77948203", "rs1014626"]).get_all_genos("rs1014626")

array(['TT', 'CT', 'NA'], dtype=object)

You can filter on several rsids at once by passing keyword arguments or by unpacking a dictionary with `**`.

In [18]:
query_result_1 = get_geno_combination_df(geno_each_sample,["rs77948203", "rs1014626"]).query(rs77948203= 'AG', rs1014626= 'TT')
query_result_1

Unnamed: 0,rs77948203,rs1014626,unique_samples_id,unique_samples_count
3,AG,TT,"[WTCCCT473522, WTCCCT473497, WTCCCT473524, WTC...",1942


In [19]:
query_result_2 = get_geno_combination_df(geno_each_sample,["rs77948203", "rs1014626"]).query(**{"rs77948203": 'AG', "rs1014626": 'TT'})
query_result_2

Unnamed: 0,rs77948203,rs1014626,unique_samples_id,unique_samples_count
3,AG,TT,"[WTCCCT473522, WTCCCT473497, WTCCCT473524, WTC...",1942


In [20]:
assert query_result_1.equals(query_result_2)

Querying an rsid that is not in `rsid_list` raises a `ValueError`.

In [21]:
with ExceptionExpected(ex=ValueError, regex = "Some Rsid are not in the dataframe"):test_rsid_info_obj.query(rs7794820='AA') #missing the last digit of rsid

In [22]:
get_geno_combination_df(geno_each_sample,["rs77948203"]).query(rs77948203='NA').empty

True

By default `get_geno_combination_df` returns an `RsidComboInfo` object that wraps the summary DataFrame and adds query helpers; pass `as_df=True` to get the plain DataFrame instead.

In [23]:
get_geno_combination_df(geno_each_sample,["rs1014626"]).df

Unnamed: 0,rs1014626,unique_samples_id,unique_samples_count
0,CT,"[WTCCCT489603, WTCCCT470545, WTCCCT465946, WTC...",41
1,,"[WTCCCT508004, WTCCCT473298, WTCCCT465970, WTC...",11
2,TT,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",14895


In [24]:
get_geno_combination_df(geno_each_sample,["rs1014626"], as_df = True)

Unnamed: 0,rs1014626,unique_samples_id,unique_samples_count
0,CT,"[WTCCCT489603, WTCCCT470545, WTCCCT465946, WTC...",41
1,,"[WTCCCT508004, WTCCCT473298, WTCCCT465970, WTC...",11
2,TT,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",14895


In [25]:
assert get_geno_combination_df(geno_each_sample,["rs1014626"]).df.equals(get_geno_combination_df(geno_each_sample,["rs1014626"], as_df = True))

---

In [26]:
test_geno_combination_df = get_geno_combination_df(geno_each_sample,["rs77948203", "rs1014626"])
test_geno_combination_df.df

Unnamed: 0,rs77948203,rs1014626,unique_samples_id,unique_samples_count
0,AA,TT,"[WTCCCT474448, WTCCCT474394, WTCCCT474560, WTC...",78
1,AG,CT,"[WTCCCT451162, WTCCCT476263]",2
2,AG,,[WTCCCT508004],1
3,AG,TT,"[WTCCCT473522, WTCCCT473497, WTCCCT473524, WTC...",1942
4,GG,CT,"[WTCCCT489603, WTCCCT470545, WTCCCT465946, WTC...",39
5,GG,,"[WTCCCT473298, WTCCCT465970, WTCCCT466158, WTC...",10
6,GG,TT,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",12875


In [27]:
test_geno_combination_df.num_samples_NA

11

In [28]:
NA_samples_df = test_geno_combination_df.df.loc[(test_geno_combination_df.df[["rs77948203", "rs1014626"]] == "NA").any(axis=1)]
NA_samples_df

Unnamed: 0,rs77948203,rs1014626,unique_samples_id,unique_samples_count
2,AG,,[WTCCCT508004],1
5,GG,,"[WTCCCT473298, WTCCCT465970, WTCCCT466158, WTC...",10


In [29]:
test_geno_combination_df.total_samples_with_NA

14947

In [30]:
test_eq(test_geno_combination_df.num_samples_NA, NA_samples_df["unique_samples_count"].sum())
test_eq(test_geno_combination_df.num_samples_NA, 11) 
test_eq(test_geno_combination_df.total_samples_with_NA, 14947)
test_eq(test_geno_combination_df.total_samples_no_NA, 14936)

In [31]:
get_geno_combination_df(geno_each_sample,["rs77948203", "rs1014626", "rs134490"])

RsidComboInfo(rsid_list=['rs77948203', 'rs1014626', 'rs134490'], NA_val='NA')

---

In [73]:
sample_file_full_df = sample_file.load_all_files()
sample_file_full_df

Unnamed: 0_level_0,ID_1,missing,sex,case,missing_col_generated,plate
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
WTCCCT473540,95300_D10,0,2,1,0,
WTCCCT473530,95300_D11,0,1,1,0,
WTCCCT473555,95300_D12,0,1,1,0,
WTCCCT473426,95300_E01,0,2,1,0,
WTCCCT473489,95300_E02,0,1,1,0,
...,...,...,...,...,...,...
WS574632,101916_C07,0,1,0,0,101916.0
WS574661,101916_D07,0,1,0,0,101916.0
BLOOD294452,101806_F08,0,1,0,0,101806.0
WTCCCT511021,101816_D11,0,2,0,0,101816.0


In [74]:
geno_each_sample

Unnamed: 0_level_0,rs77948203,rs1014626,rs9610458,rs5762201,rs1004237,rs134490,rs4821519,rs1003500,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WTCCCT473540,GG,TT,TT,GG,CC,,GG,CC,AG
WTCCCT473530,GG,TT,TT,GG,CC,TT,CG,CC,AA
WTCCCT473555,GG,TT,TT,,CC,TT,GG,CC,
WTCCCT473426,GG,TT,TT,GG,CC,TT,GG,CC,GG
WTCCCT473489,GG,TT,CT,GG,CC,,GG,CC,AA
...,...,...,...,...,...,...,...,...,...
WS574632,GG,TT,CT,GG,CC,TT,GG,CC,GG
WS574661,GG,TT,TT,GG,CC,TT,GG,CC,AA
BLOOD294452,GG,TT,CT,GG,CC,TT,GG,CC,AG
WTCCCT511021,GG,TT,CT,GG,CC,TT,GG,CC,AG


In [79]:
case_geno_each_sample

Unnamed: 0_level_0,rs77948203,rs1014626,rs9610458,rs5762201,rs1004237,rs134490,rs4821519,rs1003500,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
WTCCCT473540,GG,TT,TT,GG,CC,,GG,CC,AG
WTCCCT473530,GG,TT,TT,GG,CC,TT,CG,CC,AA
WTCCCT473555,GG,TT,TT,,CC,TT,GG,CC,
WTCCCT473426,GG,TT,TT,GG,CC,TT,GG,CC,GG
WTCCCT473489,GG,TT,CT,GG,CC,,GG,CC,AA
...,...,...,...,...,...,...,...,...,...
WTCCCT473455,GG,TT,TT,,CC,TT,GG,CC,AG
WTCCCT473479,GG,TT,CT,GG,CC,CT,GG,CC,GG
WTCCCT473432,GG,TT,CT,GG,CC,CT,GG,CC,AG
WTCCCT473465,GG,TT,CT,GG,CC,,GG,CC,AA


In [75]:
test = get_geno_combination_df(case_geno_each_sample,["rs9610458", "rs134490"])
test

RsidComboInfo(rsid_list=['rs9610458', 'rs134490'], NA_val='NA')

In [76]:
test.df

Unnamed: 0,rs9610458,rs134490,unique_samples_id,unique_samples_count
0,CC,CC,"[WTCCCT489620, WTCCCT489645, WTCCCT473287, WTC...",49
1,CC,CT,"[WTCCCT473552, WTCCCT473505, WTCCCT489578, WTC...",468
2,CC,,"[WTCCCT489646, WTCCCT489580, WTCCCT488814, WTC...",249
3,CC,TT,"[WTCCCT473500, WTCCCT473539, WTCCCT473521, WTC...",1126
4,CT,CC,"[WTCCCT473297, WTCCCT473230, WTCCCT473244, WTC...",126
5,CT,CT,"[WTCCCT473447, WTCCCT473466, WTCCCT473492, WTC...",1067
6,CT,,"[WTCCCT473489, WTCCCT473524, WTCCCT473499, WTC...",652
7,CT,TT,"[WTCCCT473456, WTCCCT473515, WTCCCT473508, WTC...",2714
8,,CC,"[WTCCCT473436, WTCCCT469571]",2
9,,CT,"[WTCCCT488883, WTCCCT474387, WTCCCT474448, WTC...",91


In [36]:
all_geno_df

Unnamed: 0_level_0,alleleA,alleleB,AA,AB,BB
id_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs1014626,C,T,CC,CT,TT
rs9610458,C,T,CC,CT,TT
rs5762201,A,G,AA,AG,GG
rs1004237,C,T,CC,CT,TT
rs134490,C,T,CC,CT,TT
rs4821519,G,C,GG,CG,CC
rs1003500,C,T,CC,CT,TT
rs5756405,A,G,AA,AG,GG


In [81]:
#export
class CaseControlOddsRatio(BaseModel):
    """Odds ratios of genotype combinations between a case and a control cohort."""
    # Genotype-combination summaries for each cohort.
    case: RsidComboInfo
    control: RsidComboInfo
    # Indexed by rsid; must provide the genotype columns "AA", "AB" and "BB".
    geno_df: pd.DataFrame

    @property
    def snp_cols(self):
        """Sorted rsid columns shared by the case and control summaries."""
        summary_cols = ['unique_samples_id', 'unique_samples_count']
        case_snp_cols = set(self.case.df.columns.difference(summary_cols))
        control_snp_cols = set(self.control.df.columns.difference(summary_cols))
        assert case_snp_cols == control_snp_cols
        return sorted(case_snp_cols)

    @property
    def possible_genotypes_single(self):
        """The "AA", "AB" and "BB" genotype columns of `geno_df`."""
        return self.geno_df[["AA", "AB", "BB"]]

    @property
    def possible_genotypes_combo(self):
        """Cartesian product of the possible genotypes, one column per rsid in `geno_df`."""
        genotypes = self.possible_genotypes_single
        per_rsid_genotypes = [genotypes.loc[rsid].tolist() for rsid in genotypes.index]
        return pd.DataFrame(product(*per_rsid_genotypes), columns=genotypes.index)

    @property
    def case_total_no_NA(self):
        return self.case.total_samples_no_NA

    @property
    def case_total_with_NA(self):
        return self.case.total_samples_with_NA

    @property
    def control_total_no_NA(self):
        return self.control.total_samples_no_NA

    @property
    def control_total_with_NA(self):
        return self.control.total_samples_with_NA

    def calculate_odds_ratio(self, query_geno_dict: Dict[str, str], ndigits = 5):
        """Odds ratio for one genotype combination, rounded to `ndigits`.

        Returns `np.nan` when the combination cannot be looked up as exactly
        one row in both cohorts (the lookup raises `ValueError`), or when
        `odds_ratio_calculator` reports a division by zero.
        """
        try:
            # `.item()` raises ValueError unless exactly one row matched;
            # `query` raises ValueError for rsids unknown to the cohort.
            geno_case = self.case.query(**query_geno_dict).unique_samples_count.item()
            geno_control = self.control.query(**query_geno_dict).unique_samples_count.item()
        except ValueError:
            return np.nan
        odds_ratio = odds_ratio_calculator(geno_case= geno_case,
                                           geno_control=geno_control,
                                           case_total_no_NA = self.case_total_no_NA,
                                           control_total_no_NA = self.control_total_no_NA)
        # The calculator signals division by zero with the string "NA";
        # rounding that would raise TypeError, so map it to NaN instead.
        if isinstance(odds_ratio, str):
            return np.nan
        return round(odds_ratio, ndigits = ndigits)

    @property
    def odds_ratios_df(self):
        """`possible_genotypes_combo` with an added `odds_ratio` column."""
        # Build the combination grid once and reuse it for lookup and output.
        geno_combo_df = self.possible_genotypes_combo
        odds_ratio = [self.calculate_odds_ratio(query_dict)
                      for query_dict in geno_combo_df.to_dict(orient="records")]
        return geno_combo_df.assign(odds_ratio = odds_ratio)


    class Config:
        # Needed so pydantic accepts pandas DataFrame fields.
        arbitrary_types_allowed = True
        
        
def odds_ratio_calculator(geno_case, geno_control, case_total_no_NA, control_total_no_NA):
    """
    Ratio of the case odds to the control odds of carrying a genotype.

    Each cohort's odds are its carrier count divided by the remaining
    samples (`total_no_NA - carriers`). Returns the string "NA" when any of
    the divisions raises `ZeroDivisionError`.
    """
    case_non_carriers = case_total_no_NA - geno_case
    control_non_carriers = control_total_no_NA - geno_control
    try:
        odds_in_cases = geno_case / case_non_carriers
        odds_in_controls = geno_control / control_non_carriers
        return odds_in_cases / odds_in_controls
    except ZeroDivisionError:
        return "NA"

In [39]:
test_case_control_odds_ratio_single = CaseControlOddsRatio(case = get_geno_combination_df(case_geno_each_sample, ["rs9610458"]),
                    control =  get_geno_combination_df(control_geno_each_sample, ["rs9610458"]),
                    geno_df = all_geno_df.loc[["rs9610458"]])

In [40]:
test_case_control_odds_ratio_single 

CaseControlOddsRatio(case=_Pydantic_RsidComboInfo_94916286233744(df=  rs9610458                                  unique_samples_id  \
0        CC  [WTCCCT473500, WTCCCT473552, WTCCCT473505, WTC...   
1        CT  [WTCCCT473489, WTCCCT473456, WTCCCT473515, WTC...   
2        NA  [WTCCCT473549, WTCCCT489615, WTCCCT489614, WTC...   
3        TT  [WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...   

   unique_samples_count  
0                  1892  
1                  4559  
2                   387  
3                  2934  , rsid_list=['rs9610458'], NA_val='NA'), control=_Pydantic_RsidComboInfo_94916286233744(df=  rs9610458                                  unique_samples_id  \
0        CC  [WTCCCT443026, WTCCCT443028, WTCCCT442386, WTC...   
1        CT  [WTCCCT443065, WTCCCT443066, WTCCCT443059, WTC...   
2        NA  [WTCCCT442413, WTCCCT444179, WTCCCT444212, WTC...   
3        TT  [WTCCCT443025, WTCCCT443063, WTCCCT443064, WTC...   

   unique_samples_count  
0                  1080 

In [41]:
test_case_control_odds_ratio_single.possible_genotypes_combo.to_dict(orient="records")

[{'rs9610458': 'CC'}, {'rs9610458': 'CT'}, {'rs9610458': 'TT'}]

In [48]:
test_eq(test_case_control_odds_ratio_single.case.total_samples_no_NA, 9385)
test_eq(test_case_control_odds_ratio_single.control.total_samples_no_NA, 5076)
test_eq(test_case_control_odds_ratio_single.calculate_odds_ratio({"rs9610458": "CC"}), 0.93426)
test_eq(test_case_control_odds_ratio_single.odds_ratios_df["odds_ratio"].tolist(), [0.93426, 0.93578, 1.14175])

In [49]:
test_case_control_odds_ratio = CaseControlOddsRatio(case = get_geno_combination_df(case_geno_each_sample, ["rs9610458", "rs134490"]),
                    control =  get_geno_combination_df(control_geno_each_sample, ["rs9610458", "rs134490"]),
                    geno_df = all_geno_df.loc[["rs9610458", "rs134490"]])

In [50]:
test_case_control_odds_ratio.possible_genotypes_single

Unnamed: 0_level_0,AA,AB,BB
id_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs9610458,CC,CT,TT
rs134490,CC,CT,TT


In [51]:
assert test_case_control_odds_ratio.possible_genotypes_single.equals(pd.DataFrame.from_dict({"rs9610458": ["CC", "CT", "TT"], 
                                                                                      "rs134490": ["CC", "CT", "TT"]},
                                                                                     orient = "index", columns = ["AA", "AB", "BB"]).rename_axis("id_col"))

In [52]:
assert test_case_control_odds_ratio.geno_df.loc[test_case_control_odds_ratio.snp_cols].equals(pd.DataFrame.from_dict({"rs134490": ["C", "T", "CC", "CT", "TT"], 
                                                                                      "rs9610458": ["C", "T", "CC", "CT", "TT"]},
                                                                                                              orient = "index",
                                                                                                              columns = ["alleleA", "alleleB", "AA", "AB", "BB"]).rename_axis("id_col"))

In [53]:
test_case_control_odds_ratio.case.df

Unnamed: 0,rs9610458,rs134490,unique_samples_id,unique_samples_count
0,CC,CC,"[WTCCCT489620, WTCCCT489645, WTCCCT473287, WTC...",49
1,CC,CT,"[WTCCCT473552, WTCCCT473505, WTCCCT489578, WTC...",468
2,CC,,"[WTCCCT489646, WTCCCT489580, WTCCCT488814, WTC...",249
3,CC,TT,"[WTCCCT473500, WTCCCT473539, WTCCCT473521, WTC...",1126
4,CT,CC,"[WTCCCT473297, WTCCCT473230, WTCCCT473244, WTC...",126
5,CT,CT,"[WTCCCT473447, WTCCCT473466, WTCCCT473492, WTC...",1067
6,CT,,"[WTCCCT473489, WTCCCT473524, WTCCCT473499, WTC...",652
7,CT,TT,"[WTCCCT473456, WTCCCT473515, WTCCCT473508, WTC...",2714
8,,CC,"[WTCCCT473436, WTCCCT469571]",2
9,,CT,"[WTCCCT488883, WTCCCT474387, WTCCCT474448, WTC...",91


In [54]:
test_case_control_odds_ratio.odds_ratios_df

id_col,rs9610458,rs134490,odds_ratio
0,CC,CC,1.001
1,CC,CT,1.00885
2,CC,TT,0.92318
3,CT,CC,1.07029
4,CT,CT,0.86588
5,CT,TT,0.98498
6,TT,CC,1.05632
7,TT,CT,1.06943
8,TT,TT,1.15646


In [56]:
test_eq(test_case_control_odds_ratio.case.total_samples_no_NA, 8048)

test_eq(test_case_control_odds_ratio.control.total_samples_no_NA, 4439)

test_eq(test_case_control_odds_ratio.odds_ratios_df["odds_ratio"].tolist(), [1.001, 1.00885, 0.92318, 1.07029, 0.86588, 0.98498, 1.05632, 1.06943, 1.15646])

test_eq(test_case_control_odds_ratio.case.query(rs9610458="CC", rs134490="CC").unique_samples_count.item(), 49)

In [57]:
#export
def odds_ratio_df_single_combined(*,case_geno_each_sample:pd.DataFrame, control_geno_each_sample:pd.DataFrame, all_geno_df:pd.DataFrame, single_rsid:str, combo_rsid_list:List[str]):
    """Put combination odds ratios side by side with those of one member SNP.

    `combo_rsid_list` has to contain `single_rsid`.

    Each row is one genotype combination of `combo_rsid_list`, carrying
    `odds_ratio_combo`, `odds_ratio_single`, the case/control sample ids and
    counts for the combination, and the four cohort totals of the
    combination model.
    """
    assert single_rsid in combo_rsid_list

    def _odds_ratio_model(rsids):
        # Same construction for the single-SNP and the combination model.
        return CaseControlOddsRatio(case = get_geno_combination_df(case_geno_each_sample, rsids),
                                    control = get_geno_combination_df(control_geno_each_sample, rsids),
                                    geno_df = all_geno_df.loc[rsids])

    single_model = _odds_ratio_model([single_rsid])
    combo_model = _odds_ratio_model(combo_rsid_list)

    combined_df = (
        combo_model.odds_ratios_df
        .merge(single_model.odds_ratios_df, on = single_rsid, suffixes = ["_combo", "_single"])
        .merge(combo_model.case.df, on = combo_rsid_list, how = "left")
        .merge(combo_model.control.df, on = combo_rsid_list, how = "left", suffixes = ["_case", "_control"])
    )

    # Cohort-wide totals are constants, repeated on every row.
    cohort_totals = {
        "case_total_no_NA": combo_model.case_total_no_NA,
        "case_total_with_NA": combo_model.case_total_with_NA,
        "control_total_no_NA": combo_model.control_total_no_NA,
        "control_total_with_NA": combo_model.control_total_with_NA,
    }
    for column_name, total in cohort_totals.items():
        combined_df[column_name] = total

    return combined_df

In [58]:
odds_ratio_df_single_combined(case_geno_each_sample = case_geno_each_sample,
                              control_geno_each_sample = control_geno_each_sample,
                             single_rsid = "rs9610458",
                              all_geno_df = all_geno_df,
                             combo_rsid_list = ["rs9610458", "rs77948203"])

Unnamed: 0,rs9610458,rs77948203,odds_ratio_combo,odds_ratio_single,unique_samples_id_case,unique_samples_count_case,unique_samples_id_control,unique_samples_count_control,case_total_no_NA,case_total_with_NA,control_total_no_NA,control_total_with_NA
0,CC,GG,0.94258,0.93426,"[WTCCCT473500, WTCCCT473552, WTCCCT473505, WTC...",1654,"[WTCCCT443026, WTCCCT443028, WTCCCT442386, WTC...",939,9385,9772,5076,5175
1,CC,AG,0.92122,0.93426,"[WTCCCT466268, WTCCCT489637, WTCCCT488814, WTC...",227,"[WTCCCT442411, WTCCCT444199, WTCCCT444742, WTC...",133,9385,9772,5076,5175
2,CC,AA,0.74339,0.93426,"[WTCCCT470057, WTCCCT489315, WTCCCT508408, WTC...",11,"[WTCCCT444162, WTCCCT442647, WTCCCT542697, WTC...",8,9385,9772,5076,5175
3,CT,GG,0.97583,0.93578,"[WTCCCT473489, WTCCCT473456, WTCCCT473515, WTC...",3956,"[WTCCCT443065, WTCCCT443066, WTCCCT443059, WTC...",2170,9385,9772,5076,5175
4,CT,AG,0.84428,0.93578,"[WTCCCT473524, WTCCCT473551, WTCCCT489609, WTC...",581,"[WTCCCT442429, WTCCCT443470, WTCCCT444145, WTC...",368,9385,9772,5076,5175
5,CT,AA,0.99156,0.93578,"[WTCCCT474394, WTCCCT470264, WTCCCT470548, WTC...",22,"[WTCCCT443346, WTCCCT444633, WTCCCT443601, WTC...",12,9385,9772,5076,5175
6,TT,GG,1.16218,1.14175,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",2549,"[WTCCCT443025, WTCCCT443063, WTCCCT443064, WTC...",1233,9385,9772,5076,5175
7,TT,AG,1.00665,1.14175,"[WTCCCT473522, WTCCCT473497, WTCCCT473514, WTC...",374,"[WTCCCT443058, WTCCCT442418, WTCCCT443471, WTC...",201,9385,9772,5076,5175
8,TT,AA,0.4952,1.14175,"[WTCCCT474560, WTCCCT469955, WTCCCT470219, WTC...",11,"[WTCCCT443119, WTCCCT442524, WTCCCT442733, WTC...",12,9385,9772,5076,5175


---

In [59]:
odds_ratio_df_rs9610458_rs77948203 = test_data_catalog.load("odds_ratio_df_rs9610458_rs77948203")
odds_ratio_df_rs9610458_rs77948203

Unnamed: 0,rs9610458,rs77948203,odds_ratio_combo,odds_ratio_single,unique_samples_id_case,unique_samples_count_case,unique_samples_id_control,unique_samples_count_control,case_total_no_NA,case_total_with_NA,control_total_no_NA,control_total_with_NA
0,CC,GG,0.943,0.934,"[WTCCCT473500, WTCCCT473552, WTCCCT473505, WTC...",1654,"[WTCCCT443026, WTCCCT443028, WTCCCT442386, WTC...",939,9385,9772,5076,5175
1,CC,AG,0.921,0.934,"[WTCCCT466268, WTCCCT489637, WTCCCT488814, WTC...",227,"[WTCCCT442411, WTCCCT444199, WTCCCT444742, WTC...",133,9385,9772,5076,5175
2,CC,AA,0.743,0.934,"[WTCCCT470057, WTCCCT489315, WTCCCT508408, WTC...",11,"[WTCCCT444162, WTCCCT442647, WTCCCT542697, WTC...",8,9385,9772,5076,5175
3,CT,GG,0.976,0.936,"[WTCCCT473489, WTCCCT473456, WTCCCT473515, WTC...",3956,"[WTCCCT443065, WTCCCT443066, WTCCCT443059, WTC...",2170,9385,9772,5076,5175
4,CT,AG,0.844,0.936,"[WTCCCT473524, WTCCCT473551, WTCCCT489609, WTC...",581,"[WTCCCT442429, WTCCCT443470, WTCCCT444145, WTC...",368,9385,9772,5076,5175
5,CT,AA,0.992,0.936,"[WTCCCT474394, WTCCCT470264, WTCCCT470548, WTC...",22,"[WTCCCT443346, WTCCCT444633, WTCCCT443601, WTC...",12,9385,9772,5076,5175
6,TT,GG,1.162,1.142,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",2549,"[WTCCCT443025, WTCCCT443063, WTCCCT443064, WTC...",1233,9385,9772,5076,5175
7,TT,AG,1.007,1.142,"[WTCCCT473522, WTCCCT473497, WTCCCT473514, WTC...",374,"[WTCCCT443058, WTCCCT442418, WTCCCT443471, WTC...",201,9385,9772,5076,5175
8,TT,AA,0.495,1.142,"[WTCCCT474560, WTCCCT469955, WTCCCT470219, WTC...",11,"[WTCCCT443119, WTCCCT442524, WTCCCT442733, WTC...",12,9385,9772,5076,5175


In [60]:
#export
def reconstruct_genetic_info(summary_df, rsid_list:List[str], exclude_NA=True):
    """Rebuild per-sample genotype tables from an odds-ratio summary table.

    `summary_df` must contain the `rsid_list` columns plus the list-valued
    columns `unique_samples_id_case` and `unique_samples_id_control`. Each
    list is exploded so that every sample id becomes one row, indexed by
    `sample_id`, with its genotype for each rsid.

    With `exclude_NA` set, rows with a missing value are dropped separately
    for each cohort, looking only at the rsid columns and that cohort's
    sample-id column. A genotype combination seen in only one cohort
    therefore still contributes its samples to that cohort. `summary_df`
    itself is not modified.

    Returns a dict with the keys `case_geno_each_sample` and
    `control_geno_each_sample`.
    """
    #TODO: handle the case where no sample ids are provided
    def _explode_cohort(sample_id_col: str) -> pd.DataFrame:
        # Column selection with a list yields a new frame, so the input stays untouched.
        cohort_df = summary_df.loc[:, list(rsid_list) + [sample_id_col]]
        if exclude_NA:
            cohort_df = cohort_df.loc[cohort_df.notna().all(axis=1), :]
        return (cohort_df
                .explode(sample_id_col)
                .set_index(sample_id_col)
                .rename_axis("sample_id"))

    return {"case_geno_each_sample": _explode_cohort("unique_samples_id_case"),
            "control_geno_each_sample": _explode_cohort("unique_samples_id_control")}

In [61]:
test_data_catalog.reload().load("odds_ratio_df_rs134490_rs1004237")

Unnamed: 0,rs134490,rs1004237,odds_ratio_combo,odds_ratio_single,unique_samples_id_case,unique_samples_count_case,unique_samples_id_control,unique_samples_count_control,case_total_no_NA,case_total_with_NA,control_total_no_NA,control_total_with_NA
0,CC,CC,1.002,1.015,"[WTCCCT489604, WTCCCT489620, WTCCCT489645, WTC...",241.0,"[WTCCCT443063, WTCCCT444200, WTCCCT444186, WTC...",130.0,8361,9772,4521,5175
1,CC,CT,,1.015,,,,,8361,9772,4521,5175
2,CC,TT,,1.015,,,,,8361,9772,4521,5175
3,CT,CC,0.95,0.948,"[WTCCCT473552, WTCCCT473447, WTCCCT473505, WTC...",2297.0,"[WTCCCT443025, WTCCCT443065, WTCCCT443026, WTC...",1289.0,8361,9772,4521,5175
4,CT,CT,,0.948,,,,,8361,9772,4521,5175
5,CT,TT,,0.948,,,,,8361,9772,4521,5175
6,TT,CC,1.051,1.05,"[WTCCCT473530, WTCCCT473555, WTCCCT473426, WTC...",5823.0,"[WTCCCT443066, WTCCCT443064, WTCCCT443028, WTC...",3101.0,8361,9772,4521,5175
7,TT,CT,,1.05,,,[WTCCCT510084],1.0,8361,9772,4521,5175
8,TT,TT,,1.05,,,,,8361,9772,4521,5175


In [62]:
reconstruct_genetic_info(odds_ratio_df_rs9610458_rs77948203, rsid_list = ["rs9610458", "rs77948203"])

{'case_geno_each_sample':              rs9610458 rs77948203
 sample_id                        
 WTCCCT473500        CC         GG
 WTCCCT473552        CC         GG
 WTCCCT473505        CC         GG
 WTCCCT473539        CC         GG
 WTCCCT473521        CC         GG
 ...                ...        ...
 WTCCCT447717        TT         AA
 WTCCCT449504        TT         AA
 WTCCCT449584        TT         AA
 WTCCCT508309        TT         AA
 WTCCCT470666        TT         AA
 
 [9385 rows x 2 columns],
 'control_geno_each_sample':              rs9610458 rs77948203
 sample_id                        
 WTCCCT443026        CC         GG
 WTCCCT443028        CC         GG
 WTCCCT442386        CC         GG
 WTCCCT443786        CC         GG
 WTCCCT443782        CC         GG
 ...                ...        ...
 WTCCCT542520        TT         AA
 WTCCCT543369        TT         AA
 WTCCC88658          TT         AA
 WTCCCT511508        TT         AA
 BLOOD293241         TT         AA
 
 [5076 

In [63]:
test_df_reconstructed = odds_ratio_df_single_combined(**reconstruct_genetic_info(odds_ratio_df_rs9610458_rs77948203,
                                                       rsid_list = ["rs9610458", "rs77948203"]),
                                                       single_rsid = "rs9610458",
                                                        all_geno_df = all_geno_df,
                                                    combo_rsid_list = ["rs9610458", "rs77948203"])
test_df_reconstructed

Unnamed: 0,rs9610458,rs77948203,odds_ratio_combo,odds_ratio_single,unique_samples_id_case,unique_samples_count_case,unique_samples_id_control,unique_samples_count_control,case_total_no_NA,case_total_with_NA,control_total_no_NA,control_total_with_NA
0,CC,GG,0.94258,0.93426,"[WTCCCT473500, WTCCCT473552, WTCCCT473505, WTC...",1654,"[WTCCCT443026, WTCCCT443028, WTCCCT442386, WTC...",939,9385,9385,5076,5076
1,CC,AG,0.92122,0.93426,"[WTCCCT466268, WTCCCT489637, WTCCCT488814, WTC...",227,"[WTCCCT442411, WTCCCT444199, WTCCCT444742, WTC...",133,9385,9385,5076,5076
2,CC,AA,0.74339,0.93426,"[WTCCCT470057, WTCCCT489315, WTCCCT508408, WTC...",11,"[WTCCCT444162, WTCCCT442647, WTCCCT542697, WTC...",8,9385,9385,5076,5076
3,CT,GG,0.97583,0.93578,"[WTCCCT473489, WTCCCT473456, WTCCCT473515, WTC...",3956,"[WTCCCT443065, WTCCCT443066, WTCCCT443059, WTC...",2170,9385,9385,5076,5076
4,CT,AG,0.84428,0.93578,"[WTCCCT473524, WTCCCT473551, WTCCCT489609, WTC...",581,"[WTCCCT442429, WTCCCT443470, WTCCCT444145, WTC...",368,9385,9385,5076,5076
5,CT,AA,0.99156,0.93578,"[WTCCCT474394, WTCCCT470264, WTCCCT470548, WTC...",22,"[WTCCCT443346, WTCCCT444633, WTCCCT443601, WTC...",12,9385,9385,5076,5076
6,TT,GG,1.16218,1.14175,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",2549,"[WTCCCT443025, WTCCCT443063, WTCCCT443064, WTC...",1233,9385,9385,5076,5076
7,TT,AG,1.00665,1.14175,"[WTCCCT473522, WTCCCT473497, WTCCCT473514, WTC...",374,"[WTCCCT443058, WTCCCT442418, WTCCCT443471, WTC...",201,9385,9385,5076,5076
8,TT,AA,0.4952,1.14175,"[WTCCCT474560, WTCCCT469955, WTCCCT470219, WTC...",11,"[WTCCCT443119, WTCCCT442524, WTCCCT442733, WTC...",12,9385,9385,5076,5076


In [64]:
odds_ratio_df_rs9610458_rs77948203

Unnamed: 0,rs9610458,rs77948203,odds_ratio_combo,odds_ratio_single,unique_samples_id_case,unique_samples_count_case,unique_samples_id_control,unique_samples_count_control,case_total_no_NA,case_total_with_NA,control_total_no_NA,control_total_with_NA
0,CC,GG,0.943,0.934,"[WTCCCT473500, WTCCCT473552, WTCCCT473505, WTC...",1654,"[WTCCCT443026, WTCCCT443028, WTCCCT442386, WTC...",939,9385,9772,5076,5175
1,CC,AG,0.921,0.934,"[WTCCCT466268, WTCCCT489637, WTCCCT488814, WTC...",227,"[WTCCCT442411, WTCCCT444199, WTCCCT444742, WTC...",133,9385,9772,5076,5175
2,CC,AA,0.743,0.934,"[WTCCCT470057, WTCCCT489315, WTCCCT508408, WTC...",11,"[WTCCCT444162, WTCCCT442647, WTCCCT542697, WTC...",8,9385,9772,5076,5175
3,CT,GG,0.976,0.936,"[WTCCCT473489, WTCCCT473456, WTCCCT473515, WTC...",3956,"[WTCCCT443065, WTCCCT443066, WTCCCT443059, WTC...",2170,9385,9772,5076,5175
4,CT,AG,0.844,0.936,"[WTCCCT473524, WTCCCT473551, WTCCCT489609, WTC...",581,"[WTCCCT442429, WTCCCT443470, WTCCCT444145, WTC...",368,9385,9772,5076,5175
5,CT,AA,0.992,0.936,"[WTCCCT474394, WTCCCT470264, WTCCCT470548, WTC...",22,"[WTCCCT443346, WTCCCT444633, WTCCCT443601, WTC...",12,9385,9772,5076,5175
6,TT,GG,1.162,1.142,"[WTCCCT473540, WTCCCT473530, WTCCCT473555, WTC...",2549,"[WTCCCT443025, WTCCCT443063, WTCCCT443064, WTC...",1233,9385,9772,5076,5175
7,TT,AG,1.007,1.142,"[WTCCCT473522, WTCCCT473497, WTCCCT473514, WTC...",374,"[WTCCCT443058, WTCCCT442418, WTCCCT443471, WTC...",201,9385,9772,5076,5175
8,TT,AA,0.495,1.142,"[WTCCCT474560, WTCCCT469955, WTCCCT470219, WTC...",11,"[WTCCCT443119, WTCCCT442524, WTCCCT442733, WTC...",12,9385,9772,5076,5175


In [65]:
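# The reconstructed per-sample tables contain no NA genotypes, so the *_with_NA totals necessarily differ and are dropped before comparing.
# The comparison is still False here because the stored fixture holds odds ratios rounded to 3 digits while the recomputed ones use 5.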
test_df_reconstructed.drop(columns=["case_total_with_NA", "control_total_with_NA"]).equals(odds_ratio_df_rs9610458_rs77948203.drop(columns=["case_total_with_NA", "control_total_with_NA"]))

False