In [None]:
# default_exp datasets.CombinedGenoPheno

# CombinedGenoPheno

> Combine genetic-file datasets with their matching sample files to get per-sample genotypes.

In [None]:
#hide
from nbdev.showdoc import *
import numpy as np
from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog
from corradin_ovp_utils.datasets.genetic_file import triplicate_converter
from corradin_ovp_utils.odds_ratio import get_geno_combination_df

In [None]:
#export
from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel
import pandas as pd
import corradin_ovp_utils
from fastcore.basics import typed
from fastcore.dispatch import typedispatch
from corradin_ovp_utils.datasets import OVPDataset 

In [None]:
#export
class CombinedGenoPheno(BaseModel):
    """Pair genetic-file datasets with their sample files to get per-sample genotypes.

    Two DataFrame fields are declared on the model, but every classmethod
    below operates on `OVPDataset` objects and none of them builds an instance.
    """
    genetic_file_df: pd.DataFrame
    sample_file_df: pd.DataFrame

    @classmethod
    def init_from_OVPDataset(cls,
                             genetic_dataset: OVPDataset.OVPDataset,
                             sample_dataset: OVPDataset.OVPDataset,
                             rsid_list: List[str],
                             chrom=None,
                             ):
        """Gather genotypes for `rsid_list` from every genetic file and concatenate them.

        `chrom` and `rsid_list` are forwarded unchanged to each file's
        `get_geno_each_sample`. The return value is the `pd.concat` of those
        per-file results, not a `CombinedGenoPheno` instance.
        """
        # NOTE(review): `.values()` is called on whatever `process_datasets`
        # returns; for the single-file overload that is the genetic dataset
        # object itself -- confirm it supports `.values()`.
        combined_files = cls.process_datasets(genetic_dataset, sample_dataset)
        per_file_genotypes = []
        for geno_file in combined_files.values():
            per_file_genotypes.append(
                geno_file.get_geno_each_sample(chrom=chrom, rsid_list=rsid_list)
            )
        return pd.concat(per_file_genotypes)

    @classmethod
    def process_datasets(cls, genetic_dataset: OVPDataset.OVPDataset, sample_dataset: OVPDataset.OVPDataset, ):
        """Pick the combiner matching the two datasets' `_file_path` schemas and apply it.

        Returns whatever the selected combiner returns for
        `(genetic_dataset, sample_dataset)`.
        """
        combiner = cls._process_file_type(genetic_dataset._file_path, sample_dataset._file_path)
        return combiner(genetic_dataset, sample_dataset)

    @typedispatch
    @classmethod
    def _process_file_type(cls, genetic_file_schema:OVPDataset.SingleFilePathSchema, sample_file_schema: OVPDataset.SingleFilePathSchema):
        """Overload for two single-file schemas: the combiner hands back its first argument untouched."""
        return lambda x, y: x

    @typedispatch
    @classmethod
    def _process_file_type(cls, genetic_file_schema:OVPDataset.MultipleFilePathSchema, sample_file_schema: OVPDataset.MultipleFilePathSchema):
        """Overload for two multi-file schemas: the combiner attaches sample ids to each genetic file."""
        assert genetic_file_schema.__class__ == sample_file_schema.__class__

        def combine_genetic_sample_multiple(genetic_dataset, sample_file_dataset):
            # Both `files` containers are read through their attribute dicts,
            # so each attribute name identifies one file.
            genetic_files = vars(genetic_dataset.files)
            sample_files = vars(sample_file_dataset.files)
            common_names = set(genetic_files) & set(sample_files)

            # The two datasets must expose exactly the same file names.
            assert common_names == genetic_files.keys() == sample_files.keys()

            # Mutates the genetic file objects in place: each one receives the
            # index of its loaded sample file as `sample_ids`.
            for name in common_names:
                genetic_files[name].sample_ids = sample_files[name].load().index
            return genetic_files

        return combine_genetic_sample_multiple

    class Config:
        # Needed so pydantic accepts the `pd.DataFrame` field annotations.
        arbitrary_types_allowed = True

In [None]:
# Load the genetic and sample-file test datasets from the test data catalog.
genetic_file = test_data_catalog.load("genetic_file")
sample_file = test_data_catalog.load("sample_file")

In [None]:
# Per-sample genotypes at four SNPs, concatenated across every genetic file.
test = CombinedGenoPheno.init_from_OVPDataset(
    genetic_file,
    sample_file,
    rsid_list=["rs77948203", "rs9610458", "rs134490", "rs5756405"],
)
test

  and should_run_async(code)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

rsid,rs77948203,rs9610458,rs134490,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WTCCCT473540,GG,TT,,AG
WTCCCT473530,GG,TT,TT,AA
WTCCCT473555,GG,TT,TT,
WTCCCT473426,GG,TT,TT,GG
WTCCCT473489,GG,CT,,AA
...,...,...,...,...
WS574632,GG,CT,TT,GG
WS574661,GG,TT,TT,AA
BLOOD294452,GG,CT,TT,AG
WTCCCT511021,GG,CT,TT,AG


In [None]:
# Group the samples in `test` by their genotype combination at two SNPs.
get_geno_combination_df(test, rsid_list=["rs9610458", "rs134490"]).df

Unnamed: 0,rs9610458,rs134490,unique_samples_id,unique_samples_count
0,CC,CC,"[WTCCCT489620, WTCCCT489645, WTCCCT473287, WTC...",76
1,CC,CT,"[WTCCCT473552, WTCCCT473505, WTCCCT489578, WTC...",724
2,CC,,"[WTCCCT489646, WTCCCT489580, WTCCCT488814, WTC...",381
3,CC,TT,"[WTCCCT473500, WTCCCT473539, WTCCCT473521, WTC...",1791
4,CT,CC,"[WTCCCT473297, WTCCCT473230, WTCCCT473244, WTC...",191
5,CT,CT,"[WTCCCT473447, WTCCCT473466, WTCCCT473492, WTC...",1733
6,CT,,"[WTCCCT473489, WTCCCT473524, WTCCCT473499, WTC...",959
7,CT,TT,"[WTCCCT473456, WTCCCT473515, WTCCCT473508, WTC...",4226
8,,CC,"[WTCCCT473436, WTCCCT469571, WTCCCT443738, WTC...",5
9,,CT,"[WTCCCT488883, WTCCCT474387, WTCCCT474448, WTC...",111


In [None]:
# Same grouping as the two-SNP call, now over three SNPs.
get_geno_combination_df(test, rsid_list=["rs9610458", "rs134490", "rs5756405"]).df

  and should_run_async(code)


Unnamed: 0,rs9610458,rs134490,rs5756405,unique_samples_id,unique_samples_count
0,CC,CC,AA,"[WTCCCT489620, WTCCCT505862, WTCCCT467112, WTC...",13
1,CC,CC,AG,"[WTCCCT489645, WTCCCT473287, WTCCCT470000, WTC...",29
2,CC,CC,GG,"[WTCCCT474572, WTCCCT466155, WTCCCT508245, WTC...",33
3,CC,CC,,[WTCCCT473042],1
4,CC,CT,AA,"[WTCCCT473505, WTCCCT507950, WTCCCT507923, WTC...",156
...,...,...,...,...,...
58,TT,,,"[WTCCCT515370, WTCCCT467086, WTCCCT508785, WTC...",15
59,TT,TT,AA,"[WTCCCT473530, WTCCCT473468, WTCCCT473462, WTC...",628
60,TT,TT,AG,"[WTCCCT473435, WTCCCT473522, WTCCCT473537, WTC...",1266
61,TT,TT,GG,"[WTCCCT473426, WTCCCT473527, WTCCCT489641, WTC...",668


In [None]:
# Keep only the "case" entry of the combined genetic files.
test_file = CombinedGenoPheno.process_datasets(genetic_file, sample_file)["case"]

  and should_run_async(code)


In [None]:
# Load the "genetic_file_split_by_chrom" catalog entry
# (presumably one genetic file per chromosome -- TODO confirm).
genetic_file_split_by_chrom = test_data_catalog.load("genetic_file_split_by_chrom")
genetic_file_split_by_chrom


  and should_run_async(code)


<nbdev_tutorial.datasets.OVPDataset.OVPDataset at 0x7fe38198fd00>

In [None]:
# Same extraction as before, against the split-by-chromosome dataset on chromosome 22.
test = CombinedGenoPheno.init_from_OVPDataset(
    genetic_file_split_by_chrom,
    sample_file,
    chrom=22,
    rsid_list=["rs77948203", "rs9610458"],
)


<function CombinedGenoPheno._process_file_type.<locals>.combine_genetic_sample_multiple at 0x7fe390909d30>


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Display the genotypes currently held in `test`.
test

  and should_run_async(code)


rsid,rs77948203,rs9610458
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
WTCCCT473540,GG,TT
WTCCCT473530,GG,TT
WTCCCT473555,GG,TT
WTCCCT473426,GG,TT
WTCCCT473489,GG,CT
...,...,...
WS574632,GG,CT
WS574661,GG,TT
BLOOD294452,GG,CT
WTCCCT511021,GG,CT


In [None]:
# NOTE(review): this repeats the "genetic_file_split_by_chrom" load from an
# earlier cell -- TODO confirm whether the reload is still needed.
genetic_file_split_by_chrom = test_data_catalog.load("genetic_file_split_by_chrom")


<nbdev_tutorial.datasets.OVPDataset.OVPDataset at 0x7fdf702f4d60>

In [None]:
# Load the "genetic_file_single" catalog entry (presumably a single-file dataset -- TODO confirm).
genetic_file_single = test_data_catalog.load("genetic_file_single")


In [None]:
# Inspect the named file entries stored as attributes on the sample dataset's `files`.
sample_file.files.__dict__.items()

dict_items([('case', SampleFileFormat(filepath=Path('data/test_data/sample_file/MS_impute2_ALL_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing')), ('control', SampleFileFormat(filepath=Path('data/test_data/sample_file/ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing'))])

In [None]:
# File names present in both the genetic and the sample dataset.
# (A stray leading `for` made this cell a SyntaxError; the intended code is
# the bare set intersection, which matches the recorded output.)
set(genetic_file.files.__dict__.keys()) & set(sample_file.files.__dict__.keys())

{'case', 'control'}

In [None]:
# Attribute dict of the genetic dataset's `files` container (name -> file object).
vars(genetic_file.files)


{'case': SampleFileFormat(filepath=Path('data/test_data/sample_file/MS_impute2_ALL_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing'),
 'control': SampleFileFormat(filepath=Path('data/test_data/sample_file/ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing')}

In [None]:
# Attribute dict of the sample dataset's `files` container (name -> file object).
vars(sample_file.files)

{'case': SampleFileFormat(filepath=Path('data/test_data/sample_file/MS_impute2_ALL_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing'),
 'control': SampleFileFormat(filepath=Path('data/test_data/sample_file/ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing')}