In [1]:
# default_exp datasets.CombinedGenoPheno

# Combining Genotype and Phenotype files

> Combine the genotype calls read from genetic files with the matching sample (phenotype) files.

In [2]:
#hide
from nbdev.showdoc import *
import numpy as np
from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog, get_catalog
from corradin_ovp_utils.datasets.genetic_file import triplicate_converter
from corradin_ovp_utils.odds_ratio import get_geno_combination_df
from fastcore.test import ExceptionExpected
import nbdev

In [None]:
#export
from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel
import pandas as pd
import corradin_ovp_utils
from fastcore.basics import typed, basic_repr
from fastcore.dispatch import typedispatch
from corradin_ovp_utils.datasets import OVPDataset
from corradin_ovp_utils.datasets.genetic_file import GeneticFileFormat
from corradin_ovp_utils.datasets.schemas import SingleFilePathSchema, MultipleFilePathSchema
import copy

In [None]:
#export
class CombinedGenoPheno(BaseModel):
    """Genotype calls of one or more sample subsets bundled with their sample tables.

    Instances are normally built with `init_from_OVPDataset`, which pairs every
    genetic file with the sample file registered under the same key
    (for example `case` / `control`).
    """
    # Concatenation, over all genetic files, of the first frame returned by
    # `get_geno_each_sample`.
    all_samples_geno_df: pd.DataFrame
    # Second frame returned by `get_geno_each_sample`, taken from the first
    # genetic file only.
    all_geno_df: pd.DataFrame
    # Loaded sample tables keyed by subset name.
    sample_dict: Dict[str, pd.DataFrame]
    # Genetic file objects keyed by the same subset names.
    genetic_files_dict: Dict[str, GeneticFileFormat]

    __repr__ = basic_repr("num_snps,num_samples")

    @property
    def num_snps(self):
        """Number of columns in `all_samples_geno_df`."""
        return len(self.all_samples_geno_df.columns)

    @property
    def num_samples(self):
        """Row count of each sample table, keyed by subset name."""
        return {key: value.shape[0] for key, value in self.sample_dict.items()}

    def get_geno_each_sample_subset(self, key):
        """Return the rows of `all_samples_geno_df` whose index appears in `sample_dict[key]`."""
        return self.all_samples_geno_df.loc[self.sample_dict[key].index]

    @property
    def sample_subsets(self):
        """Names of the available sample subsets."""
        return list(self.sample_dict.keys())

    @classmethod
    def init_from_OVPDataset(cls,
                             genetic_dataset: OVPDataset.OVPDataset,
                             sample_dataset: OVPDataset.OVPDataset,
                             rsid_dict: Dict[int, List[str]],
                             id_col_list: Optional[List[str]] = None,
                             batch_size: int = 1_000,
                             excluded_sample_ids: Optional[List[str]] = None,
                             ):
        """Build a `CombinedGenoPheno` from a genetic dataset and a sample dataset.

        Parameters
        ----------
        genetic_dataset, sample_dataset:
            Datasets whose `files` namespaces expose the same keys.
        rsid_dict:
            Mapping of chromosome number to the identifiers to look up.
        id_col_list:
            Forwarded to `get_geno_each_sample`; defaults to `["rsid"]`.
        batch_size:
            Forwarded to `get_geno_each_sample`.
        excluded_sample_ids:
            Sample ids dropped from the loaded sample tables and forwarded to
            `get_geno_each_sample`; defaults to no exclusions.

        Raises
        ------
        ValueError
            If the genetic dataset contains no file.
        """
        # Fresh lists per call: the values are handed to code outside this class,
        # so a shared mutable default could leak state between calls.
        id_col_list = ["rsid"] if id_col_list is None else list(id_col_list)
        excluded_sample_ids = [] if excluded_sample_ids is None else list(excluded_sample_ids)

        genetic_files_dict, sample_dict_loaded = cls.process_datasets(
            genetic_dataset, sample_dataset, excluded_sample_ids=excluded_sample_ids
        )
        if not genetic_files_dict:
            raise ValueError("genetic_dataset does not contain any genetic file")

        per_file_results = [
            genetic_file.get_geno_each_sample(
                rsid_dict,
                id_col_list=id_col_list,
                batch_size=batch_size,  # honour the caller's value (was hard-coded to 1_000)
                excluded_sample_ids=excluded_sample_ids,
            )
            for genetic_file in genetic_files_dict.values()
        ]
        # Any values after the first two returned by `get_geno_each_sample` are ignored.
        all_samples_geno_dfs, all_geno_dfs, *_ = zip(*per_file_results)

        # `cls` (not the hard-coded class name) so subclasses get their own type back.
        return cls(
            all_samples_geno_df=pd.concat(list(all_samples_geno_dfs)),
            # NOTE(review): only the first file's frame is kept — presumably it is
            # identical for every file; confirm.
            all_geno_df=all_geno_dfs[0],
            sample_dict=sample_dict_loaded,
            genetic_files_dict=genetic_files_dict,
        )

    @classmethod
    def process_datasets(cls,
                         genetic_dataset: OVPDataset.OVPDataset,
                         sample_dataset: OVPDataset.OVPDataset,
                         excluded_sample_ids: Optional[List[str]] = None):
        """Pair the genetic files with their sample files.

        Returns a tuple `(genetic_dict, sample_dict_loaded)`: the genetic file
        objects and the loaded sample tables, both keyed by subset name.
        """
        excluded_sample_ids = [] if excluded_sample_ids is None else list(excluded_sample_ids)
        combine_genetic_sample_func = cls._process_file_type(genetic_dataset._file_path, sample_dataset._file_path)
        genetic_dict, sample_dict_loaded = combine_genetic_sample_func(
            genetic_dataset, sample_dataset, excluded_sample_ids=excluded_sample_ids
        )
        return genetic_dict, sample_dict_loaded

    # NOTE: a `typedispatch` overload for `SingleFilePathSchema` inputs was sketched
    # here but is disabled; the method below is the only implementation.
    @classmethod
    def _process_file_type(cls, genetic_file_schema: MultipleFilePathSchema,
                           sample_file_schema: MultipleFilePathSchema):
        """Return the function that pairs genetic and sample files for these schemas.

        Both schemas must be instances of the same class.
        """
        assert genetic_file_schema.__class__ == sample_file_schema.__class__, (
            "genetic and sample datasets must use the same file path schema class, got "
            f"{genetic_file_schema.__class__.__name__} and {sample_file_schema.__class__.__name__}"
        )

        def combine_genetic_sample_multiple(genetic_dataset, sample_file_dataset, excluded_sample_ids: List[str]):
            # Work on copies so the datasets' own file objects are not modified.
            genetic_dict = copy.deepcopy(vars(genetic_dataset.files))
            sample_dict = copy.deepcopy(vars(sample_file_dataset.files))

            # Make sure the two datasets have exactly the same keys.
            assert set(genetic_dict.keys()) == set(sample_dict.keys()), (
                "genetic and sample datasets must define the same keys, got "
                f"{sorted(genetic_dict)} and {sorted(sample_dict)}"
            )

            sample_dict_loaded = {}
            # Iterate in the genetic dataset's own key order: iterating a set of
            # strings would make the order of `sample_dict_loaded` vary between runs.
            for key in genetic_dict:
                sample_file_loaded = sample_dict[key].load(with_missing_samples=True)
                genetic_dict[key].sample_ids = list(sample_file_loaded.index)
                genetic_dict[key].sample_file = sample_dict[key].file_path.get_full_file_path()
                # `@excluded_sample_ids` refers to this function's parameter.
                sample_dict_loaded[key] = sample_dict[key].load(with_missing_samples=False).query("index not in @excluded_sample_ids")

            return genetic_dict, sample_dict_loaded

        return combine_genetic_sample_multiple

    class Config:
        # Needed because pandas DataFrames are not pydantic-native field types.
        arbitrary_types_allowed = True

In [None]:
# Load the paired genetic and sample datasets from the test data catalog.
genetic_file = test_data_catalog.load("genetic_file")
sample_file = test_data_catalog.load("sample_file")

In [6]:
vars(genetic_file.files)


{'case': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22)),
 'control': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr22.gen', file_name='test_CONTROL_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22))}

In [7]:
vars(sample_file.files)

{'case': SampleFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/sample_file', full_file_name='MS_impute2_ALL_sample_out.tsv', file_name='MS_impute2_ALL_sample_out', extension='tsv', split_by_chromosome=None, chrom_num=None), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing'),
 'control': SampleFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/sample_file', full_file_name='ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv', file_name='ALL_controls_58C_NBS_WTC2_impute2_sample_out', extension='tsv', split_by_chromosome=None, chrom_num=None), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing')}

In [8]:
# Pair each genetic file with its sample file; both results are keyed by subset name.
geno_dict, sample_dict = CombinedGenoPheno.process_datasets(genetic_file, sample_file)

In [9]:
# Pick the genetic file object of the `case` subset and display it.
test_file = geno_dict["case"]
test_file

GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22))

In [10]:
# Load the chromosome 22 file object of the case genetic file.
test_file_chrom22 = test_file.load(chrom=22)
test_file_chrom22

GenFileObject(chrom=22, file_path=Path('data/test_data/gen_file/test_CASE_MS_chr22.gen'))

In [11]:
test_file_chrom22.load_df()

Unnamed: 0_level_0,dashes,rsid,position,alleleA,alleleB,WTCCCT473540,WTCCCT473540,WTCCCT473540,WTCCCT473530,WTCCCT473530,...,WTCCCT473479,WTCCCT473432,WTCCCT473432,WTCCCT473432,WTCCCT473465,WTCCCT473465,WTCCCT473465,WTCCCT473421,WTCCCT473421,WTCCCT473421
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,AA,AB,BB,AA,AB,...,BB,AA,AB,BB,AA,AB,BB,AA,AB,BB
0,---,rs77948203,21249165,G,A,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
1,---,rs1014626,21461017,C,T,0,0.0,1.0,0,0.0,...,1,0,0,1,0,0.0,1.0,0,0,1
2,---,rs9610458,22205353,C,T,0,0.0,1.0,0,0.0,...,0,0,1,0,0,1.0,0.0,0,0,1
3,---,rs5762201,27888455,A,G,0,0.0,1.0,0,0.012,...,1,0,0,1,0,0.0,1.0,0,0,1
4,---,rs1004237,28068501,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
5,---,rs134490,28730175,C,T,0,0.232,0.768,0,0.014,...,0,0,1,0,0,0.356,0.644,0,0,1
6,---,rs4821519,37102100,G,C,1,0.0,0.0,0,1.0,...,0,1,0,0,1,0.0,0.0,1,0,0
7,---,rs1003500,37262769,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
8,---,rs5756405,37310954,A,G,0,1.0,0.0,1,0.0,...,1,0,1,0,1,0.0,0.0,0,1,0


In [12]:
# Loading must be refused with a MemoryError when the file exceeds `size_limit`.
with ExceptionExpected(ex=MemoryError, regex="is too big, input limit is 10K."):
    test_file_chrom22.load_df(size_limit=10_000)

In [13]:
# Build the combined object for four chromosome 22 SNPs across all subsets.
test = CombinedGenoPheno.init_from_OVPDataset(genetic_file, sample_file, rsid_dict = {22: ["rs77948203", "rs9610458", "rs134490", "rs5756405"]})#["case"]
test

reading genetic file and collecting found SNPs for file data/test_data/gen_file/test_CASE_MS_chr22.gen


0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

reading genetic file and collecting found SNPs for file data/test_data/gen_file/test_CONTROL_MS_chr22.gen


0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

CombinedGenoPheno(num_snps=4, num_samples={'case': 9772, 'control': 5175})

In [14]:
test.genetic_files_dict

{'case': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22)),
 'control': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr22.gen', file_name='test_CONTROL_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22))}

In [15]:
test.genetic_files_dict

{'case': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22)),
 'control': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr22.gen', file_name='test_CONTROL_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22))}

In [16]:
test.all_geno_df

first,alleleA,alleleB,AA,AB,BB
id_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs9610458,C,T,CC,CT,TT
rs134490,C,T,CC,CT,TT
rs5756405,A,G,AA,AG,GG


In [17]:
test.all_samples_geno_df

id_col,rs77948203,rs9610458,rs134490,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WTCCCT473540,GG,TT,,AG
WTCCCT473530,GG,TT,TT,AA
WTCCCT473555,GG,TT,TT,
WTCCCT473426,GG,TT,TT,GG
WTCCCT473489,GG,CT,,AA
...,...,...,...,...
WS574632,GG,CT,TT,GG
WS574661,GG,TT,TT,AA
BLOOD294452,GG,CT,TT,AG
WTCCCT511021,GG,CT,TT,AG


In [18]:
test.get_geno_each_sample_subset("case")

id_col,rs77948203,rs9610458,rs134490,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WTCCCT473540,GG,TT,,AG
WTCCCT473530,GG,TT,TT,AA
WTCCCT473555,GG,TT,TT,
WTCCCT473426,GG,TT,TT,GG
WTCCCT473489,GG,CT,,AA
...,...,...,...,...
WTCCCT473455,GG,TT,TT,AG
WTCCCT473479,GG,CT,CT,GG
WTCCCT473432,GG,CT,CT,AG
WTCCCT473465,GG,CT,,AA


In [19]:
test.get_geno_each_sample_subset("control")

id_col,rs77948203,rs9610458,rs134490,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WTCCCT443025,GG,TT,CT,AG
WTCCCT443065,GG,CT,CT,AG
WTCCCT443063,GG,TT,CC,GG
WTCCCT443026,GG,CC,CT,AG
WTCCCT443066,GG,CT,TT,GG
...,...,...,...,...
WS574632,GG,CT,TT,GG
WS574661,GG,TT,TT,AA
BLOOD294452,GG,CT,TT,AG
WTCCCT511021,GG,CT,TT,AG


In [20]:
test.sample_dict

{'case':                     ID_1  missing  sex  case
 sample_id                                   
 WTCCCT473540   95300_D10        0    2     1
 WTCCCT473530   95300_D11        0    1     1
 WTCCCT473555   95300_D12        0    1     1
 WTCCCT473426   95300_E01        0    2     1
 WTCCCT473489   95300_E02        0    1     1
 ...                  ...      ...  ...   ...
 WTCCCT473455  116733_C03        0    1     1
 WTCCCT473479  116733_D03        0    2     1
 WTCCCT473432  116733_E03        0    2     1
 WTCCCT473465  116733_F02        0    1     1
 WTCCCT473421  116733_F03        0    1     1
 
 [9772 rows x 4 columns],
 'control':                     ID_1  missing   plate  sex  case
 sample_id                                           
 WTCCCT443025   74717_A02        0   74717    2     0
 WTCCCT443065   74717_A05        0   74717    2     0
 WTCCCT443063   74717_A06        0   74717    2     0
 WTCCCT443026   74717_B02        0   74717    2     0
 WTCCCT443066   74717_B05      

In [21]:
get_geno_combination_df(test.all_samples_geno_df, rsid_list=["rs9610458", "rs134490"]).df

Unnamed: 0,rs9610458,rs134490,unique_samples_id,unique_samples_count
0,CC,CC,"[WTCCCT489620, WTCCCT489645, WTCCCT473287, WTC...",76
1,CC,CT,"[WTCCCT473552, WTCCCT473505, WTCCCT489578, WTC...",724
2,CC,,"[WTCCCT489646, WTCCCT489580, WTCCCT488814, WTC...",381
3,CC,TT,"[WTCCCT473500, WTCCCT473539, WTCCCT473521, WTC...",1791
4,CT,CC,"[WTCCCT473297, WTCCCT473230, WTCCCT473244, WTC...",191
5,CT,CT,"[WTCCCT473447, WTCCCT473466, WTCCCT473492, WTC...",1733
6,CT,,"[WTCCCT473489, WTCCCT473524, WTCCCT473499, WTC...",959
7,CT,TT,"[WTCCCT473456, WTCCCT473515, WTCCCT473508, WTC...",4226
8,,CC,"[WTCCCT473436, WTCCCT469571, WTCCCT443738, WTC...",5
9,,CT,"[WTCCCT488883, WTCCCT474387, WTCCCT474448, WTC...",111


In [22]:
get_geno_combination_df(test.all_samples_geno_df, rsid_list=["rs9610458", "rs134490", "rs5756405"]).df

Unnamed: 0,rs9610458,rs134490,rs5756405,unique_samples_id,unique_samples_count
0,CC,CC,AA,"[WTCCCT489620, WTCCCT505862, WTCCCT467112, WTC...",13
1,CC,CC,AG,"[WTCCCT489645, WTCCCT473287, WTCCCT470000, WTC...",29
2,CC,CC,GG,"[WTCCCT474572, WTCCCT466155, WTCCCT508245, WTC...",33
3,CC,CC,,[WTCCCT473042],1
4,CC,CT,AA,"[WTCCCT473505, WTCCCT507950, WTCCCT507923, WTC...",156
...,...,...,...,...,...
58,TT,,,"[WTCCCT515370, WTCCCT467086, WTCCCT508785, WTC...",15
59,TT,TT,AA,"[WTCCCT473530, WTCCCT473468, WTCCCT473462, WTC...",628
60,TT,TT,AG,"[WTCCCT473435, WTCCCT473522, WTCCCT473537, WTC...",1266
61,TT,TT,GG,"[WTCCCT473426, WTCCCT473527, WTCCCT489641, WTC...",668


In [23]:
# Re-run the pairing step (same calls as in earlier cells) so the names are freshly bound.
geno_dict, sample_dict= CombinedGenoPheno.process_datasets(genetic_file, sample_file)
test_file = geno_dict["case"]

---

### Testing genetic file split by chromosome

In [24]:
# Load the genetic dataset whose files are split by chromosome.
genetic_file_split_by_chrom = test_data_catalog.load("genetic_file_split_by_chrom")
genetic_file_split_by_chrom


<corradin_ovp_utils.datasets.OVPDataset.OVPDataset at 0x150c1cb70100>

In [25]:
# Same construction with the per-chromosome dataset; only chromosome 22 is queried.
# NOTE(review): the "Cannot find file" messages in the output presumably come from
# chromosomes that have no test file — confirm.
test_split_by_chrom = CombinedGenoPheno.init_from_OVPDataset(genetic_file_split_by_chrom, sample_file, rsid_dict = {22: ["rs77948203", "rs9610458"]})#["case"]


Cannot find file data/test_data/gen_file/test_CASE_MS_chr1.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr2.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr3.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr4.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr5.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr6.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr7.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr8.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr9.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr10.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr11.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr12.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr13.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr14.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr15.gen
Cannot find file data/test_data/gen_file/test_CAS

0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr1.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr2.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr3.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr4.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr5.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr6.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr7.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr8.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr9.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr10.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr11.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr12.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr13.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr14.gen
Cannot find file data/test_data/gen_file/test_CONTROL_MS_chr15.gen
Cann

0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

Test the case where we query a chromosome for which we have no genetic file; this currently surfaces as an `AttributeError`.

In [26]:
# `ExceptionExpected` is the helper imported from `fastcore.test` at the top of the
# notebook; `nbdev_test` is not defined here and would raise NameError on a fresh kernel.
with ExceptionExpected(ex=AttributeError, regex="'NoneType' object has no attribute 'get_geno_each_sample'"):
    CombinedGenoPheno.init_from_OVPDataset(
        genetic_file_split_by_chrom,
        sample_file,
        rsid_dict={21: ["rs77948203", "rs9610458"]},
    )

Cannot find file data/test_data/gen_file/test_CASE_MS_chr1.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr2.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr3.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr4.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr5.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr6.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr7.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr8.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr9.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr10.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr11.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr12.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr13.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr14.gen
Cannot find file data/test_data/gen_file/test_CASE_MS_chr15.gen
Cannot find file data/test_data/gen_file/test_CAS

In [27]:
test_split_by_chrom

CombinedGenoPheno(num_snps=2, num_samples={'case': 9772, 'control': 5175})

---

In [5]:
# Load the datasets that hold a single genetic/sample file pair.
genetic_file_single = test_data_catalog.load("genetic_file_single")
sample_file_single = test_data_catalog.load("sample_file_single")

In [10]:
CombinedGenoPheno.process_datasets(genetic_file_single, sample_file_single)

({'single_file': GenFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None, chrom_num=22))},
 {'single_file':                     ID_1  missing  sex  case
  ID_2                                        
  WTCCCT473540   95300_D10        0    2     1
  WTCCCT473530   95300_D11        0    1     1
  WTCCCT473555   95300_D12        0    1     1
  WTCCCT473426   95300_E01        0    2     1
  WTCCCT473489   95300_E02        0    1     1
  ...                  ...      ...  ...   ...
  WTCCCT473455  116733_C03        0    1     1
  WTCCCT473479  116733_D03        0    2     1
  WTCCCT473432  116733_E03        0    2     1
  WTCCCT473465  116733_F02        0    1     1
  WTCCCT473421  116733_F03        0    1     1
  
  [9772 rows x 4 columns]})

In [12]:
# Build the combined object from the single-file datasets for four chromosome 22 SNPs.
test_single_file = CombinedGenoPheno.init_from_OVPDataset(genetic_file_single, sample_file_single, rsid_dict = {22: ["rs77948203", "rs9610458", "rs134490", "rs5756405"]}, )
test_single_file

reading genetic file and collecting found SNPs


0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

CombinedGenoPheno(num_snps=4, num_samples={'single_file': 9772})

In [14]:
test_single_file.all_samples_geno_df

id_col,rs77948203,rs9610458,rs134490,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WTCCCT473540,GG,TT,,AG
WTCCCT473530,GG,TT,TT,AA
WTCCCT473555,GG,TT,TT,
WTCCCT473426,GG,TT,TT,GG
WTCCCT473489,GG,CT,,AA
...,...,...,...,...
WTCCCT473455,GG,TT,TT,AG
WTCCCT473479,GG,CT,CT,GG
WTCCCT473432,GG,CT,CT,AG
WTCCCT473465,GG,CT,,AA


In [16]:
test_single_file.get_geno_each_sample_subset("single_file")

id_col,rs77948203,rs9610458,rs134490,rs5756405
ID_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
WTCCCT473540,GG,TT,,AG
WTCCCT473530,GG,TT,TT,AA
WTCCCT473555,GG,TT,TT,
WTCCCT473426,GG,TT,TT,GG
WTCCCT473489,GG,CT,,AA
...,...,...,...,...
WTCCCT473455,GG,TT,TT,AG
WTCCCT473479,GG,CT,CT,GG
WTCCCT473432,GG,CT,CT,AG
WTCCCT473465,GG,CT,,AA


### Testing a mix of rsids and positions

In [18]:
# Look variants up by either column: one identifier is an rsid, the other a position.
test_position_and_rsid = CombinedGenoPheno.init_from_OVPDataset(genetic_file, sample_file,
                                                                rsid_dict = {22: ["rs77948203", "21461017"]},
                                                                id_col_list = ["rsid", "position"] )#["case"]
test_position_and_rsid

reading genetic file and collecting found SNPs


0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

reading genetic file and collecting found SNPs


0it [00:00, ?it/s]

processing last batch


0it [00:00, ?it/s]

CombinedGenoPheno(num_snps=2, num_samples={'case': 9772, 'control': 5175})

In [20]:
test_position_and_rsid.all_samples_geno_df

id_col,rs77948203,21461017
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
WTCCCT473540,GG,TT
WTCCCT473530,GG,TT
WTCCCT473555,GG,TT
WTCCCT473426,GG,TT
WTCCCT473489,GG,TT
...,...,...
WS574632,GG,TT
WS574661,GG,TT
BLOOD294452,GG,TT
WTCCCT511021,GG,TT


In [26]:
# Both requested identifiers (one rsid, one position) must end up as a column.
# A plain assert avoids relying on `nbdev` exposing fastcore's `test_eq` at top level.
assert test_position_and_rsid.all_samples_geno_df.shape[1] == 2

---

### Testing with BGEN

In [16]:
# Build the catalog for the `cluster` environment and list its entries.
# NOTE(review): the entries loaded below point to absolute /lab/... paths, so this
# section presumably only runs on the cluster — confirm.
bgen_catalog = get_catalog(env="cluster", patterns = ['catalog*', 'catalog*/*/','catalog*/*/*'])
bgen_catalog = bgen_catalog.reload()
bgen_catalog.list()

['MS_genetic_file_split_by_chrom',
 'MS_sample_file',
 'UKB_genetic_file_bgen_split_by_chrom',
 'UKB_sample_file_basic_March_27_2021',
 'test_UKB_sample_file_with_pheno_col',
 'test_MS_genetic_file_split_by_chrom',
 'genetic_file_bgen',
 'test_MS_sample_file']

In [17]:
# Load the BGEN genetic dataset and the sample dataset that carries a phenotype column.
genetic_file_bgen = bgen_catalog.load("genetic_file_bgen")
sample_file_bgen = bgen_catalog.load("test_UKB_sample_file_with_pheno_col")

In [18]:
genetic_file_bgen.files

namespace(single_file=BgenFileFormat(file_path=SingleFilePathSchema(folder='/lab/corradin_biobank/Raw_UKB_downloads/BGEN/', full_file_name='ukb_imp_chr{chrom_num}_v3.bgen', file_name='ukb_imp_chr{chrom_num}_v3', extension='bgen', split_by_chromosome=True, chrom_num=None)))

In [19]:
sample_file_bgen.files

namespace(single_file=SampleFileFormat(file_path=SingleFilePathSchema(folder='/lab/corradin_biobank/Raw_UKB_downloads/sample_files/', full_file_name='ukb45624_imp_chr21_v3_s487275.sample', file_name='ukb45624_imp_chr21_v3_s487275', extension='sample', split_by_chromosome=None, chrom_num=None), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], pheno_col_file_info='data/test_data/pheno_col/UKB_risk_taking_2040_pheno_col.tsv|f.eid', ignore_neg_id_samples=True, missing_col=None))

In [20]:
genetic_file_bgen.files.single_file

BgenFileFormat(file_path=SingleFilePathSchema(folder='/lab/corradin_biobank/Raw_UKB_downloads/BGEN/', full_file_name='ukb_imp_chr{chrom_num}_v3.bgen', file_name='ukb_imp_chr{chrom_num}_v3', extension='bgen', split_by_chromosome=True, chrom_num=None))

In [21]:
sample_file_bgen.files.single_file

SampleFileFormat(file_path=SingleFilePathSchema(folder='/lab/corradin_biobank/Raw_UKB_downloads/sample_files/', full_file_name='ukb45624_imp_chr21_v3_s487275.sample', file_name='ukb45624_imp_chr21_v3_s487275', extension='sample', split_by_chromosome=None, chrom_num=None), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], pheno_col_file_info='data/test_data/pheno_col/UKB_risk_taking_2040_pheno_col.tsv|f.eid', ignore_neg_id_samples=True, missing_col=None)

In [22]:
# Build the combined object from the BGEN dataset for four chromosome 22 SNPs.
test_combine_geno_pheno_bgen = CombinedGenoPheno.init_from_OVPDataset(genetic_file_bgen, sample_file_bgen, rsid_dict = {22: ["rs77948203", "rs9610458", "rs134490", "rs5756405"]})




Loading chromosome 22


Found variants: 4/4
 Not found: 0/4.
 Percent found 100%


reading -- time=0:00:00.00, thread 1 of 4, part 1 of 1


In [24]:
test_combine_geno_pheno_bgen.sample_dict

{'single_file':                ID_1  missing  sex  case  missing_col_generated
 -134         -134.0      0.0  0.0   NaN                      1
 -133         -133.0      0.0  0.0   NaN                      1
 -132         -132.0      0.0  0.0   NaN                      1
 -131         -131.0      0.0  0.0   NaN                      1
 -130         -130.0      0.0  0.0   NaN                      1
 ...             ...      ...  ...   ...                    ...
  5873167  5873167.0      0.0  2.0   1.0                      0
  5873175  5873175.0      0.0  1.0   1.0                      0
  5873180  5873180.0      0.0  2.0   1.0                      0
  5873199  5873199.0      0.0  2.0   0.0                      0
  5873208  5873208.0      0.0  1.0   NaN                      1
 
 [487440 rows x 5 columns]}