In [None]:
# default_exp datasets.genetic_file

# Genetic file formats

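> Read delimited genotype-probability files (for example `.gen` files) and convert each sample's probability columns into letter genotypes.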

In [None]:
#hide
from nbdev.showdoc import *
import pandas as pd
from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog
from fastcore.test import ExceptionExpected

In [None]:
#export

from typing import Any, Dict, List, Optional, Literal, Union
from enum import Enum
import numpy as np
from kedro.io import AbstractVersionedDataSet
from pydantic import BaseModel
from pathlib import Path, PosixPath
from pydantic.dataclasses import dataclass
from dataclasses import InitVar, asdict
from copy import deepcopy
import fsspec
from kedro.io.core import (
    AbstractVersionedDataSet,
    DataSetError,
    Version,
    get_filepath_str,
    get_protocol_and_path,
)
from kedro.extras.datasets.pandas import CSVDataSet
from fastcore.meta import delegates
from functools import partial, wraps, lru_cache
from types import SimpleNamespace
from tqdm.auto import tqdm
import pandas as pd
import itertools

In [None]:
#hide

def print_with_space(*args, **kwargs):
    """Forward everything to `print`, then print a `------` separator line."""
    separator = "------"
    print(*args, **kwargs)
    # The separator is always printed with default `print` settings.
    print(separator)


In [None]:
test_data_catalog.list()

['genetic_file',
 'genetic_file_common_folder',
 'genetic_file_single',
 'genetic_file_split_by_chrom',
 'sample_file',
 'geno_each_sample',
 'case_geno_each_sample',
 'control_geno_each_sample']

In [None]:
test_data_catalog.load("genetic_file")._file_path.full_file_path

{'case': Path('data/test_data/gen_file/test_CASE_MS_chr22.gen'),
 'control': Path('data/test_data/gen_file/test_CONTROL_MS_chr22.gen')}

In [None]:
#export

def row_vectorize(f):
    """Decorator: apply `f` to every last-axis row of an array.

    The wrapped function takes an array `X` (plus keyword arguments that are
    forwarded to `f`), flattens all leading axes so that each row has length
    `X.shape[-1]`, calls `f` on each row, and reshapes the collected results
    to `X.shape[:-1] + (-1,)`.
    """
    @wraps(f)
    def _apply_rowwise(X, **kwargs):
        flat_rows = X.reshape(-1, X.shape[-1])
        per_row = []
        for row in flat_rows:
            per_row.append(f(row, **kwargs))
        # Restore the leading axes; the last axis takes whatever size f produced.
        out_shape = X.shape[:-1] + (-1,)
        return np.reshape(per_row, out_shape)
    return _apply_rowwise


def triplicate_converter(sample_df, *, genotype_df, sample_id, high_lim=.9, low_lim=0.3, NA_val = "NA"):
    """
    Convert one sample's probability triplet into letter genotypes.

    `sample_df` must have exactly three columns, read in order as the
    probabilities of the homozygous-reference, heterozygous and
    homozygous-alternate genotypes; its index selects rows of `genotype_df`.
    `genotype_df` must provide the columns `homo_ref`, `het` and `homo_alt`.

    A genotype is called when its probability is >= `high_lim` and both other
    probabilities are < `low_lim`; otherwise the row gets `NA_val`.

    Returns a one-column DataFrame indexed like `sample_df`; the column is
    named `sample_id` and the column axis is named "sample_id".
    """
    probs = sample_df.astype(float)
    p_ref, p_het, p_alt = (probs[col] for col in sample_df)

    # Letter genotypes for exactly the rows of this sample frame, in its order.
    letters = genotype_df.loc[sample_df.index]

    def _confident(target, other_a, other_b):
        return (target >= high_lim) & (other_a < low_lim) & (other_b < low_lim)

    conditions = [
        _confident(p_ref, p_het, p_alt),
        _confident(p_het, p_ref, p_alt),
        _confident(p_alt, p_ref, p_het),
    ]
    choices = [letters["homo_ref"], letters["het"], letters["homo_alt"]]
    calls = np.select(conditions, choices, default=NA_val)

    result = pd.DataFrame(calls, index=sample_df.index, columns=[sample_id])
    result.columns.name = "sample_id"
    return result


  and should_run_async(code)


In [None]:
#export

class GenFileFormat(BaseModel):
    """Describe and read a delimited genotype-probability file.

    Each row is one variant: the `initial_cols` metadata columns come first,
    followed by `prob_n_cols` probability columns per sample. Column names are
    generated by `col_name_generator` and passed to `pd.read_csv` as `names`.

    `filepath` is either a single path or a dict mapping a chromosome key to a
    path; with a dict, every loading method needs the `chrom` argument.
    """
    filepath: Union[str, Path, Dict[int, Union[str, Path]]]
    prob_n_cols: int  # number of probability columns stored per sample
    initial_cols: List[str]  # names of the leading non-sample columns, in file order
    rsid_col: str
    ref_col: str
    alt_col: str
    ref_alt_delim: Optional[str] = None  # not read by any method yet (see TODO below)
    pandas_args: Dict[str, Any]  # keyword arguments forwarded to `pd.read_csv`
    sample_ids: Optional[List[str]] = None  # explicit sample names; None -> "sample1", "sample2", ...
    current_file_path: Optional[Union[str, Path]] = None  # file resolved by the most recent `load`

    # TODO: check that num_cols - initial cols is divisible by prob_n_cols
    # TODO: if ref col == alt col then ref_alt_delim needs to be specified, so ref,alt = ref_col.str.split(delim)

    def col_name_generator(self, include_initial_cols=True):
        """Yield column names in file order.

        Yields `initial_cols` first (unless `include_initial_cols` is False),
        then `<sample>_<k>` for k in 1..`prob_n_cols` for each sample. Without
        `sample_ids` the sample names are "sample1", "sample2", ... without end;
        with `sample_ids` the generator stops after the last id.
        """
        if include_initial_cols:
            yield from self.initial_cols

        if self.sample_ids is None:
            sample_ids_gen = (f"sample{i}" for i in itertools.count(1))
        else:
            sample_ids_gen = iter(self.sample_ids)

        # A `for` loop ends the generator cleanly when `sample_ids` is exhausted,
        # instead of leaking StopIteration (which Python turns into RuntimeError).
        for cur_sample_id in sample_ids_gen:
            for allele_prob_col_num in range(1, self.prob_n_cols + 1):
                yield f"{cur_sample_id}_{allele_prob_col_num}"

    @property
    def gen_info_cols(self):
        """The rsid, reference-allele and alternate-allele column names."""
        return [self.rsid_col, self.ref_col, self.alt_col]

    @property
    def column_headers(self):
        """Generated names, one per column found in the first row of the file.

        Raises:
            ValueError: if `sample_ids` provides too few names for the columns.
        """
        # Read only the first row to learn how many columns the file has.
        reader = self._load_unprocessed(chunksize=1)
        try:
            first_line_df = next(reader)
        finally:
            reader.close()  # do not leave the file handle open
        n_cols = len(first_line_df.columns)
        headers = list(itertools.islice(self.col_name_generator(), n_cols))
        if len(headers) < n_cols:
            raise ValueError(
                f"File has {n_cols} columns but only {len(headers)} column names "
                "could be generated; check `initial_cols`, `prob_n_cols` and `sample_ids`"
            )
        return headers

    @property
    def sample_cols(self):
        """Column headers that are not one of `initial_cols`."""
        initial = set(self.initial_cols)
        return [col for col in self.column_headers if col not in initial]

    @property
    def _sample_list(self):
        """Sample ids: `sample_ids` if given, else derived from the generated headers.

        Derived ids are sorted as strings ("sample1", "sample10", "sample100", ...).
        """
        if self.sample_ids is not None:
            return self.sample_ids
        return sorted({col.split("_")[0] for col in self.sample_cols})

    @property
    def num_samples(self):
        """Number of samples as an int (sample columns // `prob_n_cols`)."""
        return len(self.sample_cols) // self.prob_n_cols

    def get_resolved_file_path(self, chrom=None):
        """Return the path to read: `filepath[chrom]` for a dict, else `filepath`.

        Raises:
            ValueError: if `filepath` is a dict and `chrom` is None.
        """
        if isinstance(self.filepath, dict):
            if chrom is None:
                raise ValueError("Need to specify `chrom` argument")
            return self.filepath[chrom]
        return self.filepath

    @property
    def load_args(self):
        """Copy of `pandas_args` with `names` set to the generated column headers."""
        load_args = deepcopy(self.pandas_args)
        load_args["names"] = self.column_headers
        return load_args

    @delegates(pd.read_csv)
    def _load_unprocessed(self, **kwargs):
        """Read the current file with `pandas_args` only (no generated names)."""
        file_path = self.current_file_path
        if file_path is None:
            # No `load` has run yet: fall back to the configured path. For a
            # per-chromosome dict this raises the usual "Need to specify" error.
            file_path = self.get_resolved_file_path()
        return pd.read_csv(filepath_or_buffer=file_path, **self.pandas_args, **kwargs)

    @delegates(pd.read_csv)
    def load(self, chrom=None, **kwargs):
        """Read the file (for `chrom` when `filepath` is a dict) with generated column names.

        Side effect: stores the resolved path in `current_file_path` so that
        `_load_unprocessed` (and therefore `column_headers`) reads the same file.
        Extra keyword arguments go to `pd.read_csv`.
        """
        self.current_file_path = self.get_resolved_file_path(chrom)
        return pd.read_csv(filepath_or_buffer=self.current_file_path, **self.load_args, **kwargs)

    @delegates(pd.read_csv)
    def get_rsid_df(self, chrom=None, rsid_list=None, **kwargs):
        """Load only the rows whose `rsid_col` value is in `rsid_list`.

        With `rsid_list=None` the whole file is loaded. Otherwise the rsid
        column is read first to find the matching row positions, then the file
        is re-read skipping every other row. Extra keyword arguments apply to
        that final read only.
        """
        if rsid_list is None:
            return self.load(chrom=chrom, **kwargs)

        rsid_df = self.load(chrom=chrom, usecols=[self.rsid_col])
        # `isin` avoids building a query string, which breaks when the column
        # name is not a valid Python identifier.
        matches = rsid_df[self.rsid_col].isin(rsid_list)
        # NOTE(review): this treats the default integer index as the file's row
        # number, which assumes `pandas_args` does not consume a header row.
        keep_rows = set(rsid_df.index[matches].tolist())
        return self.load(chrom=chrom, skiprows=lambda x: x not in keep_rows, **kwargs)

    @delegates(load)
    def get_genotypes_df(self, chrom=None, rsid_list: List = None, **kwargs):
        """Return ref/alt alleles and the three letter genotypes, indexed by rsid.

        Adds the columns `homo_ref` (ref allele doubled), `het` (ref and alt
        sorted and joined) and `homo_alt` (alt allele doubled).
        """
        geno_df = self.get_rsid_df(chrom=chrom, rsid_list=rsid_list,
                                   usecols=self.gen_info_cols, **kwargs)
        ref = geno_df[self.ref_col]
        alt = geno_df[self.alt_col]
        geno_df["homo_ref"] = ref * 2
        geno_df["het"] = ["".join(sorted(pair)) for pair in zip(ref, alt)]
        geno_df["homo_alt"] = alt * 2
        return geno_df.set_index(self.rsid_col)

    def sample_columns_iter(self, chrom=None, rsid_list=None, **kwargs):
        """Yield one `SimpleNamespace(sample_id, sample_df)` per sample.

        `sample_df` holds that sample's `prob_n_cols` probability columns for
        the selected rsids, indexed by rsid.
        """
        df = self.get_rsid_df(chrom=chrom, rsid_list=rsid_list, **kwargs)
        df = df.set_index(self.rsid_col)

        for sample in self._sample_list:
            sample_cols = [f"{sample}_{allele_prob_col_num}"
                           for allele_prob_col_num in range(1, self.prob_n_cols + 1)]
            yield SimpleNamespace(sample_id=sample, sample_df=df[sample_cols])

    def apply_func_to_all_samples(self, func, rsid_list=None, chrom=None, **kwargs):
        """Call `func(sample_df, sample_id=..., **kwargs)` for each sample and
        concatenate the results column-wise.

        `func` has to accept a `sample_id` keyword argument.
        """
        per_sample_results = [
            func(sample_obj.sample_df, sample_id=sample_obj.sample_id, **kwargs)
            for sample_obj in tqdm(self.sample_columns_iter(chrom=chrom, rsid_list=rsid_list))
        ]
        return pd.concat(per_sample_results, axis=1)

    def get_geno_each_sample(self, *, chrom=None, rsid_list: List[str]):
        """Return letter genotypes with one row per sample and one column per rsid."""
        geno_each_sample_df = self.apply_func_to_all_samples(
            triplicate_converter,
            rsid_list=rsid_list,
            genotype_df=self.get_genotypes_df(chrom=chrom, rsid_list=rsid_list),
            chrom=chrom,
        )
        return geno_each_sample_df.T
    
#     @property
#     def single_line_iter(self, **runtime_kwargs):
#         return pd.read_csv(self.filepath,
#                            **self.load_args,
#                            **runtime_kwargs,
#                            chunksize=1)
    
#     #@delegates(pd.read_csv, but= list(self.load_args.keys()))
#     def load_full(self, **kwargs):
#         return pd.read_csv(self.filepath,
#                            **self.load_args,
#                           **kwargs)
    
#     #if ref_col = alt_col then ref_alt_delim need to be specified
    

---
### Test functionalities of `GenFileFormat`

In [None]:
conf_test_data_catalog["genetic_file"]["load_args"]

{'prob_n_cols': 3,
 'initial_cols': ['dashes', 'rsid', 'position', 'ref', 'alt'],
 'rsid_col': 'rsid',
 'ref_col': 'ref',
 'alt_col': 'alt',
 'pandas_args': {'sep': ' ', 'header': None}}

In [None]:
conf_test_data_catalog["genetic_file"]["file_path"]["case"]

{'folder': 'data/test_data/gen_file',
 'full_file_name': 'test_CASE_MS_chr22.gen'}

In [None]:
test_genetic_file_single_file_path = Path(conf_test_data_catalog["genetic_file_single"]["file_path"]["folder"])/(conf_test_data_catalog["genetic_file_single"]["file_path"]["full_file_name"])
test_genetic_file_single_file_path

Path('data/test_data/gen_file/test_CASE_MS_chr22.gen')

In [None]:
test_genfile_format = GenFileFormat(filepath = test_genetic_file_single_file_path,
                                    **conf_test_data_catalog["genetic_file"]["load_args"])
test_genfile_format

GenFileFormat(filepath=Path('data/test_data/gen_file/test_CASE_MS_chr22.gen'), prob_n_cols=3, initial_cols=['dashes', 'rsid', 'position', 'ref', 'alt'], rsid_col='rsid', ref_col='ref', alt_col='alt', ref_alt_delim=None, pandas_args={'sep': ' ', 'header': None}, sample_ids=None, current_file_path=None)

In [None]:
test_genfile_format.load() #genotypes()

Unnamed: 0,dashes,rsid,position,ref,alt,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,...,sample9769_3,sample9770_1,sample9770_2,sample9770_3,sample9771_1,sample9771_2,sample9771_3,sample9772_1,sample9772_2,sample9772_3
0,---,rs77948203,21249165,G,A,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
1,---,rs1014626,21461017,C,T,0,0.0,1.0,0,0.0,...,1,0,0,1,0,0.0,1.0,0,0,1
2,---,rs9610458,22205353,C,T,0,0.0,1.0,0,0.0,...,0,0,1,0,0,1.0,0.0,0,0,1
3,---,rs5762201,27888455,A,G,0,0.0,1.0,0,0.012,...,1,0,0,1,0,0.0,1.0,0,0,1
4,---,rs1004237,28068501,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
5,---,rs134490,28730175,C,T,0,0.232,0.768,0,0.014,...,0,0,1,0,0,0.356,0.644,0,0,1
6,---,rs4821519,37102100,G,C,1,0.0,0.0,0,1.0,...,0,1,0,0,1,0.0,0.0,1,0,0
7,---,rs1003500,37262769,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
8,---,rs5756405,37310954,A,G,0,1.0,0.0,1,0.0,...,1,0,1,0,1,0.0,0.0,0,1,0


In [None]:
test_header = test_genfile_format.column_headers
assert test_header[-1] == 'sample9772_3'

  and should_run_async(code)


In [None]:
# Pass the rsids by keyword: the first positional parameter of `get_rsid_df` is `chrom`,
# so a positional list would leave `rsid_list` as None and return every row.
test_genfile_format.get_rsid_df(rsid_list=["rs77948203", "rs9610458"])

Unnamed: 0,dashes,rsid,position,ref,alt,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,...,sample9769_3,sample9770_1,sample9770_2,sample9770_3,sample9771_1,sample9771_2,sample9771_3,sample9772_1,sample9772_2,sample9772_3
0,---,rs77948203,21249165,G,A,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
1,---,rs1014626,21461017,C,T,0,0.0,1.0,0,0.0,...,1,0,0,1,0,0.0,1.0,0,0,1
2,---,rs9610458,22205353,C,T,0,0.0,1.0,0,0.0,...,0,0,1,0,0,1.0,0.0,0,0,1
3,---,rs5762201,27888455,A,G,0,0.0,1.0,0,0.012,...,1,0,0,1,0,0.0,1.0,0,0,1
4,---,rs1004237,28068501,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
5,---,rs134490,28730175,C,T,0,0.232,0.768,0,0.014,...,0,0,1,0,0,0.356,0.644,0,0,1
6,---,rs4821519,37102100,G,C,1,0.0,0.0,0,1.0,...,0,1,0,0,1,0.0,0.0,1,0,0
7,---,rs1003500,37262769,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
8,---,rs5756405,37310954,A,G,0,1.0,0.0,1,0.0,...,1,0,1,0,1,0.0,0.0,0,1,0


In [None]:
# `chunksize` makes the filtered read return an iterator; take its first chunk.
# The rsids go in by keyword because the first positional parameter is `chrom`.
next(test_genfile_format.get_rsid_df(rsid_list=["rs77948203", "rs9610458"], chunksize=1))

  and should_run_async(code)


Unnamed: 0,dashes,rsid,position,ref,alt,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,...,sample9769_3,sample9770_1,sample9770_2,sample9770_3,sample9771_1,sample9771_2,sample9771_3,sample9772_1,sample9772_2,sample9772_3
0,---,rs77948203,21249165,G,A,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0


In [None]:
test_genfile_format.get_genotypes_df()

  and should_run_async(code)


Unnamed: 0_level_0,ref,alt,homo_ref,het,homo_alt
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs1014626,C,T,CC,CT,TT
rs9610458,C,T,CC,CT,TT
rs5762201,A,G,AA,AG,GG
rs1004237,C,T,CC,CT,TT
rs134490,C,T,CC,CT,TT
rs4821519,G,C,GG,CG,CC
rs1003500,C,T,CC,CT,TT
rs5756405,A,G,AA,AG,GG


In [None]:
test_genfile_format.get_genotypes_df(rsid_list = ["rs77948203", "rs9610458"])

  and should_run_async(code)


Unnamed: 0_level_0,ref,alt,homo_ref,het,homo_alt
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs9610458,C,T,CC,CT,TT


In [None]:
assert test_genfile_format.num_samples == 9772
assert test_genfile_format.gen_info_cols == ['rsid', 'ref', 'alt']

  and should_run_async(code)


In [None]:
first_sample = next(test_genfile_format.sample_columns_iter(rsid_list = ["rs9610458", "rs4821519"]))
first_sample.sample_df

Unnamed: 0_level_0,sample1_1,sample1_2,sample1_3
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rs9610458,0,0.0,1.0
rs4821519,1,0.0,0.0


In [None]:
first_sample.sample_id

  and should_run_async(code)


'sample1'

In [None]:
triplicate_converter(first_sample.sample_df,
                     sample_id = first_sample.sample_id,
                    genotype_df = test_genfile_format.get_genotypes_df())

sample_id,sample1
rsid,Unnamed: 1_level_1
rs9610458,TT
rs4821519,GG


In [None]:
test_genfile_format.get_genotypes_df()

  and should_run_async(code)


Unnamed: 0_level_0,ref,alt,homo_ref,het,homo_alt
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs1014626,C,T,CC,CT,TT
rs9610458,C,T,CC,CT,TT
rs5762201,A,G,AA,AG,GG
rs1004237,C,T,CC,CT,TT
rs134490,C,T,CC,CT,TT
rs4821519,G,C,GG,CG,CC
rs1003500,C,T,CC,CT,TT
rs5756405,A,G,AA,AG,GG


In [None]:
# `get_genotypes_df` takes `chrom` as its first positional parameter, so the rsids are passed by keyword.
test_genfile_format.apply_func_to_all_samples(triplicate_converter, 
                                              rsid_list = ["rs9610458", "rs4821519"],
                                              genotype_df = test_genfile_format.get_genotypes_df(rsid_list = ["rs9610458", "rs4821519"]))

  and should_run_async(code)


0it [00:00, ?it/s]

sample_id,sample1,sample10,sample100,sample1000,sample1001,sample1002,sample1003,sample1004,sample1005,sample1006,...,sample990,sample991,sample992,sample993,sample994,sample995,sample996,sample997,sample998,sample999
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rs9610458,TT,TT,TT,CT,TT,CC,CT,CT,CT,CC,...,TT,CT,CT,CT,TT,CT,CT,CT,CT,CT
rs4821519,GG,GG,GG,GG,CG,GG,GG,GG,GG,GG,...,GG,GG,GG,GG,GG,CC,GG,GG,GG,GG


If you don't pass `rsid_list`, every rsid in the file is processed, which can take a **long time** on a real dataset.

In [None]:
test_geno_each_sample = test_genfile_format.apply_func_to_all_samples(triplicate_converter, 
                                              genotype_df = test_genfile_format.get_genotypes_df()).T # transposed

test_geno_each_sample

  and should_run_async(code)


0it [00:00, ?it/s]

rsid,rs77948203,rs1014626,rs9610458,rs5762201,rs1004237,rs134490,rs4821519,rs1003500,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sample1,GG,TT,TT,GG,CC,,GG,CC,AG
sample10,GG,TT,TT,GG,CC,TT,GG,CC,AG
sample100,GG,TT,TT,GG,CC,TT,GG,CC,AG
sample1000,AG,TT,CT,GG,CC,CT,GG,CC,AA
sample1001,GG,TT,TT,GG,CC,CT,CG,CC,AG
...,...,...,...,...,...,...,...,...,...
sample995,GG,TT,CT,GG,CC,CT,CC,CC,AG
sample996,GG,TT,CT,GG,CC,TT,GG,CC,AG
sample997,GG,TT,CT,AG,CC,TT,GG,CC,AG
sample998,GG,TT,CT,GG,CC,TT,GG,CC,AA


In [None]:
test_data_catalog.save("geno_each_sample", test_geno_each_sample)

  and should_run_async(code)


In [None]:
test_geno_each_sample[["rs77948203", "rs1014626", "rs1004237"]].reset_index().groupby(["rs77948203", "rs1014626", "rs1004237"])["sample_id"].unique().reset_index() #.melt(value_vars = ["rs77948203", "rs1014626"], ignore_index=False).reset_index().groupby(["rsid", "value"])["index"].unique()

Unnamed: 0,rs77948203,rs1014626,rs1004237,sample_id
0,AA,TT,CC,"[sample1445, sample1545, sample2271, sample237..."
1,AA,TT,,"[sample465, sample6764]"
2,AG,CT,CC,"[sample5663, sample5840]"
3,AG,,CC,[sample297]
4,AG,TT,CC,"[sample1000, sample1002, sample1025, sample102..."
5,AG,TT,,"[sample6578, sample678]"
6,GG,CT,CC,"[sample1264, sample1359, sample1960, sample220..."
7,GG,,CC,"[sample1378, sample1496, sample2247, sample391..."
8,GG,TT,CC,"[sample1, sample10, sample100, sample1001, sam..."
9,GG,TT,,"[sample1601, sample2147, sample4016, sample421..."


In [None]:
test_geno_each_sample

  and should_run_async(code)


rsid,rs77948203,rs1014626,rs9610458,rs5762201,rs1004237,rs134490,rs4821519,rs1003500,rs5756405
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sample1,GG,TT,TT,GG,CC,,GG,CC,AG
sample10,GG,TT,TT,GG,CC,TT,GG,CC,AG
sample100,GG,TT,TT,GG,CC,TT,GG,CC,AG
sample1000,AG,TT,CT,GG,CC,CT,GG,CC,AA
sample1001,GG,TT,TT,GG,CC,CT,CG,CC,AG
...,...,...,...,...,...,...,...,...,...
sample995,GG,TT,CT,GG,CC,CT,CC,CC,AG
sample996,GG,TT,CT,GG,CC,TT,GG,CC,AG
sample997,GG,TT,CT,AG,CC,TT,GG,CC,AG
sample998,GG,TT,CT,GG,CC,TT,GG,CC,AA


In [None]:
test_genfile = test_genfile_format.load()
test_genfile

  and should_run_async(code)


Unnamed: 0,dashes,rsid,position,ref,alt,sample1_1,sample1_2,sample1_3,sample2_1,sample2_2,...,sample9769_3,sample9770_1,sample9770_2,sample9770_3,sample9771_1,sample9771_2,sample9771_3,sample9772_1,sample9772_2,sample9772_3
0,---,rs77948203,21249165,G,A,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
1,---,rs1014626,21461017,C,T,0,0.0,1.0,0,0.0,...,1,0,0,1,0,0.0,1.0,0,0,1
2,---,rs9610458,22205353,C,T,0,0.0,1.0,0,0.0,...,0,0,1,0,0,1.0,0.0,0,0,1
3,---,rs5762201,27888455,A,G,0,0.0,1.0,0,0.012,...,1,0,0,1,0,0.0,1.0,0,0,1
4,---,rs1004237,28068501,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
5,---,rs134490,28730175,C,T,0,0.232,0.768,0,0.014,...,0,0,1,0,0,0.356,0.644,0,0,1
6,---,rs4821519,37102100,G,C,1,0.0,0.0,0,1.0,...,0,1,0,0,1,0.0,0.0,1,0,0
7,---,rs1003500,37262769,C,T,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
8,---,rs5756405,37310954,A,G,0,1.0,0.0,1,0.0,...,1,0,1,0,1,0.0,0.0,0,1,0


---

### Test files split by chromosomes

In [None]:
test_split_by_chrom = test_data_catalog.load("genetic_file_split_by_chrom")


In [None]:
# `chrom` is given on every call here.
# 29321 columns matches 5 initial columns + 3 probability columns x 9772 samples,
# the same layout as the single-file fixture above.
assert test_split_by_chrom.files.case.load(chrom=22).shape == (9, 29321)
# Filtering by rsid keeps all columns but only the two requested rows.
assert test_split_by_chrom.files.case.get_rsid_df(chrom=22, rsid_list=["rs77948203", "rs1014626"]).shape == (2, 29321)

In [None]:
# Omitting `chrom` is expected to raise ValueError for this dataset, both on a
# direct load and on a filtered load.
with ExceptionExpected(ex=ValueError, regex = "Need to specify `chrom` argument"):
    test_split_by_chrom.files.case.load()
with ExceptionExpected(ex=ValueError, regex = "Need to specify `chrom` argument"):
    test_split_by_chrom.files.case.get_rsid_df(rsid_list=["rs77948203", "rs1014626"])

In [None]:
test_split_by_chrom.files.case.get_genotypes_df(chrom=22)

  and should_run_async(code)


Unnamed: 0_level_0,ref,alt,homo_ref,het,homo_alt
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs1014626,C,T,CC,CT,TT
rs9610458,C,T,CC,CT,TT
rs5762201,A,G,AA,AG,GG
rs1004237,C,T,CC,CT,TT
rs134490,C,T,CC,CT,TT
rs4821519,G,C,GG,CG,CC
rs1003500,C,T,CC,CT,TT
rs5756405,A,G,AA,AG,GG


In [None]:
test_split_by_chrom.files.case.get_genotypes_df(chrom=22, rsid_list=["rs77948203", "rs1014626"])

Unnamed: 0_level_0,ref,alt,homo_ref,het,homo_alt
rsid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs77948203,G,A,GG,AG,AA
rs1014626,C,T,CC,CT,TT
