In [1]:
# default_exp datasets.utils

# Utilities

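> Shared helpers for the dataset modules: the `requires` attribute-check decorator and the `cd` working-directory context manager.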

In [2]:
#hide
from nbdev.showdoc import *
import pandas as pd
import numpy as np
from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog
%load_ext autoreload
%autoreload 1

In [3]:
#export
from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel
import corradin_ovp_utils
from fastcore.basics import typed
from fastcore.dispatch import typedispatch

In [None]:
#export

class MissingAttributeError(Exception):
    """Raised when an object lacks an attribute that a `requires`-decorated method needs."""


def requires(*required_attrs):
    """Build a method decorator that checks `self` for attributes before the call.

    Parameters
    ----------
    *required_attrs : str
        Names of attributes that must be present on `self` (as reported by
        `hasattr`) for the decorated method to run.

    Returns
    -------
    callable
        A decorator. The decorated method keeps its name and docstring
        (via `functools.wraps`) and returns whatever the original returns.

    Raises
    ------
    MissingAttributeError
        Raised at call time (not at decoration time) when one or more of the
        required attributes is absent; the message names the missing ones.
    """
    # Local import: the module's import cell does not import functools, so
    # relying on a global name would fail with NameError when decorating.
    import functools

    def wrapper(method):

        @functools.wraps(method)
        def inner_wrapper(self, *args, **kargs):
            # Collect every missing name so the error is actionable in one pass.
            missing = [attr for attr in required_attrs if not hasattr(self, attr)]
            if missing:
                method_name = getattr(method, "__name__", "<method>")
                raise MissingAttributeError(
                    f"{type(self).__name__}.{method_name} requires missing attribute(s): "
                    + ", ".join(missing)
                )
            return method(self, *args, **kargs)

        return inner_wrapper
    return wrapper


from contextlib import contextmanager
import os

@contextmanager
def cd(newdir):
    """Temporarily switch the process working directory to `newdir`.

    `newdir` is passed through `os.path.expanduser`, so a leading `~` is
    expanded. The directory that was current on entry is restored when the
    `with` block exits, whether it finishes normally or raises.
    """
    original_dir = os.getcwd()
    target_dir = os.path.expanduser(newdir)
    os.chdir(target_dir)
    try:
        yield
    finally:
        # Always return to where we started, even if the body failed.
        os.chdir(original_dir)

In [None]:
# Load the "genetic_file" and "sample_file" entries from the test data catalog;
# the cells below inspect and combine these two objects.
genetic_file = test_data_catalog.load("genetic_file")
sample_file = test_data_catalog.load("sample_file")

In [None]:
# Inspect the resolved paths of the split-by-chromosome dataset; the displayed
# value is a nested mapping of group -> chromosome number -> Path.
test_data_catalog.load("genetic_file_split_by_chrom")._file_path.full_file_path

{'case': {1: Path('data/test_data/gen_file/test_CASE_MS_chr1.gen'),
  2: Path('data/test_data/gen_file/test_CASE_MS_chr2.gen'),
  3: Path('data/test_data/gen_file/test_CASE_MS_chr3.gen'),
  4: Path('data/test_data/gen_file/test_CASE_MS_chr4.gen'),
  5: Path('data/test_data/gen_file/test_CASE_MS_chr5.gen'),
  6: Path('data/test_data/gen_file/test_CASE_MS_chr6.gen'),
  7: Path('data/test_data/gen_file/test_CASE_MS_chr7.gen'),
  8: Path('data/test_data/gen_file/test_CASE_MS_chr8.gen'),
  9: Path('data/test_data/gen_file/test_CASE_MS_chr9.gen'),
  10: Path('data/test_data/gen_file/test_CASE_MS_chr10.gen'),
  11: Path('data/test_data/gen_file/test_CASE_MS_chr11.gen'),
  12: Path('data/test_data/gen_file/test_CASE_MS_chr12.gen'),
  13: Path('data/test_data/gen_file/test_CASE_MS_chr13.gen'),
  14: Path('data/test_data/gen_file/test_CASE_MS_chr14.gen'),
  15: Path('data/test_data/gen_file/test_CASE_MS_chr15.gen'),
  16: Path('data/test_data/gen_file/test_CASE_MS_chr16.gen'),
  17: Path('data/t

In [None]:
# Show the attributes of the sample file's `files` container; the output lists
# one file-format object each for 'case' and 'control'.
vars(sample_file.files)

{'case': SampleFileFormat(filepath=Path('data/test_data/sample_file/MS_impute2_ALL_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing'),
 'control': SampleFileFormat(filepath=Path('data/test_data/sample_file/ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv'), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], missing_col='missing')}

In [None]:
# Show the file path schema behind the sample file (private attribute, read
# here for inspection only).
sample_file._file_path

CaseControlFilePathSchema(case=SingleFilePathSchema(folder='data/test_data/sample_file', full_file_name='MS_impute2_ALL_sample_out.tsv', file_name='MS_impute2_ALL_sample_out', extension='tsv', split_by_chromosome=None), control=SingleFilePathSchema(folder='data/test_data/sample_file', full_file_name='ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv', file_name='ALL_controls_58C_NBS_WTC2_impute2_sample_out', extension='tsv', split_by_chromosome=None))

In [None]:
# Display the sample file's `files` container using its own repr.
sample_file.files

In [None]:
# Read `sample_ids` on the case file, reaching it through the container's
# instance `__dict__` rather than attribute access.
# NOTE(review): presumably done to view the stored value before it is assigned
# in the next cell — confirm whether plain `.case` access would do.
genetic_file.files.__dict__["case"].sample_ids

In [None]:
# Set the case genetic file's sample IDs to the index of the loaded case sample
# file. This mutates `genetic_file` in place, so the cells that follow depend
# on this cell having been run.
genetic_file.files.case.sample_ids = sample_file.files.case.load().index


In [None]:
# Confirm the assignment: display the sample IDs now stored on the case file.
genetic_file.files.case.sample_ids

Index(['WTCCCT473540', 'WTCCCT473530', 'WTCCCT473555', 'WTCCCT473426',
       'WTCCCT473489', 'WTCCCT473456', 'WTCCCT473435', 'WTCCCT473522',
       'WTCCCT473500', 'WTCCCT473537',
       ...
       'WTCCCT466427', 'WTCCCT449701', 'WTCCCT449713', 'WTCCCT449725',
       'WTCCCT473041', 'WTCCCT473455', 'WTCCCT473479', 'WTCCCT473432',
       'WTCCCT473465', 'WTCCCT473421'],
      dtype='object', name='ID_2', length=9772)

In [None]:
# Fetch the rows for three rsids from the case genetic file. In the displayed
# frame, the columns after rsid/position/ref/alt come in groups of three per
# sample ID (suffixes _1, _2, _3).
genetic_file.files.case.get_rsid_df(rsid_list=["rs77948203", "rs1014626", "rs134490"])

Unnamed: 0,dashes,rsid,position,ref,alt,WTCCCT473540_1,WTCCCT473540_2,WTCCCT473540_3,WTCCCT473530_1,WTCCCT473530_2,...,WTCCCT473479_3,WTCCCT473432_1,WTCCCT473432_2,WTCCCT473432_3,WTCCCT473465_1,WTCCCT473465_2,WTCCCT473465_3,WTCCCT473421_1,WTCCCT473421_2,WTCCCT473421_3
0,---,rs77948203,21249165,G,A,1,0.0,0.0,1,0.0,...,0,1,0,0,1,0.0,0.0,1,0,0
1,---,rs1014626,21461017,C,T,0,0.0,1.0,0,0.0,...,1,0,0,1,0,0.0,1.0,0,0,1
2,---,rs134490,28730175,C,T,0,0.232,0.768,0,0.014,...,0,0,1,0,0,0.356,0.644,0,0,1


In [None]:
# Display the index of the loaded case sample file, i.e. the value that was
# assigned to the genetic file's `sample_ids` earlier in the notebook.
sample_file.files.case.load().index

Index(['WTCCCT473540', 'WTCCCT473530', 'WTCCCT473555', 'WTCCCT473426',
       'WTCCCT473489', 'WTCCCT473456', 'WTCCCT473435', 'WTCCCT473522',
       'WTCCCT473500', 'WTCCCT473537',
       ...
       'WTCCCT466427', 'WTCCCT449701', 'WTCCCT449713', 'WTCCCT449725',
       'WTCCCT473041', 'WTCCCT473455', 'WTCCCT473479', 'WTCCCT473432',
       'WTCCCT473465', 'WTCCCT473421'],
      dtype='object', name='ID_2', length=9772)