In [None]:
#default_exp datasets.schemas

# Schemas

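> Schemas describing where dataset files live: a single file path (`SingleFilePathSchema`) and a case/control pair of file paths (`CaseControlFilePathSchema`).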

In [None]:
#hide
from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog


In [None]:
#export

from typing import Any, Dict, List, Optional, Literal, Union

from pydantic import BaseModel
from pydantic.dataclasses import dataclass
from dataclasses import InitVar, asdict
from kedro.io.core import (
    get_protocol_and_path,
)
from pathlib import Path
from fastcore.meta import delegates

In [None]:
#export

class SingleFilePathSchema(BaseModel):
    """Location of a single file, optionally templated by chromosome.

    Fields:
        folder: Directory that contains the file.
        full_file_name: File name including its extension. When
            ``split_by_chromosome`` is set, the name is a ``str.format``
            template containing a ``{chrom_num}`` placeholder.
        file_name: Part of ``full_file_name`` before the first ``"."``.
            Derived from ``full_file_name`` when not supplied.
        extension: Part of ``full_file_name`` after the last ``"."``.
            Derived from ``full_file_name`` when not supplied.
        split_by_chromosome: Whether a chromosome must be supplied to resolve
            the path.
        chrom_num: Declared but not read by any method in this class.
            NOTE(review): possibly meant as a default for ``chrom`` in
            ``get_full_file_path`` -- confirm before wiring it in.
    """

    folder: Union[str, Path]
    full_file_name: Union[str, Path]
    file_name: Optional[str] = None
    extension: Optional[str] = None
    split_by_chromosome: Optional[bool] = None
    chrom_num: Optional[int] = None

    def __init__(self, **data: Any):
        """Validate the fields, then fill in ``file_name`` / ``extension``.

        Only the values the caller left as ``None`` are derived; explicitly
        supplied values are kept.

        Raises:
            ValueError: If a value has to be derived and ``full_file_name``
                contains no ``"."`` (i.e. it has no extension).
        """
        super().__init__(**data)
        if self.file_name is None or self.extension is None:
            # `full_file_name` may be a Path, which has no `.split`.
            parts = str(self.full_file_name).split(".")
            if len(parts) < 2:
                raise ValueError(
                    f"`full_file_name` must include an extension, got {str(self.full_file_name)!r}"
                )
            # First and last dot-separated pieces; anything in between
            # (e.g. the "tar" of "x.tar.gz") is not stored.
            if self.file_name is None:
                self.file_name = parts[0]
            if self.extension is None:
                self.extension = parts[-1]

    def get_full_file_path(self, chrom: Optional[Union[int, str]] = None) -> Path:
        """Return ``folder / full_file_name`` with ``{chrom_num}`` filled in.

        Args:
            chrom: Value substituted for ``{chrom_num}`` in ``full_file_name``.
                Required when ``split_by_chromosome`` is truthy.

        Raises:
            ValueError: If ``split_by_chromosome`` is truthy and ``chrom`` is
                ``None``.
        """
        if self.split_by_chromosome and chrom is None:
            raise ValueError("Need to specify `chrom` argument")
        # `str(...)` so a Path-typed `full_file_name` can be formatted too.
        formatted_file_name = str(self.full_file_name).format(chrom_num=chrom)
        return Path(self.folder) / formatted_file_name

    @property
    def file_path_obj(self):
        """Return this schema itself."""
        return self

    @property
    def protocol_and_path(self):
        """Return the result of kedro's ``get_protocol_and_path`` for this file.

        A placeholder chromosome (``"test"``) is passed so that
        split-by-chromosome templates can be resolved without a real
        chromosome number.
        """
        return get_protocol_and_path(self.get_full_file_path(chrom="test").as_posix())

    @property
    def protocol(self):
        """Return the first element of ``protocol_and_path``."""
        return self.protocol_and_path[0]

    # TODO: validate `full_file_name` when `split_by_chromosome` is set
    # (e.g. that it contains a `{chrom_num}` placeholder).

class MultipleFilePathSchema():
    """Mixin that broadcasts attribute access over every dataclass field.

    Meant to be inherited by a dataclass whose fields each hold a file-path
    object. Any attribute that is not found on the container itself is looked
    up on every field value and the results are returned as a dict keyed by
    field name.
    """

    def __getattr__(self, attr, *args, **kwargs):
        """Look ``attr`` up on every field value and collect the results.

        Returns:
            ``{field_name: value.attr}`` when the attribute is plain data, or
            a function that calls ``value.attr(**kwargs)`` on every field and
            returns ``{field_name: result}`` when the attribute is callable
            (decided by inspecting the first field's attribute).

        Raises:
            AttributeError: For dunder names, for declared fields that are not
                set yet, when there are no fields to broadcast over, or when a
                field value lacks ``attr``.
        """
        # Special-method probes (copy, pickle, ...) must see a plain
        # AttributeError instead of a broadcast result.
        if attr.startswith("__") and attr.endswith("__"):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )
        # A declared field only reaches __getattr__ when it is not set yet
        # (e.g. on an instance being rebuilt by copy/pickle). Going through
        # to_dict() here would re-enter __getattr__ and recurse forever.
        if attr in getattr(type(self), "__dataclass_fields__", {}):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )

        initial_dict = {
            k: getattr(v, attr)
            for k, v in self.to_dict().items()
        }
        if not initial_dict:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )

        first_ele = next(iter(initial_dict.values()))

        # if we are accessing a function
        if callable(first_ele):
            return self.func_factory(initial_dict, first_ele)
        return initial_dict

    @staticmethod
    def func_factory(func_dict, sample_func):
        """Build a function that calls every function in ``func_dict``.

        Args:
            func_dict: Mapping of field name to callable.
            sample_func: Callable passed to ``delegates`` to decorate the
                returned function.

        Returns:
            A keyword-only function returning ``{field_name: func(**kwargs)}``.
        """
        @delegates(sample_func)
        def returned_func(**kwargs):
            return {
                k: func(**kwargs) for k, func in func_dict.items()
            }
        return returned_func

    def to_dict(self):
        """Return ``dataclasses.asdict(self)``."""
        return asdict(self)

    def apply_func(self, func, **kwargs):
        """Return ``{field_name: func(value, **kwargs)}`` over ``to_dict()``."""
        return {
            k: func(v, **kwargs) for k, v in self.to_dict().items()
        }
    
@dataclass
class CaseControlFilePathSchema(MultipleFilePathSchema):
    """Pair of file locations, one for cases and one for controls.

    Fields:
        case: Location of the case file.
        control: Location of the control file.
        common_folder: Init-only. When given, it is used as the ``folder`` of
            both ``case`` and ``control``, whose specs are then expected to be
            mappings of the remaining ``SingleFilePathSchema`` arguments.
    """

    case: SingleFilePathSchema
    control: SingleFilePathSchema
    common_folder: InitVar[Optional[str]] = None

    def __post_init__(self, common_folder):
        """Rebuild ``case`` and ``control`` under ``common_folder`` if given."""
        if common_folder is None:
            return
        for role in ("case", "control"):
            spec = getattr(self, role)
            setattr(self, role, SingleFilePathSchema(folder=common_folder, **spec))

    @property
    def protocol(self):
        """Return the protocol shared by the case and control files.

        Raises:
            ValueError: If the case and control protocols differ.
        """
        case_protocol = self.case.protocol
        control_protocol = self.control.protocol
        if case_protocol != control_protocol:
            raise ValueError(f"Currently only the same file system for case and control file is supported.\n Case is located in {case_protocol} system. Control is located in {control_protocol} ")
        return case_protocol
    
    
#     def __post_init_post_parse__(self, common_folder):
#         self.protocol, _ = self.case.protocol_and_path

            

---

### Testing single file path

In [None]:
conf_test_data_catalog["genetic_file_single"]["file_path"]

{'folder': 'data/test_data/gen_file',
 'full_file_name': 'test_CASE_MS_chr22.gen'}

In [None]:
# Build a single-file schema from the test catalog entry.
test_genetic_file_single_file_path = SingleFilePathSchema(**conf_test_data_catalog["genetic_file_single"]["file_path"])
# `file_path_obj` on a single-file schema returns the schema itself.
assert test_genetic_file_single_file_path.file_path_obj == test_genetic_file_single_file_path
# The relative test path is expected to resolve to the "file" protocol.
assert test_genetic_file_single_file_path.protocol == "file"

---

### Testing case control file path

In [None]:
conf_test_data_catalog["genetic_file"]["file_path"]

{'case': {'folder': 'data/test_data/gen_file',
  'full_file_name': 'test_CASE_MS_chr22.gen'},
 'control': {'folder': 'data/test_data/gen_file',
  'full_file_name': 'test_CONTROL_MS_chr22.gen'}}

In [None]:
conf_test_data_catalog["genetic_file"]["file_path"]["case"]

{'folder': 'data/test_data/gen_file',
 'full_file_name': 'test_CASE_MS_chr22.gen'}

In [None]:
# Build a case/control schema where each member carries its own folder.
test_genetic_file_cc_file_path = CaseControlFilePathSchema(**conf_test_data_catalog["genetic_file"]["file_path"])
# `file_path_obj` is not defined on the case/control schema, so the lookup is
# broadcast to both members and comes back as a dict keyed by field name.
assert test_genetic_file_cc_file_path.file_path_obj == {"case": SingleFilePathSchema(**conf_test_data_catalog["genetic_file"]["file_path"]["case"]),
                                                "control": SingleFilePathSchema(**conf_test_data_catalog["genetic_file"]["file_path"]["control"])}

In [None]:
test_genetic_file_cc_file_path.to_dict()

{'case': SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None),
 'control': SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr22.gen', file_name='test_CONTROL_MS_chr22', extension='gen', split_by_chromosome=None)}

Common folder

In [None]:
conf_test_data_catalog["genetic_file_common_folder"]

{'type': 'corradin_ovp_utils.datasets.OVPDataset.OVPDataset',
 'file_format': 'genetic_file.Gen.GenFileFormat',
 'load_args': {'prob_n_cols': 3,
  'initial_cols': ['dashes', 'rsid', 'position', 'ref', 'alt'],
  'rsid_col': 'rsid',
  'ref_col': 'ref',
  'alt_col': 'alt',
  'pandas_args': {'sep': ' ', 'header': None}},
 'file_type': 'OVPDataset.CaseControlFilePathSchema',
 'file_path': {'common_folder': 'data/test_data/gen_file',
  'case': {'full_file_name': 'test_CASE_MS_chr22.gen'},
  'control': {'full_file_name': 'test_CONTROL_MS_chr22.gen'}}}

In [None]:
# Here the catalog entry gives one `common_folder` instead of a `folder` per
# member; `__post_init__` uses it as the folder of both case and control.
test_genetic_file_cc_file_path_common_folder = CaseControlFilePathSchema(**conf_test_data_catalog["genetic_file_common_folder"]["file_path"])
# Display the broadcast `file_path_obj` dict for both members.
test_genetic_file_cc_file_path_common_folder.file_path_obj

{'case': SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None),
 'control': SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr22.gen', file_name='test_CONTROL_MS_chr22', extension='gen', split_by_chromosome=None)}

In [None]:
test_genetic_file_cc_file_path_common_folder

CaseControlFilePathSchema(case=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr22.gen', file_name='test_CASE_MS_chr22', extension='gen', split_by_chromosome=None), control=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr22.gen', file_name='test_CONTROL_MS_chr22', extension='gen', split_by_chromosome=None))

### Testing case control file path split by chromosome

In [None]:
conf_test_data_catalog["genetic_file_split_by_chrom"]["file_path"]

{'common_folder': 'data/test_data/gen_file',
 'case': {'split_by_chromosome': True,
  'full_file_name': 'test_CASE_MS_chr{chrom_num}.gen'},
 'control': {'split_by_chromosome': True,
  'full_file_name': 'test_CONTROL_MS_chr{chrom_num}.gen'}}

In [None]:
# Both members use a `{chrom_num}` template in `full_file_name` and set
# `split_by_chromosome`, so a chromosome is needed to resolve their paths.
test_genetic_file_split_by_chrom = CaseControlFilePathSchema(**conf_test_data_catalog["genetic_file_split_by_chrom"]["file_path"])
test_genetic_file_split_by_chrom

CaseControlFilePathSchema(case=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr{chrom_num}.gen', file_name='test_CASE_MS_chr{chrom_num}', extension='gen', split_by_chromosome=True), control=SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr{chrom_num}.gen', file_name='test_CONTROL_MS_chr{chrom_num}', extension='gen', split_by_chromosome=True))

In [None]:
test_genetic_file_split_by_chrom.file_path_obj

{'case': SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CASE_MS_chr{chrom_num}.gen', file_name='test_CASE_MS_chr{chrom_num}', extension='gen', split_by_chromosome=True),
 'control': SingleFilePathSchema(folder='data/test_data/gen_file', full_file_name='test_CONTROL_MS_chr{chrom_num}.gen', file_name='test_CONTROL_MS_chr{chrom_num}', extension='gen', split_by_chromosome=True)}

In [None]:
(test_genetic_file_split_by_chrom.to_dict().keys())

dict_keys(['case', 'control'])

In [None]:
# Method calls are broadcast too: `get_full_file_path` runs on both members
# with the same keyword argument and returns one path per member.
test_genetic_file_split_by_chrom.get_full_file_path(chrom=22)

{'case': PosixPath('data/test_data/gen_file/test_CASE_MS_chr22.gen'),
 'control': PosixPath('data/test_data/gen_file/test_CONTROL_MS_chr22.gen')}

In [None]:
# Plain attributes are broadcast as a dict keyed by member name.
test_genetic_file_split_by_chrom.extension

{'case': 'gen', 'control': 'gen'}

---