In [1]:
# default_exp datasets.sample_file

# Sample file formats

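> Schema and loader for sample files: reads a delimited sample file into a DataFrame indexed by sample ID, optionally joins an external phenotype column, and flags samples with a missing phenotype.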

In [2]:
#hide
from nbdev.showdoc import *
from corradin_ovp_utils.catalog import test_data_catalog, conf_test_data_catalog, get_catalog

In [3]:
#export
from corradin_ovp_utils.datasets.schemas import SingleFilePathSchema
from typing import Any, Dict, List, Optional, Literal, Union
from pydantic import BaseModel
from pathlib import Path, PosixPath
from fastcore.meta import delegates
import pandas as pd
import numpy as np
from copy import deepcopy

# from enum import Enum
# import numpy as np
# from kedro.io import AbstractVersionedDataSet

# from pydantic.dataclasses import dataclass
# from dataclasses import InitVar, asdict
# from corradin_ovp_utils.datasets import genetic_datasets
# from kedro.io.core import (
#     AbstractVersionedDataSet,
#     DataSetError,
#     Version,
#     get_filepath_str,
#     get_protocol_and_path,
# )
# import fsspec


# from types import SimpleNamespace

In [4]:
#export

class SampleFileFormat(BaseModel):
    """Configuration of one sample file plus the logic to load it with pandas.

    Attributes:
        file_path: Schema object whose `get_full_file_path()` gives the file to read.
        pandas_args: Keyword arguments forwarded to `pd.read_csv`.
        sample_id_col: Column used as the index of the loaded frame.
        cov_cols: Names of covariate columns (not used by `load` itself).
        pheno_col_name: Column holding the phenotype. Overwritten by `load`
            when `pheno_col_file_info` is given.
        pheno_col_file_info: Optional `"<file path>|<index column>|<phenotype column>"`
            string pointing at a tab-separated file that is outer-joined onto
            the sample file.
        ignore_neg_id_samples: When truthy, samples whose integer ID is
            negative are flagged as missing as well.
        missing_col: Optional existing column that must match the generated
            missing flag. Set to `"missing_col_generated"` by `load` when unset.
    """

    file_path: SingleFilePathSchema
    pandas_args: Dict[str, Any]
    sample_id_col: str
    cov_cols: List[str]
    # Explicit `None` defaults keep these fields optional regardless of how the
    # installed pydantic version treats a bare `Optional[...]` annotation.
    pheno_col_name: Optional[str] = None
    pheno_col_file_info: Optional[str] = None
    ignore_neg_id_samples: Optional[bool] = None
    missing_col: Optional[str] = None

    @delegates(pd.read_csv)
    def load(self, with_missing_samples=True, subset: Optional[Literal["case", "control"]] = None, **kwargs):
        """Read the sample file and return it as a DataFrame indexed by sample ID.

        A `missing_col_generated` column is always added: 1 where the phenotype
        is NaN (or, with `ignore_neg_id_samples`, where the integer sample ID is
        negative), 0 otherwise.

        Args:
            with_missing_samples: When False, rows flagged as missing are dropped.
            subset: `"case"` keeps rows whose phenotype equals 1, `"control"`
                keeps rows whose phenotype equals 0. Any other truthy value only
                triggers the 0/1 phenotype validation.
            **kwargs: Extra `pd.read_csv` arguments; they take precedence over
                the configured `pandas_args`.

        Returns:
            The loaded (and optionally filtered) DataFrame.

        Raises:
            AssertionError: If `pheno_col_file_info` does not have three
                `|`-separated parts, if the generated missing flag differs from
                `missing_col`, or if `subset` is given and the non-missing
                phenotype values are not exactly 0 and 1.

        Side effects:
            Sets `self.pheno_col_name` when `pheno_col_file_info` is given, and
            sets `self.missing_col` to `"missing_col_generated"` when it is unset.
        """
        # Merge instead of double-splatting so a call-time keyword that is also
        # configured in `pandas_args` overrides it rather than raising TypeError.
        read_csv_args = {**self.load_args, **kwargs}
        df = pd.read_csv(filepath_or_buffer=self.file_path.get_full_file_path(), **read_csv_args)

        if self.pheno_col_file_info:
            info_parts = self.pheno_col_file_info.split("|")
            assert len(info_parts) == 3, (
                "pheno_col_file_info must look like '<file path>|<index column>|<phenotype column>'"
            )
            pheno_file_path, index_col_name, pheno_col_name = info_parts
            pheno_col_df = pd.read_csv(pheno_file_path, sep="\t", index_col=index_col_name)
            df = df.join(pheno_col_df, how="outer")
            self.pheno_col_name = pheno_col_name

        # Name the index after the optional join: joining two indexes with
        # different names produces an unnamed index, which would silently drop
        # the "sample_id" label.
        df.index.name = "sample_id"

        # Always read the phenotype column from the model; the name unpacked
        # above only exists when `pheno_col_file_info` is set.
        pheno_col = self.pheno_col_name

        missing_cond = df[pheno_col].isna()
        if self.ignore_neg_id_samples:
            missing_cond = missing_cond | (df.index.astype(int) < 0)
        df["missing_col_generated"] = np.where(missing_cond, 1, 0)

        if self.missing_col:
            assert df["missing_col_generated"].equals(df[self.missing_col]), (
                f"generated missing flag does not match column {self.missing_col!r}"
            )
        else:
            # Record the flag column on the model (like `pheno_col_name` above),
            # not as an attribute on the DataFrame, where it is lost on filtering.
            self.missing_col = "missing_col_generated"

        non_missing_df = df[df["missing_col_generated"] == 0]
        return_df = df if with_missing_samples else non_missing_df

        if subset:
            pheno_values = sorted(non_missing_df[pheno_col].unique())
            assert pheno_values == [0, 1], (
                f"expected phenotype values [0, 1] among non-missing samples, got {pheno_values}"
            )

        # Boolean masks instead of `query` strings, so phenotype column names
        # that are not valid Python identifiers still work.
        if subset == "case":
            return_df = return_df[return_df[pheno_col] == 1]
        elif subset == "control":
            return_df = return_df[return_df[pheno_col] == 0]

        return return_df

    @property
    def load_args(self):
        """Copy of `pandas_args` with `index_col` set to `sample_id_col`."""
        load_args = deepcopy(self.pandas_args)
        load_args["index_col"] = self.sample_id_col
        return load_args

In [5]:
# Load the "sample_file" entry from the test data catalog and display the case file as loaded.
test_sample_dataset = test_data_catalog.load("sample_file")
test_sample_dataset.files.case.load()

Unnamed: 0_level_0,ID_1,missing,sex,case,missing_col_generated
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WTCCCT473540,95300_D10,0,2,1,0
WTCCCT473530,95300_D11,0,1,1,0
WTCCCT473555,95300_D12,0,1,1,0
WTCCCT473426,95300_E01,0,2,1,0
WTCCCT473489,95300_E02,0,1,1,0
...,...,...,...,...,...
WTCCCT473455,116733_C03,0,1,1,0
WTCCCT473479,116733_D03,0,2,1,0
WTCCCT473432,116733_E03,0,2,1,0
WTCCCT473465,116733_F02,0,1,1,0


In [6]:
# Show the per-file configuration objects held by the dataset (one for cases, one for controls).
test_sample_dataset.files

namespace(case=SampleFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/sample_file', full_file_name='MS_impute2_ALL_sample_out.tsv', file_name='MS_impute2_ALL_sample_out', extension='tsv', split_by_chromosome=None, chrom_num=None), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], pheno_col_file_info=None, ignore_neg_id_samples=None, missing_col='missing'), control=SampleFileFormat(file_path=SingleFilePathSchema(folder='data/test_data/sample_file', full_file_name='ALL_controls_58C_NBS_WTC2_impute2_sample_out.tsv', file_name='ALL_controls_58C_NBS_WTC2_impute2_sample_out', extension='tsv', split_by_chromosome=None, chrom_num=None), pandas_args={'sep': ' ', 'skiprows': [1]}, sample_id_col='ID_2', cov_cols=['sex'], pheno_col_file_info=None, ignore_neg_id_samples=None, missing_col='missing'))

In [6]:
# Display the control file as loaded.
test_sample_dataset.files.control.load()

Unnamed: 0_level_0,ID_1,missing,plate,sex,case,missing_col_generated
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
WTCCCT443025,74717_A02,0,74717,2,0,0
WTCCCT443065,74717_A05,0,74717,2,0,0
WTCCCT443063,74717_A06,0,74717,2,0,0
WTCCCT443026,74717_B02,0,74717,2,0,0
WTCCCT443066,74717_B05,0,74717,2,0,0
...,...,...,...,...,...,...
WS574632,101916_C07,0,101916,1,0,0
WS574661,101916_D07,0,101916,1,0,0
BLOOD294452,101806_F08,0,101806,1,0,0
WTCCCT511021,101816_D11,0,101816,2,0,0


In [7]:
# Sample IDs must be unique within each file and must not be shared between the two files.
index_case = test_sample_dataset.files.case.load().index
index_control = test_sample_dataset.files.control.load().index
assert all(sample_index.is_unique for sample_index in (index_case, index_control))
assert set(index_case).isdisjoint(index_control)  # no overlap between cases and controls

---

In [25]:
# Uses the "cluster" catalog environment; presumably only runnable where that configuration and its data are available.
test_UKB_sample_file_with_pheno_col = (
    get_catalog(env="cluster", patterns=['catalog*', 'catalog*/*/', 'catalog*/*/*'])
    .reload()
    .load("test_UKB_sample_file_with_pheno_col")
)

In [24]:
# Show the full path that `load` will read for this dataset's single file.
test_UKB_sample_file_with_pheno_col.files.single_file.file_path.get_full_file_path()

Path('/lab/corradin_biobank/Raw_UKB_downloads/sample_files/ukb45624_imp_chr21_v3_s487275.sample')

In [12]:
# Load the file with missing samples included (the default) and pin the expected shape.
test_UKB_sample_file_with_pheno_col_df = test_UKB_sample_file_with_pheno_col.files.single_file.load()
assert test_UKB_sample_file_with_pheno_col_df.shape == (487440, 5)
test_UKB_sample_file_with_pheno_col_df

Unnamed: 0,ID_1,missing,sex,risk_taking_2040,missing_col_generated
-134,-134.0,0.0,0.0,,1
-133,-133.0,0.0,0.0,,1
-132,-132.0,0.0,0.0,,1
-131,-131.0,0.0,0.0,,1
-130,-130.0,0.0,0.0,,1
...,...,...,...,...,...
5873167,5873167.0,0.0,2.0,1.0,0
5873175,5873175.0,0.0,1.0,1.0,0
5873180,5873180.0,0.0,2.0,1.0,0
5873199,5873199.0,0.0,2.0,0.0,0


In [14]:
# Load without missing samples: rows flagged in `missing_col_generated` are dropped,
# so the row count shrinks while the column count stays the same.

assert test_UKB_sample_file_with_pheno_col.files.single_file.load(with_missing_samples=False).shape == (340520, 5)