In [None]:
#| default_exp catalogs

In [None]:
#| export
from fastcore.basics import basic_repr
import pandas as pd
from pathlib import Path
import types

from fastcore.meta import delegates
from fastcore.test import *
from typing import List, Literal, Dict, Union

import yaml
import intake
from intake.catalog.local import LocalCatalogEntry
from intake import Catalog, open_catalog


In [None]:
#| export

def has_error(func, *args, **kwargs):
    """Report whether calling `func(*args, **kwargs)` raises an exception.

    Parameters
    ----------
    func : callable
        The callable to probe.
    *args, **kwargs
        Forwarded to `func` unchanged.

    Returns
    -------
    bool
        `False` if the call completed, `True` if it raised an `Exception`.
        The return value of `func` is discarded.
    """
    try:
        func(*args, **kwargs)
    # Catch `Exception` rather than everything: a bare `except` would also
    # swallow KeyboardInterrupt / SystemExit and make the call uninterruptible.
    except Exception:
        return True
    return False

In [None]:
#| export
def load_readable_name_to_catalog_name(intake_catalog):
    """Map each entry's `description` to the name it is stored under.

    Iterates over `intake_catalog`, looks each name up again with
    `intake_catalog[name]` and uses that object's `.description` as the key.
    If two entries share a description, the one iterated last wins.
    """
    description_to_name = {}
    for data_name in intake_catalog:
        readable_name = intake_catalog[data_name].description
        description_to_name[readable_name] = data_name
    return description_to_name

In [None]:
#| export
# Folder that is globbed for `*.yaml` files to build the default `intake_catalog` below.
# NOTE(review): hard-coded absolute path; consider reading it from configuration or an
# environment variable so the module is usable on other machines.
intake_catalog_folder = "/lab/corradin_biobank/FOR_AN/lab-central-webapp/data/00_intake_catalogs/"


### Reading catalogs

In [None]:
#| export

class IntakeCatalogWrapper():
    """Convenience wrapper around an intake catalog.

    Opens the catalog at `cat_uri` and offers helpers to load datasets either
    by their catalog name or by their human-readable `description`.

    Parameters
    ----------
    cat_uri
        Anything accepted by `intake.open_catalog`.
    process_func : callable, optional
        When given, a `read_and_process(**kwargs)` method is attached to every
        catalog entry. It reads the entry and returns
        `process_func(df, entry_name, **kwargs)`.
    """
    __repr__ = basic_repr("num_datasets")

    def __init__(self, cat_uri, process_func=None):
        self.catalog = intake.open_catalog(cat_uri)
        if process_func:
            # The factory only closes over `process_func`, so a single function
            # can be bound to every entry instead of rebuilding it per iteration.
            read_and_process_func = self._read_and_process_factory(process_func)
            for entry in self.catalog._entries:
                self.catalog[entry].read_and_process = types.MethodType(read_and_process_func,  self.catalog[entry])

    def _read_and_process_factory(self, process_func):
        """Build the function that gets bound to catalog entries as `read_and_process`."""
        # Inside `read_and_process`, `self` is the object the function is bound
        # to (a catalog entry), not this wrapper.
        def read_and_process(self, **kwargs):
            df = self.read()
            processed_df = process_func(df, self.name, **kwargs)
            return processed_df
        return read_and_process

    @property
    def num_datasets(self):
        """Summary string with the number of catalog keys (used by `__repr__`)."""
        return f"Intake catalog. Number of datasets: {len(list(self.catalog.keys()))}"

    @property
    def readable_name_to_data_name_dict(self):
        """Mapping from each entry's description to its catalog name (rebuilt on every access)."""
        return load_readable_name_to_catalog_name(self.catalog)

    def load_data(self, data_name):
        """Read and return the dataset stored under `data_name`."""
        df = self.catalog[data_name].read()
        return df

    def load_data_by_readable_name(self,readable_name):
        """Read the dataset whose description equals `readable_name`.

        Raises `KeyError` when no entry has that description.
        """
        data_name = self.readable_name_to_data_name_dict[readable_name]
        return self.load_data(data_name)

    def load_data_dict_by_readable_names(self, readable_names):
        """Read several datasets at once.

        Returns a dict keyed by readable name whose values are the results of
        `load_data_by_readable_name`. Raises `KeyError` on the first unknown name.
        """
        # Previously this method had no `self` and called the sibling method as
        # a bare name, so it could not run.
        return {readable_name: self.load_data_by_readable_name(readable_name) for readable_name in readable_names}
        

In [None]:
#| export
# Default catalog built from every `*.yaml` file in `intake_catalog_folder`.
# `intake_catalog_folder` already ends with "/", so the pattern below contains "//".
# NOTE(review): this cell is marked `#| export`, so presumably the catalog is opened
# when the generated module is imported; confirm that side effect is intended.
intake_catalog = IntakeCatalogWrapper(f"{intake_catalog_folder}/*.yaml")
intake_catalog

__main__.IntakeCatalogWrapper(num_datasets='Intake catalog. Number of datasets: 0')

In [None]:
# Show the description -> dataset-name lookup built from the wrapped catalog.
intake_catalog.readable_name_to_data_name_dict

{}

In [None]:
# Load a dataset through its description; raises KeyError when no catalog entry
# carries this exact description.
intake_catalog.load_data_by_readable_name('scATAC-seq peaks from Yang et al 2022, biorxiv')

KeyError: 'scATAC-seq peaks from Yang et al 2022, biorxiv'

In [None]:
# Check that one of the sample data files exists.
# NOTE(review): hard-coded absolute local path; this is not portable across machines.
Path('/Users/ahoang/Documents/Work/webapp/HIC_database/hg38/ensembl_all_genes_TSS_hg38').is_file()

True

### Creating and updating catalogs

In [None]:
#| export

def generate_data_catalog(*, path: str,
                          glob_string:str,
                          catalog_name :str = None,
                          catalog_description:str = None,
                          driver = "csv",
                          csv_kwargs={"sep": "\t"},
                          data_types: List[str] = [],
                          tags: List[str] = [],
                          cat_metadata: Union[Dict[str, str], None]= None):
    """Build an in-memory intake catalog with one placeholder entry per matched file.

    Parameters
    ----------
    path
        Directory to search (made absolute before globbing).
    glob_string
        Glob pattern applied inside `path`; every match becomes one entry
        named after the file's stem.
    catalog_name, catalog_description
        Name/description of the catalog. Each falls back to a string derived
        from `path` when not given.
    driver
        Intake driver used for every entry.
    csv_kwargs
        Passed to each entry as its `csv_kwargs` argument (tab-separated by default).
    data_types
        Stored under `data_types` in the catalog metadata.
    tags
        Stored under `tags` in every entry's metadata.
    cat_metadata
        Extra catalog-level metadata; its keys override the defaults
        (`version`, `data_types`).

    Returns
    -------
    Catalog
        Catalog whose entries carry the "Fill me in please!" placeholder for the
        description and every citation field, to be completed by hand.
    """
    # Always start from the defaults so `version` is kept even when the caller
    # supplies extra metadata; caller-supplied keys still win.
    catalog_metadata_combined = {"version": 1, "data_types": list(data_types), **(cat_metadata or {})}
    abs_path = Path(path).absolute()
    cat_name = catalog_name if catalog_name else f"catalog_generated_from_{path}"
    cat_description = catalog_description if catalog_description else f"Catalog generated from {path}"
    all_entries_dict = {}
    file_paths = abs_path.glob(glob_string)
    for file in file_paths:
        file_name = file.stem
        entry = LocalCatalogEntry(name = file_name,
                                  description = "Fill me in please!",
                                  driver = driver,
                                  # Copy per entry: sharing one dict/list object (in particular the
                                  # mutable defaults) would let an edit to one entry leak into all others.
                                  args = {"urlpath": file.as_posix(), "csv_kwargs": dict(csv_kwargs)},
                                  metadata = {"tags" : list(tags),
                                              "citation":{"paper_name": "Fill me in please!",
                                                        "first_author_lastname": "Fill me in please!",
                                                        "published_year": "Fill me in please!",
                                                        "journal": "Fill me in please!",
                                                        "link": "Fill me in please!",
                                                        "full_citation": "Fill me in please!"
                                                       }})
        all_entries_dict[entry.name] = entry
    # Use the resolved description (with its fallback), not the raw argument, which may be None.
    catalog = Catalog(name = cat_name, description = cat_description, metadata = catalog_metadata_combined, entries = all_entries_dict)
    return catalog

def _source_file_missing(source):
    """Return True when `source.discover()` fails with a "No such file or directory:" error."""
    try:
        source.discover()
    except Exception as error:
        return "No such file or directory:" in str(error)
    return False


def _is_placeholder_entry(entry, placeholder="Fill me in please!"):
    """Return True when the entry's description or full citation is still the placeholder."""
    described = entry.describe()
    citation = (described.get("metadata") or {}).get("citation")
    # Hand-edited catalogs may lack citation metadata, or store it in another shape.
    full_citation = citation.get("full_citation") if isinstance(citation, dict) else None
    return described.get("description") == placeholder or full_citation == placeholder


def combine_catalogs(old_cat_path, new_cat, attrs_priority_dict={"name": "old",
                                                                 "description": "old",
                                                                 "metadata": "old",
                                                                 }):
    """Merge a freshly generated catalog into the catalog stored at `old_cat_path`.

    Parameters
    ----------
    old_cat_path
        Location of the existing catalog (opened through `IntakeCatalogWrapper`).
    new_cat
        Catalog whose entries should be merged in.
    attrs_priority_dict
        For each of `name`, `description` and `metadata`, either "old" or "new",
        selecting which catalog supplies that attribute of the result.

    Returns
    -------
    Catalog
        Old entries whose file is still discoverable, plus every new entry that
        is either absent from the old catalog or whose old counterpart was
        never filled in.

    Raises
    ------
    AssertionError
        If `attrs_priority_dict` lacks one of the three required keys.
    ValueError
        If a priority value is neither "old" nor "new".
    """
    required_attrs = ["name", "description", "metadata"]
    assert all(key in attrs_priority_dict for key in required_attrs), \
        f"attrs_priority_dict must contain the keys {required_attrs}"

    old_cat = IntakeCatalogWrapper(old_cat_path).catalog

    # Drop old entries whose underlying file no longer exists.
    old_entries_dict = {key: entry for key, entry in old_cat._entries.items()
                        if not _source_file_missing(old_cat[key])}

    # Take a new entry when the old catalog has no (surviving) entry for it, or
    # when the old entry was never filled in. The check is made on the OLD
    # entry: looking at the new one would always match for generated catalogs
    # and overwrite hand-curated descriptions with placeholders.
    new_entries_dict = {}
    for key, entry in new_cat._entries.items():
        old_entry = old_entries_dict.get(key)
        if old_entry is None or _is_placeholder_entry(old_entry):
            new_entries_dict[key] = entry

    print(f"Added {len(new_entries_dict.keys())} new entries: {list(new_entries_dict.keys())}")
    combined_entries_dict = {**old_entries_dict, **new_entries_dict}

    catalogs_by_priority = {"old": old_cat, "new": new_cat}
    extra_cat_kwargs = {}
    for key, item in attrs_priority_dict.items():
        if item not in catalogs_by_priority:
            raise ValueError(f"Priority for `{key}` must be 'old' or 'new', got {item!r}")
        # Read the attribute named by `key`; previously "metadata" was read for
        # every key, so name and description were set to the metadata dict.
        extra_cat_kwargs[key] = catalogs_by_priority[item].__dict__.get(key)

    combined_cat = Catalog.from_dict(combined_entries_dict, **extra_cat_kwargs)
    return combined_cat
    
def handle_file_name_collision(path: Path, mode: Literal["error", "overwrite", "append"], content, append_func= None):
    """Decide what should be written when `path` already exists.

    Parameters
    ----------
    path
        The existing file that collides with the new content.
    mode
        "error" refuses to continue, "overwrite" keeps `content` as is,
        "append" delegates to `append_func(path, content)`.
    content
        The new content that was about to be written.
    append_func
        Callable used in "append" mode to merge the file at `path` with `content`.

    Returns
    -------
    The content that should end up in the file. Nothing is written here.

    Raises
    ------
    ValueError
        In "error" mode, or in "append" mode without an `append_func`.
    KeyError
        For any other mode.
    """
    if mode == "error":
        raise ValueError(f"{path.as_posix()} is an existing file and current mode is `error`. please remove or switch to `append` or `overwrite` mode")
    if mode == "overwrite":
        return content
    if mode != "append":
        raise KeyError(f"cannot proceed in mode {mode}")
    if not append_func:
        raise ValueError("Mode is `append`, need to provide both `append_func`")
    return append_func(path, content)

def write_catalog_content(path,cat):
    """Write `cat.serialize()` to `path`, replacing any existing file.

    Parameters
    ----------
    path
        Destination file.
    cat
        Object exposing a `serialize()` method that returns text.
    """
    # Serialize before opening: opening in "w" mode truncates the file, so a
    # failing `serialize()` would otherwise wipe an existing catalog.
    serialized = cat.serialize()
    with open(path, "w") as file:
        file.write(serialized)
    
    

def safe_write_data_catalog(catalog, cat_path, same_name_cat_found: Literal["error", "overwrite", "append"] = "error", **kwargs):
    """Write `catalog` to `cat_path`, resolving a clash with an existing file first.

    Parameters
    ----------
    catalog
        Catalog to write.
    cat_path
        Destination file.
    same_name_cat_found
        Collision policy passed to `handle_file_name_collision` when the
        destination already exists; "append" merges through `combine_catalogs`.
    **kwargs
        Accepted but not used by this function.
    """
    existing_file = Path(cat_path).absolute()
    content_to_write = catalog
    if existing_file.is_file():
        print(f"Found existing file at {existing_file}. Mode is `{same_name_cat_found}`")
        content_to_write = handle_file_name_collision(existing_file,
                                                      mode = same_name_cat_found,
                                                      content = catalog,
                                                      append_func = combine_catalogs)
    write_catalog_content(cat_path, content_to_write)
    print(f"Finished writing new catalog at {cat_path}")

In [None]:
# Local folders used by the demo cells below.
# NOTE(review): hard-coded absolute local paths; these cells are not portable across machines.
test_hg19_HiC_database_path = "/Users/ahoang/Documents/Work/webapp/HIC_database/hg19/"
test_hg38_HiC_database_path = "/Users/ahoang/Documents/Work/webapp/HIC_database/hg38/"

In [None]:
# Build a single catalog entry from the first file matching "*hg38*" and inspect it.
# The glob is evaluated twice (once for the name, once for the urlpath).
test_entry = LocalCatalogEntry(name = list(Path(test_hg38_HiC_database_path).glob("*hg38*"))[0].stem, driver = "csv", args = {"urlpath": list(Path(test_hg38_HiC_database_path).glob("*hg38*"))[0].as_posix()}, description = "Fill me in please!")
test_entry.describe()

{'name': 'PLAC_hg38_microglia',
 'container': 'dataframe',
 'plugin': ['csv'],
 'driver': ['csv'],
 'description': 'Fill me in please!',
 'direct_access': True,
 'user_parameters': [],
 'metadata': {},
 'args': {'urlpath': '/Users/ahoang/Documents/Work/webapp/HIC_database/hg38/PLAC_hg38_microglia'}}

In [None]:
# Generate a placeholder catalog with one entry per "*hg38*" file and display it.
test_data_catalog = generate_data_catalog(path = test_hg38_HiC_database_path, glob_string= "*hg38*", catalog_name = "HiC_database_generated")
test_data_catalog

HiC_database_generated:
  args:
    entries:
      PCHIC_Geschwind_NeuN+_hg38: !!python/object:intake.catalog.local.LocalCatalogEntry
        args: []
        cls: intake.catalog.local.LocalCatalogEntry
        kwargs:
          name: PCHIC_Geschwind_NeuN+_hg38
          description: Fill me in please!
          driver: csv
          args:
            csv_kwargs: &id001
              sep: "\t"
            urlpath: /Users/ahoang/Documents/Work/webapp/HIC_database/hg38/PCHIC_Geschwind_NeuN+_hg38
          metadata:
            citation:
              citation: Fill me in please!
              first_author_lastname: Fill me in please!
              journal: Fill me in please!
              link: Fill me in please!
              paper_name: Fill me in please!
              published_year: Fill me in please!
      PCHIC_Geschwind_NeuN-_hg38: !!python/object:intake.catalog.local.LocalCatalogEntry
        args: []
        cls: intake.catalog.local.LocalCatalogEntry
        kwargs:
          n

In [None]:
!rm -rf test.yaml
# First write: no file exists yet, so the catalog is written without a collision check.
# `catalog_name` is absorbed by `**kwargs` of `safe_write_data_catalog` and has no effect.
safe_write_data_catalog(catalog = test_data_catalog, cat_path = "test.yaml", catalog_name = "HiC_database_generated")

Finished writing new catalog at test.yaml


#### Test name collision handling

In [None]:
# Default mode is `error`: writing over an existing test.yaml is expected to raise ValueError.
safe_write_data_catalog(catalog = test_data_catalog, cat_path = "test.yaml", catalog_name = "HiC_database_generated")

Found existing file at /Users/ahoang/Documents/Work/webapp/lab-central-webapp/notebooks/package/ChIP_peaks_merge/test.yaml. Mode is `error`


ValueError: /Users/ahoang/Documents/Work/webapp/lab-central-webapp/notebooks/package/ChIP_peaks_merge/test.yaml is an existing file and current mode is `error`. please remove or switch to `append` or `overwrite` mode

In [None]:
# `append` mode: the existing file is merged with the new catalog via `combine_catalogs`
# before being written back.
safe_write_data_catalog(catalog = test_data_catalog,
                        cat_path = "test.yaml",
                        catalog_name = "HiC_database_generated", 
                        same_name_cat_found= "append")

Found existing file at /Users/ahoang/Documents/Work/webapp/lab-central-webapp/notebooks/package/ChIP_peaks_merge/test.yaml. Mode is `append`
Added 6 new entries: ['PLAC_hg38_microglia', 'PCHIC_per_tissue_Neural_Progenitor_Cell_hg38', 'PCHIC_Geschwind_NeuN+_hg38', 'PCHIC_Geschwind_NeuN-_hg38', 'PCHIC_per_tissue_Dorsolateral_Prefrontal_Cortex_hg38', 'PCHIC_per_tissue_Hippocampus_hg38']
Finished writing new catalog at test.yaml


In [None]:
# `overwrite` mode: the existing file is replaced by the new catalog as is.
safe_write_data_catalog(catalog = test_data_catalog,
                        cat_path = "test.yaml",
                        catalog_name = "HiC_database_generated", 
                        same_name_cat_found= "overwrite")

Found existing file at /Users/ahoang/Documents/Work/webapp/lab-central-webapp/notebooks/package/ChIP_peaks_merge/test.yaml. Mode is `overwrite`
Finished writing new catalog at test.yaml
