In [37]:
"""
Name: term_page_manager.py
definition: a script to generate and delete annotation term pages
Contributors: Dan Lu
"""
# load modules
import argparse
import glob
import os

import pdb
import re

import string
from functools import partial

import frontmatter
import numpy as np
import pandas as pd

from mdutils import fileutils

import yaml

# Read the site configuration once; `config` is consulted below for the
# location of the data model CSV.
with open("./_config.yml", mode="r") as config_file:
    config = yaml.safe_load(config_file)


def get_term_info(data_model, term):
    """
    Look up the data-model rows that describe a term.

    :param data_model: the data model table; it is read through its
        Attribute, Description, Source and Module columns
    :param term: the term name, compared for equality with the Attribute column

    :returns: a list with one dictionary per matching row, each with the keys
        Description, Source and Module (an empty list when nothing matches)
    """
    wanted_columns = ["Description", "Source", "Module"]
    is_match = data_model["Attribute"] == term
    matching_rows = data_model.loc[is_match, wanted_columns]
    return matching_rows.to_dict("records")


def generate_page(data_model, term):
    """
    Function to generate term/template markdown page

    :param data_model: the data model table; the term is looked up in its
        Attribute column and its Description, Source and Module are used
    :param term: the term name

    :returns: None. A term Markdown page is written under the
        docs/<module_name> folder; when that folder does not exist yet it is
        created together with a module page

    :raises ValueError: if the term has no row in the data model
    """
    # name of the term's data file (spaces replaced with underscores)
    term_file = re.sub(" ", "_", term)

    # get term information; only the first matching row is used
    results = get_term_info(data_model, term)
    if not results:
        raise ValueError(f"Term '{term}' was not found in the data model")
    info = results[0]
    module_name = info["Module"]

    # target of the "[[Source]]" link: empty when the source is missing,
    # the Sage Bionetworks home page when the source is given by name
    source = info["Source"]
    if pd.isna(source):
        source = ""
    elif source == "Sage Bionetworks":
        source = "https://sagebionetworks.org/"

    # a term is a template when any of its rows has the module "Template"
    is_template = (
        "Template"
        in data_model.loc[data_model["Attribute"] == term, "Module"].values
    )

    if is_template:
        # load template; the title is the term split before each run of
        # capitals and title-cased, and the page gets its own permalink
        post = frontmatter.load("template_page_template.md")
        title = re.sub("([A-Z]+)", r" \1", term).title()
        post.metadata["title"] = title
        post.metadata["permalink"] = f"docs/{title}.html"
    else:
        # load template
        post = frontmatter.load("term_page_template.md")
        title = term
        post.metadata["title"] = title

    post.metadata["parent"] = module_name

    # load input data and term/template description
    post.content = (
        "{% assign mydata=site.data."
        + term_file
        + " %} \n{: .note-title } \n"
        + f">{title}\n"
        + ">\n"
        + f">{info['Description']} [[Source]]({source})\n"
        + post.content
    )

    # create directory for the module if it does not exist
    module_dir = f"docs/{module_name}/"
    if not os.path.exists(module_dir):
        os.makedirs(module_dir)
        # create a module page; template and term modules differ only in
        # their nav_order
        module = fileutils.MarkDownFile(f"docs/{module_name}/{module_name}")
        nav_order = 5 if is_template else 2
        module.append_end(
            f"--- \nlayout: page \ntitle: {module_name} \nhas_children: true \nnav_order: {nav_order} \npermalink: docs/{module_name}.html \n---"
        )

    # create file
    file = fileutils.MarkDownFile(f"docs/{module_name}/{term}")
    # add content to the file
    file.append_end(frontmatter.dumps(post))


def delete_page(term):
    """
    Remove every Markdown page matched by docs/*/*.md whose file name, up to
    its first ".", equals the given term.

    :param term: the page name to delete

    :returns: None
    """
    page_paths = glob.glob("docs/*/*.md")
    stale_paths = [
        path for path in page_paths if path.split("/")[-1].split(".")[0] == term
    ]
    for path in stale_paths:
        os.remove(path)


def main():
    """
    Bring the pages under docs/ in line with the term CSV files in _data/.

    A page is generated for every CSV name that has no page yet, and pages
    without a CSV are deleted. Names are compared with underscores replaced
    by spaces.

    :returns: None
    """

    def names_from(pattern):
        # file names matched by the pattern, cut at their first "."
        return [path.split("/")[-1].split(".")[0] for path in glob.glob(pattern)]

    def as_attribute(name):
        return re.sub("_", " ", name)

    # load data model csv file
    data_model = pd.read_csv(config["data_model"])

    # pull terms
    term_files_attr = [as_attribute(name) for name in names_from("_data/*.csv")]
    term_pages_attr = [as_attribute(name) for name in names_from("docs/*/*.md")]

    to_add = [str(name) for name in np.setdiff1d(term_files_attr, term_pages_attr)]
    to_delete = np.setdiff1d(term_pages_attr, term_files_attr).tolist()

    # generate pages for terms with the term files
    for name in to_add:
        generate_page(data_model, name)

    # delete pages for terms without the term files and exclude module and
    # template pages (since template page might be named differently from the
    # template files)
    module_names = data_model["Module"].dropna().unique().tolist()
    for name in to_delete:
        if name not in module_names and "Template" not in name:
            delete_page(name)

In [7]:
# Load the data model table from the path stored under "data_model" in the config.
data_model = pd.read_csv(config["data_model"])

In [9]:
# Example term for the scratch cells that read the global `term`.
term = "Biospecimen human"

In [10]:
# Look up the example term with the shared helper rather than a pasted copy
# of its body: one dict of Description, Source and Module per matching row.
results = get_term_info(data_model, term)

In [16]:
# Print the signature and docstring of frontmatter.load.
help(frontmatter.load)

Help on function load in module frontmatter:

load(fd, encoding='utf-8', handler=None, **defaults)
    Load and parse a file-like object or filename,
    return a :py:class:`post <frontmatter.Post>`.
    
    .. doctest::
    
        >>> post = frontmatter.load('tests/yaml/hello-world.txt')
        >>> with open('tests/yaml/hello-world.txt') as f:
        ...     post = frontmatter.load(f)



In [42]:
# Display the current value of term_file.
term_file

'Biospecimen_human'

In [43]:

    # Scratch run of generate_page()'s steps on the global `term` and
    # `data_model`; writes the term page under docs/<module_name>/.
    term_file = re.sub(" ", "_", term)

    # get term information; only the first matching row is used
    results = get_term_info(data_model, term)
    if not results:
        raise ValueError(f"Term '{term}' was not found in the data model")
    info = results[0]
    module_name = info["Module"]

    # target of the "[[Source]]" link: empty when the source is missing,
    # the Sage Bionetworks home page when the source is given by name
    source = info["Source"]
    if pd.isna(source):
        source = ""
    elif source == "Sage Bionetworks":
        source = "https://sagebionetworks.org/"

    # a term is a template when any of its rows has the module "Template"
    is_template = (
        "Template"
        in data_model.loc[data_model["Attribute"] == term, "Module"].values
    )

    if is_template:
        # load template
        post = frontmatter.load("template_page_template.md")
        title = re.sub("([A-Z]+)", r" \1", term).title()
        post.metadata["title"] = title
        post.metadata["permalink"] = f"docs/{title}.html"
    else:
        # load template
        post = frontmatter.load("term_page_template.md")
        title = term
        post.metadata["title"] = title

    post.metadata["parent"] = module_name

    # load input data and term/template description
    post.content = (
        "{% assign mydata=site.data."
        + term_file
        + " %} \n{: .note-title } \n"
        + f">{title}\n"
        + ">\n"
        + f">{info['Description']} [[Source]]({source})\n"
        + post.content
    )

    # create directory for the module if it does not exist
    module_dir = f"docs/{module_name}/"
    if not os.path.exists(module_dir):
        os.makedirs(module_dir)
        # create a module page; template and term modules differ only in
        # their nav_order
        module = fileutils.MarkDownFile(f"docs/{module_name}/{module_name}")
        nav_order = 5 if is_template else 2
        module.append_end(
            f"--- \nlayout: page \ntitle: {module_name} \nhas_children: true \nnav_order: {nav_order} \npermalink: docs/{module_name}.html \n---"
        )

    # create file
    file = fileutils.MarkDownFile(f"docs/{module_name}/{term}")
    # add content to the file
    file.append_end(frontmatter.dumps(post))



In [None]:

# Delete the page(s) named after the global `term` through the shared helper.
# The pasted, indented copy of its body could not run at cell top level.
delete_page(term)



In [38]:
# Scratch run of main()'s steps at cell top level, so that the intermediate
# names (term_files, to_add, to_delete, ...) stay available to later cells.

# load data model csv file
data_model = pd.read_csv(config["data_model"])

# pull terms
term_files = [file.split("/")[-1].split(".")[0] for file in glob.glob("_data/*.csv")]

term_files_attr = [re.sub("_", " ", t) for t in term_files]

term_pages = [file.split("/")[-1].split(".")[0] for file in glob.glob("docs/*/*.md")]

term_pages_attr = [re.sub("_", " ", t) for t in term_pages]

# plain lists (not a one-shot map iterator) so they can still be inspected
# after the loops below have run
to_add = np.setdiff1d(term_files_attr, term_pages_attr).tolist()

to_delete = np.setdiff1d(term_pages_attr, term_files_attr).tolist()

# generate pages for terms with the term files
for new_term in to_add:
    generate_page(data_model, new_term)

# delete pages for terms without the term files and exclude module and template pages (since template page might be named differently from the template files)
module_names = data_model["Module"].dropna().unique().tolist()
to_delete = [
    x
    for x in to_delete
    if x not in module_names and "Template" not in x
]

for stale_term in to_delete:
    delete_page(stale_term)

[]

In [31]:
# Show the contents of to_add as a list.
list(to_add)

['Biospecimen_nonHuman',
 'ID_mapping',
 'Individual_Human',
 'Individual_nonHuman',
 'Metabolomics_Human',
 'Whole_Genome_Sequencing',
 'analytical_covariates',
 'bsSeq_(bisulfite-seq_WGBS_methylseq_methylomics)',
 'data_dictionary',
 'ethnicity',
 'race']

In [24]:
# Alphabetical view of the names in term_files with underscores shown as spaces.
sorted(name.replace("_", " ") for name in term_files)

['Biospecimen human',
 'Biospecimen nonHuman',
 'ID mapping',
 'Individual Human',
 'Individual nonHuman',
 'Metabolomics Human',
 'Microbiome',
 'RNAseq',
 'Whole Genome Sequencing',
 'acquisitionBatchID',
 'acquisitionBatchSize',
 'acquisitionBatchSizeUnit',
 'age',
 'analysisType',
 'analytical covariates',
 'assay',
 'batchID',
 'batchLabel',
 'biospecimen',
 'bsSeq (bisulfite-seq WGBS methylseq methylomics)',
 'captivityDuration',
 'captivityStatus',
 'cohort',
 'commonName',
 'consentGroupID',
 'consortium',
 'countryCode',
 'data dictionary',
 'dataFile',
 'dataSubtype',
 'dataType',
 'diagnosis',
 'diagnosisStatus',
 'directionalBSseqLibrary',
 'dnaBatchID',
 'dnaBatchSize',
 'dnaBatchSizeUnit',
 'ethnicGroupCode',
 'ethnicity',
 'fieldCenterCode',
 'fileFormat',
 'genotyping',
 'grant',
 'individual',
 'individualID',
 'isModelSystem',
 'isMultiSpecimen',
 'isStranded',
 'libraryBatchID',
 'libraryPrep',
 'libraryPreparationMethod',
 'libraryType',
 'libraryVersion',
 'lifeSta

In [None]:
# Scratch run of the CSV refresh for a single term: rebuild the term's table
# from the data model and rewrite ./_data/<term_csv_name>.csv.
term = "dataType"

# file-safe name: whitespace and "/" become "_" (raw string keeps \s a regex
# escape rather than an invalid string escape)
term_csv_name = re.sub(r"\s|/", "_", term)
term_csv_path = os.path.join("./_data", term_csv_name + ".csv")

if "Template" in data_model.query("Attribute == @term")["Module"].values:
    # NOTE(review): get_template_keys is not defined in the visible cells;
    # it must already exist in the kernel for this branch to run.
    depends_on = get_template_keys(data_model, term)
    new = data_model.loc[data_model["Attribute"].isin(depends_on),]
    new = new[
        [
            "Attribute",
            "Description",
            "Type",
            "Valid Values",
            "DependsOn",
            "Required",
            "Source",
            "Module",
        ]
    ].reset_index(drop=True)
    new = new.rename(columns={"Attribute": "Key", "Description": "Key Description"})
    # update template file
    new.to_csv(term_csv_path, index=False)
    # report the name of the file that was actually written
    print("\033[92m {} \033[00m".format(f"Updated {term_csv_name}.csv"))
else:
    # convert dataframe to long format: one row per comma-separated valid value
    new = data_model.loc[data_model["Attribute"] == term,][
        ["Attribute", "Valid Values", "DependsOn", "Type", "Module"]
    ]
    new = (
        new.drop(columns=["Attribute", "DependsOn"])
        .set_index(["Type", "Module"])
        .apply(lambda x: x.str.split(",").explode())
        .reset_index()
    )

    # add columns
    new = new.rename(columns={"Valid Values": "Key"})

    new["Key"] = new["Key"].str.strip()

    # load existing csv
    old = pd.read_csv(term_csv_path)
    # update existing csv if Key, Type or Module column is changed
    if not (
        new["Key"].equals(old["Key"])
        and new["Type"].equals(old["Type"])
        and new["Module"].equals(old["Module"])
    ):
        # the left merge keeps every new row and takes the remaining columns
        # from the old file where Key, Type and Module all match
        updated = new.astype(str).merge(
            old.astype(str), how="left", on=["Key", "Type", "Module"]
        )
        updated["Type"] = new["Type"]
        updated["Module"] = new["Module"]
        updated = updated[["Key", "Key Description",
                           "Type", "Source", "Module"]]
        updated.to_csv(term_csv_path, index=False)
        print("\033[92m {} \033[00m".format(f"Updated {term_csv_name}.csv"))

In [None]:
# Preview the Key column with surrounding whitespace stripped.
new["Key"].str.strip()

In [None]:
# Show the data-model row(s) whose Attribute equals "grant".
term = "grant"
data_model.query("Attribute == @term")

In [None]:
# Check the file-safe name of a term containing "/": whitespace and "/" both
# become "_". The raw string keeps \s a regex escape, not a string escape.
term = "AU/ml"
re.sub(r"\s|/", "_", term)

In [None]:
# Build the CSV path from the file-safe term name. str.replace() is a literal
# replace, so "\\s" never matched whitespace; use the regex substitution.
os.path.join("./_data", re.sub(r"\s|/", "_", term) + ".csv")

In [None]:
    # Scratch run of the term CSV sync, limited to the attributes listed in
    # `term`; an empty value selects every row that has a module.
    term = ['Biospecimen_human', 'assay']

    # load data model
    data_model = pd.read_csv(config["data_model"])

    # make attribute names file-safe: whitespace and "/" become "_"
    data_model['Attribute'] = data_model['Attribute'].str.replace("\\s|/", "_", regex = True)

    # get the list of existing term csvs (names without the ".csv" part)
    files = [
        name.split(".csv")[0] for name in os.listdir("_data/") if name.endswith(".csv")
    ]

    has_module = data_model["Module"].notnull()
    if term:
        is_requested = data_model["Attribute"].isin(term)
        not_valid_value = data_model["Parent"] != "validValue"
        df = data_model.loc[has_module & (is_requested & not_valid_value)]
    else:
        df = data_model.loc[has_module,]

    # generate files when term files don't exist. Do not add files for valid values or specify because these have no useful sub values or depends on
    new_terms = df.loc[
        (~df["Attribute"].isin(files))
        & (df["Parent"] != "validValue")
        & (~df["Attribute"].str.contains("specify")),
        "Attribute",
    ].tolist()

    # NOTE(review): generate_csv and update_csv are not defined in the visible
    # cells; they must already exist in the kernel.
    for new_term in new_terms:
        generate_csv(data_model, new_term)

    # update files if the term files exist
    exist_terms = df.loc[df["Attribute"].isin(files), "Attribute"].tolist()

    for exist_term in exist_terms:
        update_csv(data_model, exist_term)

    # delete term csv if the attribute is removed from data model
    for file in files:
        if file not in data_model.Attribute.values:
            os.remove(f"_data/{file}.csv")


In [None]:
# Alphabetical view of the names in files.
sorted(files)

In [None]:
# Show the data-model rows whose Parent is "dataProperty".
data_model.query('Parent == "dataProperty"')

In [None]:
# alter specification attributes: every row whose Attribute contains
# "specify" gets the same fixed values in the columns listed below
is_specify = data_model["Attribute"].str.contains("specify")
specify_overrides = {
    "Parent": "Other",
    "Module": "Other",
    "Description": "Value is determined by the data contributor",
    "Type": "String",
    "Source": "Sage Bionetworks",
}
for column, value in specify_overrides.items():
    data_model.loc[is_specify, column] = value

In [None]:
# Save the edited data model. index=False keeps the row index out of the
# file, as the other to_csv calls in this notebook do, so reading it back
# with pd.read_csv does not gain an extra unnamed column.
data_model.to_csv("EL.data.model.csv", index=False)