In [17]:
import glob
import os
import pandas as pd
import numpy as np
import re
import datetime
import pathlib

# custom package(s)
from utils import utils, synapse_login

# Log in through the project's synapse_login helper and keep its return value as `syn`.
syn = synapse_login.main()

# Location of the data model CSV and the directory handed to the backup helper.
dm_name = "../EL.data.model.csv"
output_path = "../backups"
# NOTE(review): presumably reads dm_name into a dataframe and writes a backup copy
# under output_path — confirm against utils.load_and_backup_dm.
data_model = utils.load_and_backup_dm(dm_name, output_path)


UPGRADE AVAILABLE

A more recent version of the Synapse Client (3.0.0) is available. Your version (2.7.0) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 3.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Nicholas Lee!



In [18]:
# Order the rows by Parent, then Attribute, and renumber the index from 0.
data_model = data_model.sort_values(by=["Parent", "Attribute"]).reset_index(drop=True)

# Glob pattern matching the partition CSVs that can be joined back into one model.

partition_path = "../models/partitions/*.csv"

In [19]:
def create_data_model_partitions(data_model, partition_path):
    """Create partitions from the data model based on the Parent names

    One CSV is written per unique value of the "Parent" column. Each file is
    named "<Parent>.csv", is placed inside ``partition_path`` and contains
    only the rows of that Parent, with the index reset and not written out.

    Args:
        data_model (object): dataframe that contains the data model; must
            have a "Parent" column
        partition_path (str): directory to store the partitioned data model as CSVs;
            created (including missing parent directories) when it does not exist
    """
    # os.path has no mkdir(); makedirs with exist_ok=True creates the directory
    # on first use and is a no-op when it is already there.
    os.makedirs(partition_path, exist_ok=True)

    # Split by Parent and write each partition straight to its own file
    for parent in data_model["Parent"].unique():
        partition = data_model.loc[data_model["Parent"] == parent].reset_index(
            drop=True
        )
        partition.to_csv(pathlib.Path(partition_path, parent + ".csv"), index=False)

In [20]:
def join_data_model_partitions(partition_path):
    """Join the partitions back together to form the data model used in DCA

    Every CSV matched by ``partition_path`` is read once, the frames are
    concatenated, sorted by "Parent" then "Attribute", re-indexed from 0 and
    missing values are replaced with empty strings. A summary of the joined
    frame is printed via ``DataFrame.info()``.

    Args:
        partition_path (str): glob pattern matching the partition CSVs
            (e.g. "../models/partitions/*.csv")

    Returns:
        object: pandas dataframe holding the joined data model

    Raises:
        ValueError: if the pattern matches no files
    """
    # Sort the matches so the concatenation order does not depend on the
    # arbitrary order in which glob lists the directory.
    partition_files = sorted(glob.glob(partition_path))
    if not partition_files:
        raise ValueError(f"No partition files match {partition_path!r}")

    data_model_full = (
        pd.concat([pd.read_csv(csv_file) for csv_file in partition_files])
        .sort_values(by=["Parent", "Attribute"])
        .reset_index(drop=True)
        .fillna("")
    )

    data_model_full.info()

    # Return the frame built here, not a same-named variable from the
    # enclosing notebook namespace.
    return data_model_full

In [21]:
def write_out_data_model(data_model, file_path):
    """Write the data model to a CSV file with one row per Attribute

    Rows that repeat an "Attribute" value are dropped (the first occurrence
    is kept), the index is reset and the result is written without the index
    column. The dataframe passed in is left untouched.

    Args:
        data_model (object): dataframe that contains the data model; must
            have an "Attribute" column
        file_path (str): destination path of the CSV file
    """
    # Work on a derived frame so that writing a file does not silently drop
    # rows from the caller's dataframe.
    deduplicated = data_model.drop_duplicates(subset=["Attribute"]).reset_index(
        drop=True
    )
    deduplicated.to_csv(file_path, index=False)

In [22]:
def main():
    """Load the data model (with a backup) and split it into partition CSVs"""
    data_model_name = "../EL.data.model.csv"
    partition_path = "../models/partitions/"
    # Join only the file name onto the backup directory: joining the relative
    # "../EL.data.model.csv" would give "../backups/../EL.data.model.csv",
    # which resolves back to the data model file itself.
    backup_path = os.path.join("../backups", os.path.basename(data_model_name))

    # Load data model
    data_model_path = pathlib.Path(data_model_name).resolve()
    backup_path = pathlib.Path(backup_path).resolve()
    # NOTE(review): the first cell of this notebook calls utils.load_and_backup_dm
    # with a backup *directory*; confirm that load_and_backup_data_model exists
    # in utils and that it expects a file path as its second argument.
    data_model = utils.load_and_backup_data_model(data_model_path, backup_path)

    # create partitions
    create_data_model_partitions(data_model, partition_path)

In [23]:
# Point partition_path at the partitions directory (not a glob pattern) and write
# one CSV per Parent of the current data model into it.
partition_path = "../models/partitions/"
create_data_model_partitions(data_model, partition_path)

In [None]:
# Manually updating data model
data_model.loc[
    data_model["Parent"] == "Template", "Description"
] = "Template used for contributing metadata to the ELITE portal"

data_model.loc[
    data_model["Attribute"] == "bsSeq (bisulfite-seq WGBS methylseq methylomics)",
    ["Attribute", "Description"],
] = ["bsSeq", "bisulfite-seq WGBS methylseq methylomics"]

In [None]:
# Trying to see if having this extra column helps trace dependencies
# For every attribute, "UsedIn" lists the Template rows whose DependsOn names it.
data_model["UsedIn"] = ""

for _, template_row in data_model.loc[data_model["Parent"] == "Template"].iterrows():
    depends_on_raw = template_row["DependsOn"]
    # A missing DependsOn is not a string (e.g. NaN) and names nothing
    if not isinstance(depends_on_raw, str):
        continue
    # Strip each entry so that "a, b" matches attribute "b" and not " b"
    depends_on = [name.strip() for name in depends_on_raw.split(",") if name.strip()]
    data_model.loc[data_model["Attribute"].isin(depends_on), "UsedIn"] += (
        "," + template_row["Attribute"]
    )

# Drop the leading comma / empty entries left by the accumulation above
data_model["UsedIn"] = (
    data_model["UsedIn"]
    .str.split(",")
    .apply(lambda parts: ",".join([part.strip() for part in parts if len(part) > 0]))
)

In [None]:
# Write the edited data model back to the CSV it was loaded from (dm_name).
write_out_data_model(data_model, dm_name)

In [None]:
# Only keep five most recent backups
# The paths are ordered by name, so the five that sort last are the ones kept
# (assumes backup file names sort chronologically — TODO confirm).
backup_files = glob.glob("../backups/*.csv")
backup_files.sort()

# With five or fewer files the slice is empty and nothing is removed
for stale_backup in backup_files[:-5]:
    os.remove(stale_backup)

In [None]:
# Switch partition_path to a glob pattern over the partition CSVs and keep the
# result of the join helper as `dm`.
partition_path = "../models/partitions/*.csv"
dm = join_data_model_partitions(partition_path)

In [None]:
# Write `dm` out to the data model CSV.
write_out_data_model(dm, "../EL.data.model.csv")