In [96]:
import glob
import os
import pandas as pd
import numpy as np
import re
import datetime
import pathlib

# custom package(s)
from utils import utils, synapse_login

# Log in through the project's Synapse helper and keep the returned handle.
syn = synapse_login.main()

dm_name = "../EL.data.model.csv"
output_path = "../backups"

# Load the data model through the project helper (presumably it also writes a
# backup under `output_path` -- confirm in utils), then order the rows by
# module and attribute and renumber the index from 0.
data_model = (
    utils.load_and_backup_dm(dm_name, output_path)
    .sort_values(by=["Module", "Attribute"])
    .reset_index(drop=True)
)

# Join Models together to create one model

# Glob pattern that matches every partition CSV.
partition_path = "../models/partitions/*.csv"


UPGRADE AVAILABLE

A more recent version of the Synapse Client (3.0.0) is available. Your version (2.7.2) can be upgraded by typing:
    pip install --upgrade synapseclient

Python Synapse Client version 3.0.0 release notes

https://python-docs.synapse.org/build/html/news.html



Welcome, Nicholas Lee!



In [87]:
def create_data_model_partitions(data_model, partition_path):
    """Create partitions from the data model based on the module names

    One CSV named ``<module>.csv`` is written per distinct value of the
    "Module" column. Rows whose "Module" is missing are skipped because
    they have no usable file name.

    Args:
        data_model (object): dataframe that contains the data model; must
            have a "Module" column
        partition_path (str): directory to store the partitioned data model
            as CSVs; created (including parents) if it does not exist
    """
    out_dir = pathlib.Path(partition_path)
    # parents/exist_ok make this safe both on a first run and on re-runs.
    out_dir.mkdir(parents=True, exist_ok=True)

    # sort=False keeps modules in order of first appearance.
    for module_name, module_rows in data_model.groupby("Module", sort=False):
        # Write into the requested directory, with a fresh 0..n-1 index.
        module_rows.reset_index(drop=True).to_csv(
            out_dir / f"{module_name}.csv", index=False
        )

In [101]:
def join_data_model_partitions(partition_path):
    """Join the partitions back together to form the data model used in DCA

    Args:
        partition_path (str): glob pattern matching the partition CSVs,
            e.g. "../models/partitions/*.csv"

    Returns:
        object: pandas dataframe with the rows of every matched CSV, sorted
            by "Module" then "Attribute", re-indexed from 0, and with
            missing values replaced by empty strings

    Raises:
        ValueError: if no file matches ``partition_path``
    """
    # Sort the matches so the concatenation order does not depend on the
    # order in which the filesystem happens to list the files.
    partition_files = sorted(glob.glob(partition_path))
    if not partition_files:
        raise ValueError(f"No partition files match {partition_path!r}")

    data_model_full = (
        pd.concat([pd.read_csv(csv_file) for csv_file in partition_files])
        .sort_values(by=["Module", "Attribute"])
        .reset_index(drop=True)
        .fillna("")
    )

    # Print a column/dtype summary so the result can be checked in the notebook.
    data_model_full.info()

    # Return the frame built here, not a same-named variable from the notebook.
    return data_model_full

In [89]:
def write_out_data_model(data_model, file_path):
    """Write the data model to a CSV, keeping one row per attribute

    When several rows share the same "Attribute", only the first one is
    written. The frame passed in is left untouched, so re-running a cell
    that calls this function does not change notebook state.

    Args:
        data_model (object): dataframe that contains the data model; must
            have an "Attribute" column
        file_path (str): destination CSV path; overwritten if it exists
    """
    deduplicated = data_model.drop_duplicates(subset=["Attribute"]).reset_index(
        drop=True
    )
    deduplicated.to_csv(file_path, index=False)

In [90]:
def main():
    """Load the data model via the project helper and split it into partitions

    The data model is read from "../EL.data.model.csv", a backup location
    under "../backups" is handed to the loader, and one CSV per module is
    written to "../models/partitions/".

    NOTE(review): another cell in this notebook calls the loader as
    ``utils.load_and_backup_dm(dm_name, output_path)`` with a backup
    *directory*. Confirm which helper name and argument form is current.
    """
    data_model_name = "../EL.data.model.csv"
    partition_path = "../models/partitions/"

    # Load data model
    data_model_path = pathlib.Path(data_model_name).resolve()
    # Join only the file name onto the backup directory. Joining the full
    # relative name produced "../backups/../EL.data.model.csv", which resolves
    # to the data model file itself rather than to a file under backups.
    backup_path = pathlib.Path("../backups", data_model_path.name).resolve()
    data_model = utils.load_and_backup_data_model(data_model_path, backup_path)

    # create partitions
    create_data_model_partitions(data_model, partition_path)

In [91]:
# Manually updating data model

# Every row of the "Template" module receives the same description.
data_model.loc[data_model["Module"].eq("Template"), "Description"] = (
    "Template used for contributing metadata to the ELITE portal"
)

# Shorten the verbose bsSeq attribute name; the parenthetical text becomes
# the row's description instead.
data_model.loc[
    data_model["Attribute"].eq("bsSeq (bisulfite-seq WGBS methylseq methylomics)"),
    ["Attribute", "Description"],
] = ["bsSeq", "bisulfite-seq WGBS methylseq methylomics"]

In [92]:
# Trying to see if having this extra column helps trace dependencies
# "UsedIn" lists, for each attribute, the "Template" rows whose DependsOn
# names that attribute. Resetting it first keeps this cell re-runnable.
data_model["UsedIn"] = ""

for i, d in data_model.loc[data_model["Module"] == "Template"].iterrows():
    attr = d["Attribute"]
    raw_depends_on = d["DependsOn"]
    # An empty CSV cell is read as NaN (a float), which has no .split().
    if not isinstance(raw_depends_on, str):
        continue
    # Strip each name: splitting "a, b" on "," yields " b", which would never
    # equal an Attribute value and so would silently drop that dependency.
    dependsOn = [
        name.strip() for name in raw_depends_on.split(",") if name.strip()
    ]
    data_model.loc[data_model["Attribute"].isin(
        dependsOn), "UsedIn"] += "," + attr

# Each append above prefixes a comma; rebuild the value without empty pieces.
data_model["UsedIn"] = (
    data_model["UsedIn"]
    .str.split(",")
    .apply(lambda x: ",".join([y.strip() for y in x if len(y) > 0]))
)

In [95]:
# Save the edited model to `dm_name`; rows repeating an "Attribute" are
# dropped by the writer.
write_out_data_model(data_model=data_model, file_path=dm_name)

In [93]:
# List the backup CSVs in name order.
backup_files = sorted(glob.glob("../backups/*.csv"))

# Everything except the last five entries. Slicing already yields [] when
# there are five or fewer files, so `to_delete` is always defined and can
# never be a stale value left over from an earlier run.
# assumes backup file names sort oldest-to-newest -- TODO confirm
# NOTE(review): nothing in this cell removes the files in `to_delete`.
to_delete = backup_files[:-5]

In [102]:
# Glob pattern for the partition CSVs, handed to the join helper.
partition_path = "../models/partitions/*.csv"
dm = join_data_model_partitions(partition_path=partition_path)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Attribute            647 non-null    object
 1   Description          647 non-null    object
 2   Valid Values         647 non-null    object
 3   DependsOn            647 non-null    object
 4   Properties           647 non-null    object
 5   Required             647 non-null    object
 6   Parent               647 non-null    object
 7   DependsOn Component  647 non-null    object
 8   Source               647 non-null    object
 9   Validation Rules     647 non-null    object
 10  Module               647 non-null    object
 11  Type                 647 non-null    object
 12  Ontology             647 non-null    object
dtypes: object(13)
memory usage: 65.8+ KB


In [None]:
# Write `dm` to the data model CSV.
write_out_data_model(data_model=dm, file_path="../EL.data.model.csv")