## Case-study transformation

### Import data and prepare source schemas

In [1]:
%%time
###################################################################################################
### IMPORTS AND CONSTANTS
###################################################################################################
from time import gmtime, strftime
from datetime import date, datetime
import sys
# Record the interpreter version and the run time (UTC, via `gmtime`) for provenance
print(sys.version)
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

import warnings
# No category or module is given, so this filter suppresses every warning raised from here on
warnings.filterwarnings('ignore')

import whyqd as qd

# Path of the destination schema; passed to `qd.SchemaDefinition(source=...)` in the import cell
SCHEMA_DESTINATION_PATH = "whyqd_schemas/fairtracks-convenience-schema.SCHEMA"
# Path of the multi-sheet workbook the source data is read from
DATASOURCE_PATH = "whyqd_schemas/Meta-Data 49 Diatom Protein Coding Gene Annotations.xlsx"
# Mimetype passed to `derive_model` together with DATASOURCE_PATH
MIMETYPE = "xlsx"
# Filled by the import cell: lower-cased sheet name -> source schema derived for that sheet
SCHEMA_SOURCE = {}
# Filled by the import cell: lower-cased sheet name -> data source model for that sheet
DATA_SOURCE = {}

3.11.4 (main, Nov  7 2023, 16:51:51) [GCC 11.4.0]
2025-01-07 09:27:36


2025-01-07 10:27:46,345	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


CPU times: user 850 ms, sys: 2.36 s, total: 3.21 s
Wall time: 10.4 s


In [24]:
%%time
###################################################################################################
### IMPORT SOURCES
###################################################################################################
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Load the destination schema that every sheet will be mapped onto
SCHEMA_DESTINATION = qd.SchemaDefinition(source=SCHEMA_DESTINATION_PATH)
# Derive the data source models from the workbook (one entry per sheet)
datasource = qd.DataSourceDefinition()
datasource.derive_model(source=DATASOURCE_PATH, mimetype=MIMETYPE)
for sheet_source in datasource.get:
    # The lower-cased sheet name is the shared key for both lookup tables
    sheet_key = sheet_source.sheet_name.lower()
    # Keep each sheet separate so it can be transformed independently
    DATA_SOURCE[sheet_key] = sheet_source
    # Derive a source schema, named after the sheet, from the sheet's own data
    schema_source = qd.SchemaDefinition(source={"name": sheet_key})
    schema_source.derive_model(data=sheet_source)
    SCHEMA_SOURCE[sheet_key] = schema_source

# Display the derived source schemas
SCHEMA_SOURCE

2025-01-07 09:54:36
Failed to delete /tmp/systemd-private-305f46beac1e464c9a3b33c389efa586-systemd-logind.service-YD8DhN. Reason: [Errno 13] Permission denied: '/tmp/systemd-private-305f46beac1e464c9a3b33c389efa586-systemd-logind.service-YD8DhN'
Failed to delete /tmp/.X11-unix. Reason: [Errno 30] Read-only file system: 'X0'
Failed to delete /tmp/.XIM-unix. Reason: [Errno 1] Operation not permitted: '/tmp/.XIM-unix'
Failed to delete /tmp/.Test-unix. Reason: [Errno 1] Operation not permitted: '/tmp/.Test-unix'
Failed to delete /tmp/systemd-private-305f46beac1e464c9a3b33c389efa586-systemd-resolved.service-XKhcBt. Reason: [Errno 13] Permission denied: '/tmp/systemd-private-305f46beac1e464c9a3b33c389efa586-systemd-resolved.service-XKhcBt'
Failed to delete /tmp/systemd-private-305f46beac1e464c9a3b33c389efa586-systemd-timesyncd.service-yv0SRe. Reason: [Errno 13] Permission denied: '/tmp/systemd-private-305f46beac1e464c9a3b33c389efa586-systemd-timesyncd.service-yv0SRe'
Failed to delete /tmp/.I

2025-01-07 10:54:44,226	INFO worker.py:1821 -- Started a local Ray instance.


CPU times: user 960 ms, sys: 609 ms, total: 1.57 s
Wall time: 26.6 s


{'input_data_accession_numbers': Schema: `input_data_accession_numbers`,
 'entap_functional_annotation_sum': Schema: `entap_functional_annotation_sum`,
 'software_and_container_versions': Schema: `software_and_container_versions`,
 'annotations_descriptive_statist': Schema: `annotations_descriptive_statist`,
 'repeats': Schema: `repeats`,
 'busco_scores': Schema: `busco_scores`,
 'omark_scores': Schema: `omark_scores`,
 'busco_version_dependencies': Schema: `busco_version_dependencies`}

### Prepare crosswalks for each source schema

The joining field across the useable sheets is some variation of `Species name`. However, while useable data are scattered throughout the sheets, only a subset is addressable via a `whyqd` crosswalk, while the rest would need to be added as a `NEW` action.

Useable schemas:
- input_data_accession_numbers
- entap_functional_annotation_sum
- annotations_descriptive_statist
- repeats
- busco_scores
- omark_scores

In [22]:
%%time
###################################################################################################
### GET THE EXISTING CROSSWALKS
###################################################################################################
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# The saved crosswalk files are numbered sheet1 .. sheet4
CROSSWALK_SOURCES = [f"whyqd_schemas/sheet{n+1}.CROSSWALK" for n in range(4)]
for CROSSWALK_SOURCE in CROSSWALK_SOURCES:
    # Header line naming the file, followed by one line per action script it contains
    print(f"-----{CROSSWALK_SOURCE}-----")
    crosswalk = qd.CrosswalkDefinition(crosswalk=CROSSWALK_SOURCE)
    for action in crosswalk.actions.get_all():
        print(action.script)

2025-01-07 09:41:52
-----whyqd_schemas/sheet1.CROSSWALK-----
SELECT > 'local_id' < ['Species name']
SELECT > 'track_assembly_id' < ['Genome assembly accession number']
NEW > 'track_feature' < ['http://purl.obolibrary.org/obo/SO_0000704']
COLLATE > 'experiment_evidence_global_id' < ['RNAseq Libraries used for annotation','Legacy genome assembly accession number or protein source (protein sequence source used for annotation)','OrthoDB Proteins']
COLLATE > 'experiment_annotation_method_name' < ['Annotation method']
COLLATE > 'study_publication' < ['Data Zenodo DOI','Publication with Methods Descripton']
SELECT > 'track_file_url' < ['Data File on Zenodo']
SELECT > 'track_file_name' < ['Data File on Zenodo']
NEW > 'study_study_name' < ['Annotation of protein-coding genes in 49 diatom genomes from the Bacillariophyta clade']
NEW > 'study_contact_name' < ['Katharina Hoff']
NEW > 'study_contact_email' < ['katharina.hoff@uni-greifswald.de']
NEW > 'study_contact_orcid' < ['orcid:0000-0002-7333-8

In [133]:
%%time
###################################################################################################
### PREPARE AND RUN CROSSWALKS
###################################################################################################
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Transformed tables, appended in the iteration order of CROSSWALK_SCRIPTS.
# NOTE: the merge cell reads this list by position (0..3), so the key order below matters.
TRANSFORM_DATA = []
# Describe the crosswalks
# Keys are lower-cased sheet names (they index SCHEMA_SOURCE and DATA_SOURCE in the loop below).
# Each term has the form `ACTION > 'destination field' < [source columns or a literal value]`:
#   SELECT  - fill the destination field from the named source column
#   NEW     - fill the destination field with the given static value
#   COLLATE - gather the named source columns into an array-valued destination field
CROSSWALK_SCRIPTS = {
    "input_data_accession_numbers": [
        "SELECT > 'local_id' < ['Species name']",
        "SELECT > 'track_assembly_id' < ['Genome assembly accession number']",
        "NEW > 'track_feature' < ['http://purl.obolibrary.org/obo/SO_0000704']",
        "COLLATE > 'experiment_evidence_global_id' < ['RNAseq Libraries used for annotation','Legacy genome assembly accession number or protein source (protein sequence source used for annotation)','OrthoDB Proteins']",
        "COLLATE > 'experiment_annotation_method_name' < ['Annotation method']",
        "COLLATE > 'study_publication' < ['Data Zenodo DOI','Publication with Methods Descripton']",
        "SELECT > 'track_file_url' < ['Data File on Zenodo']",
        "SELECT > 'track_file_name' < ['Data File on Zenodo']",
        "NEW > 'study_study_name' < ['Annotation of protein-coding genes in 49 diatom genomes from the Bacillariophyta clade']",
        "NEW > 'study_contact_name' < ['Katharina Hoff']",
        "NEW > 'study_contact_email' < ['katharina.hoff@uni-greifswald.de']",
        "NEW > 'study_contact_orcid' < ['orcid:0000-0002-7333-8390']",
        "NEW > 'experiment_technique' < ['http://edamontology.org/operation_0362']",
        "NEW > 'track_file_format' < ['http://edamontology.org/format_1975']",
    ],
    "entap_functional_annotation_sum": [
        # This sheet names its species column `Species` rather than `Species name`
        "SELECT > 'local_id' < ['Species']",
        "NEW > 'experiment_compute_protocol_description' < ['https://docs.google.com/spreadsheets/d/1_1wKibSXbi8MQCTqpK03ytuQH71yn28xKuRS1i-mVSw/edit?gid=52185137#gid=52185137']",
    ],
    # `busco_scores` and `omark_scores` both write `track_evaluation_method_value`;
    # the two columns are combined into one array in the merge cell.
    "busco_scores": [
        "SELECT > 'local_id' < ['Species name']",
        "COLLATE > 'track_evaluation_method_value' < ['BUSCO scores genome level','BUSCO scores protein level']",
    ],
    "omark_scores": [
        "SELECT > 'local_id' < ['Species name']",
        "COLLATE > 'track_evaluation_method_value' < ['OMArk sum']",
    ],
}

for sheet_key, script_terms in CROSSWALK_SCRIPTS.items():
    # Build the crosswalk from this sheet's derived schema to the destination schema
    crosswalk = qd.CrosswalkDefinition()
    crosswalk.set(schema_source=SCHEMA_SOURCE[sheet_key], schema_destination=SCHEMA_DESTINATION)
    crosswalk.actions.add_multi(terms=script_terms)
    # Run the crosswalk against the matching sheet and keep the resulting table
    transform = qd.TransformDefinition(crosswalk=crosswalk, data_source=DATA_SOURCE[sheet_key])
    transform.process()
    TRANSFORM_DATA.append(transform.data)

2025-01-07 11:13:01
CPU times: user 863 ms, sys: 290 ms, total: 1.15 s
Wall time: 4.39 s


### Merge the data into a single table

`track_evaluation_method_value` is required to be an array (hence the `COLLATE` action). However, both `busco_scores` and `omark_scores` produce a column with this name, so the two cannot simply be merged. We need to treat this as a special case:

- Restructure the `omark_scores` field, since this has only a single value, but is structured as `[[x1], [x2], [x3], ...]`.
- Concatenate it with the `busco_scores` column and update the original column.
- Assign a new `track_evaluation_method_name` with the static value `['BUSCO scores genome level','BUSCO scores protein level', 'OMArk sum']`

In [134]:
import numpy as np
import modin.pandas as pd
from itertools import chain

# 1. Merge `input_data_accession_numbers` and `entap_functional_annotation_sum`
df = pd.merge(TRANSFORM_DATA[0], TRANSFORM_DATA[1], how="outer", on="local_id")

# 2. Combine `busco_scores` and `omark_scores` into one array per species, then merge
# Work on a copy so `TRANSFORM_DATA[2]` stays untouched: writing the combined array back into
# it made this cell non-idempotent (each re-run appended the OMArk value to the array again).
evaluation_scores = TRANSFORM_DATA[2].copy()
# `omark_scores` holds one-element lists (`[[x1], [x2], ...]`). Index them by species so the
# values are paired on `local_id` instead of relying on both sheets sharing the same row order.
omark_by_species = dict(
    zip(
        TRANSFORM_DATA[3]["local_id"].tolist(),
        TRANSFORM_DATA[3]["track_evaluation_method_value"].tolist(),
    )
)
# Append the OMArk value to the two BUSCO values; a species without an OMArk row gets `None`
# so every array keeps the same length as `track_evaluation_method_name` below.
# NOTE: a species present only in `omark_scores` has no BUSCO row and is not added here.
evaluation_scores["track_evaluation_method_value"] = [
    list(busco_value) + list(omark_by_species.get(species, [None]))
    for species, busco_value in zip(
        evaluation_scores["local_id"].tolist(),
        evaluation_scores["track_evaluation_method_value"].tolist(),
    )
]
df = pd.merge(df, evaluation_scores, how="outer", on="local_id")

# 3. Add in missing `name` array fields, as required
# `experiment_evidence_global_id` requires a corresponding `experiment_evidence_evidence_type` with value `['RNA', 'proteins', 'protein database']`
df["experiment_evidence_evidence_type"] = np.tile(["RNA", "proteins", "protein database"], (len(df), 1)).tolist()
# `track_evaluation_method_value` requires a corresponding `track_evaluation_method_name` with value `['BUSCO scores genome level','BUSCO scores protein level', 'OMArk sum']`
df["track_evaluation_method_name"] = np.tile(["BUSCO scores genome level","BUSCO scores protein level", "OMArk sum"], (len(df), 1)).tolist()

In [135]:
# Preview the first rows of the merged table as a quick visual check of the result
df.head()

Unnamed: 0,local_id,study_study_name,study_publication,study_contact_name,study_contact_email,study_contact_orcid,experiment_technique,track_assembly_id,track_file_name,track_file_url,track_file_format,track_feature,experiment_annotation_method_name,experiment_evidence_global_id,experiment_compute_protocol_description,track_evaluation_method_value,experiment_evidence_evidence_type,track_evaluation_method_name
0,Asterionella formosa,Annotation of protein-coding genes in 49 diato...,"[10.5281/zenodo.14040071, https://doi.org/10.4...",Katharina Hoff,katharina.hoff@uni-greifswald.de,orcid:0000-0002-7333-8390,http://edamontology.org/operation_0362,GCA_002256025.1,https://zenodo.org/records/14040072/files/Aste...,https://zenodo.org/records/14040072/files/Aste...,http://edamontology.org/format_1975,http://purl.obolibrary.org/obo/SO_0000704,[BRAKER3],"[SRR5749612, None, Viridiplantae+Stramenopiles...",https://docs.google.com/spreadsheets/d/1_1wKib...,"[C:95.0%[S:94.0%,D:1.0%],F:1.0%,M:4.0%,n:100,E...","[RNA, proteins, protein database]","[BUSCO scores genome level, BUSCO scores prote..."
1,Asterionellopsis glacialis,Annotation of protein-coding genes in 49 diato...,"[10.5281/zenodo.14040071, https://doi.org/10.4...",Katharina Hoff,katharina.hoff@uni-greifswald.de,orcid:0000-0002-7333-8390,http://edamontology.org/operation_0362,GCA_014885115.2,https://zenodo.org/records/14040072/files/Aste...,https://zenodo.org/records/14040072/files/Aste...,http://edamontology.org/format_1975,http://purl.obolibrary.org/obo/SO_0000704,[BRAKER2],"[None, None, Viridiplantae+Stramenopiles (http...",https://docs.google.com/spreadsheets/d/1_1wKib...,"[C:92.0%[S:92.0%,D:0.0%],F:1.0%,M:7.0%,n:100,E...","[RNA, proteins, protein database]","[BUSCO scores genome level, BUSCO scores prote..."
2,Bacterosira constricta,Annotation of protein-coding genes in 49 diato...,"[10.5281/zenodo.14040071, https://doi.org/10.4...",Katharina Hoff,katharina.hoff@uni-greifswald.de,orcid:0000-0002-7333-8390,http://edamontology.org/operation_0362,GCA_037356235.1,https://zenodo.org/records/14040072/files/Bact...,https://zenodo.org/records/14040072/files/Bact...,http://edamontology.org/format_1975,http://purl.obolibrary.org/obo/SO_0000704,[BRAKER3],"[SRR18733610, None, Viridiplantae+Stramenopile...",https://docs.google.com/spreadsheets/d/1_1wKib...,"[C:95.0%[S:95.0%,D:0.0%],F:2.0%,M:3.0%,n:100,E...","[RNA, proteins, protein database]","[BUSCO scores genome level, BUSCO scores prote..."
3,Chaetoceros muellerii,Annotation of protein-coding genes in 49 diato...,"[10.5281/zenodo.14040071, https://doi.org/10.4...",Katharina Hoff,katharina.hoff@uni-greifswald.de,orcid:0000-0002-7333-8390,http://edamontology.org/operation_0362,GCA_019693545.1,https://zenodo.org/records/14040072/files/Chae...,https://zenodo.org/records/14040072/files/Chae...,http://edamontology.org/format_1975,http://purl.obolibrary.org/obo/SO_0000704,[BRAKER3],"[SRR8647946,SRR8647948,SRR8647949,SRR8647950,S...",https://docs.google.com/spreadsheets/d/1_1wKib...,"[C:97.0%[S:96.0%,D:1.0%],F:1.0%,M:2.0%,n:100, ...","[RNA, proteins, protein database]","[BUSCO scores genome level, BUSCO scores prote..."
4,Conticribra guillardii,Annotation of protein-coding genes in 49 diato...,"[10.5281/zenodo.14040071, https://doi.org/10.4...",Katharina Hoff,katharina.hoff@uni-greifswald.de,orcid:0000-0002-7333-8390,http://edamontology.org/operation_0362,GCA_036939335.1,https://zenodo.org/records/14040072/files/Cont...,https://zenodo.org/records/14040072/files/Cont...,http://edamontology.org/format_1975,http://purl.obolibrary.org/obo/SO_0000704,[BRAKER3],"[SRR18733581, None, Viridiplantae+Stramenopile...",https://docs.google.com/spreadsheets/d/1_1wKib...,"[C:99.0%[S:98.0%,D:1.0%],F:0.0%,M:1.0%,n:100,E...","[RNA, proteins, protein database]","[BUSCO scores genome level, BUSCO scores prote..."
