Skip to content

Commit

Permalink
chore: remove deprecated ontology processing code (#818)
Browse files: browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed Apr 3, 2024
1 parent bc86a51 commit 36e0ec6
Show file tree
Hide file tree
Showing 35 changed files with 68 additions and 3,023 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 5.0.0
current_version = 5.0.2
commit = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(?:-(?P<prerel>rc)\.(?P<prerelversion>\d+))?
serialize =
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
name: Updates to Ontology Files
name: Updates to GENCODE Files

on:
push:
paths:
- '**/cellxgene_schema_cli/cellxgene_schema/ontology_files/gene_info.yml'
- '**/cellxgene_schema_cli/cellxgene_schema/ontology_files/owl_info.yml'
- '**/cellxgene_schema_cli/cellxgene_schema/gencode_files/gene_info.yml'
branches-ignore:
- main

Expand All @@ -16,19 +15,10 @@ jobs:
uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.ref }}
- name: ontology changes
uses: dorny/paths-filter@v2
id: filter
with:
filters: |
owl_info:
- 'cellxgene_schema_cli/cellxgene_schema/ontology_files/owl_info.yml'
gene_info:
- 'cellxgene_schema_cli/cellxgene_schema/ontology_files/gene_info.yml'
- name: Set up Python 3.8
- name: Set up Python 3.10
uses: actions/setup-python@v1
with:
python-version: 3.8
python-version: "3.10"
- name: Python cache
uses: actions/cache@v1
with:
Expand All @@ -43,21 +33,14 @@ jobs:
run: |
git config user.name github-actions
git config user.email github-actions@github.com
- name: owl-processing
if: ${{ steps.filter.outputs.owl_info == 'true' }}
run: |
make download-ontologies -C cellxgene_schema_cli
git add ./cellxgene_schema_cli/cellxgene_schema/ontology_files/all_ontology.json.gz
- name: gene-processing
if: ${{ steps.filter.outputs.gene_info == 'true' }}
run: |
make gene-processing -C cellxgene_schema_cli
git add ./cellxgene_schema_cli/cellxgene_schema/ontology_files/genes_*.csv.gz
git add ./cellxgene_schema_cli/cellxgene_schema/ontology_files/*_diff.txt
- name: Commit
if: ${{ steps.filter.outputs.gene_info == 'true' || steps.filter.outputs.owl_info == 'true' }}
run: |
git commit -m "AUTO: update ontologies"
git commit -m "AUTO: update gencode files"
git push
unit-tests:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schema-version-bump.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ jobs:
id: changed-files
uses: tj-actions/changed-files@v36
with:
files: cellxgene_schema_cli/cellxgene_schema/ontology_files/*.csv.gz
files: cellxgene_schema_cli/cellxgene_schema/gencode_files/*.csv.gz
- name: Run Dry-Run Script
if: steps.changed-files.outputs.any_changed == 'true'
run: python3 -m scripts.schema_bump_dry_run_genes.gene_bump_dry_run
Expand Down
53 changes: 0 additions & 53 deletions .github/workflows/update-ontology-mappings.yml

This file was deleted.

6 changes: 1 addition & 5 deletions cellxgene_schema_cli/Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
.PHONY: update-references
update-references: download-ontologies gene-processing clean

.PHONY: download-ontologies
download-ontologies:
python3 ./scripts/ontology_processing.py
update-references: gene-processing clean

.PHONY: gene-processing
gene-processing:
Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/cellxgene_schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "5.0.0"
__version__ = "5.0.2"
6 changes: 2 additions & 4 deletions cellxgene_schema_cli/cellxgene_schema/env.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import os

PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
ONTOLOGY_DIR = os.path.join(PACKAGE_ROOT, "ontology_files")
GENE_INFO_YAML = os.path.join(ONTOLOGY_DIR, "gene_info.yml")
OWL_INFO_YAML = os.path.join(ONTOLOGY_DIR, "owl_info.yml")
PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_DIR, "all_ontology.json.gz")
GENCODE_DIR = os.path.join(PACKAGE_ROOT, "gencode_files")
GENE_INFO_YAML = os.path.join(GENCODE_DIR, "gene_info.yml")
SCHEMA_DEFINITIONS_DIR = os.path.join(PACKAGE_ROOT, "schema_definitions")
SCHEMA_DEFINITION_FILE = os.path.join(SCHEMA_DEFINITIONS_DIR, "schema_definition.yaml")
SCHEMA_REFERENCE_BASE_URL = "https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ class GeneChecker:
"""Handles checking gene ids, retrieves symbols"""

GENE_FILES = {
SupportedOrganisms.HOMO_SAPIENS: os.path.join(env.ONTOLOGY_DIR, "genes_homo_sapiens.csv.gz"),
SupportedOrganisms.MUS_MUSCULUS: os.path.join(env.ONTOLOGY_DIR, "genes_mus_musculus.csv.gz"),
SupportedOrganisms.SARS_COV_2: os.path.join(env.ONTOLOGY_DIR, "genes_sars_cov_2.csv.gz"),
SupportedOrganisms.ERCC: os.path.join(env.ONTOLOGY_DIR, "genes_ercc.csv.gz"),
SupportedOrganisms.HOMO_SAPIENS: os.path.join(env.GENCODE_DIR, "genes_homo_sapiens.csv.gz"),
SupportedOrganisms.MUS_MUSCULUS: os.path.join(env.GENCODE_DIR, "genes_mus_musculus.csv.gz"),
SupportedOrganisms.SARS_COV_2: os.path.join(env.GENCODE_DIR, "genes_sars_cov_2.csv.gz"),
SupportedOrganisms.ERCC: os.path.join(env.GENCODE_DIR, "genes_ercc.csv.gz"),
}

def __init__(self, species: SupportedOrganisms):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Changes to this file will trigger a run of the ontology_process.yml GHA job, which will run the gene_processing.py script.
# Changes to this file will trigger a run of the gencode_process.yml GHA job, which will run the gene_processing.py script.
ercc:
description: ercc
url: https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095047.txt
Expand Down
Binary file not shown.
39 changes: 0 additions & 39 deletions cellxgene_schema_cli/cellxgene_schema/ontology_files/owl_info.yml

This file was deleted.

10 changes: 5 additions & 5 deletions cellxgene_schema_cli/cellxgene_schema/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pandas.core.computation.ops import UndefinedVariableError
from scipy import sparse

from . import ontology, schema
from . import gencode, schema
from .utils import SPARSE_MATRIX_TYPES, get_matrix_format, getattr_anndata, read_h5ad

logger = logging.getLogger(__name__)
Expand All @@ -31,8 +31,8 @@ def __init__(self, ignore_labels=False):
self.schema_version: str = None
self.ignore_labels = ignore_labels

# Values will be instances of ontology.GeneChecker,
# keys will be one of ontology.SupportedOrganisms
# Values will be instances of gencode.GeneChecker,
# keys will be one of gencode.SupportedOrganisms
self.gene_checkers = dict()

def reset(self):
Expand Down Expand Up @@ -292,7 +292,7 @@ def _validate_feature_id(self, feature_id: str, df_name: str):
:rtype none
"""

organism = ontology.get_organism_from_feature_id(feature_id)
organism = gencode.get_organism_from_feature_id(feature_id)

if not organism:
self.errors.append(
Expand All @@ -302,7 +302,7 @@ def _validate_feature_id(self, feature_id: str, df_name: str):
return

if organism not in self.gene_checkers:
self.gene_checkers[organism] = ontology.GeneChecker(organism)
self.gene_checkers[organism] = gencode.GeneChecker(organism)

if not self.gene_checkers[organism].is_valid_id(feature_id):
self.errors.append(f"'{feature_id}' is not a valid feature ID in '{df_name}'.")
Expand Down
10 changes: 5 additions & 5 deletions cellxgene_schema_cli/cellxgene_schema/write_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Dict, List, Optional

import pandas as pd
from cellxgene_schema import ontology
from cellxgene_schema import gencode
from cellxgene_schema.env import SCHEMA_REFERENCE_BASE_URL, SCHEMA_REFERENCE_FILE_NAME
from cellxgene_schema.validate import ONTOLOGY_PARSER, Validator

Expand Down Expand Up @@ -169,12 +169,12 @@ def _get_mapping_dict_feature_id(self, ids: List[str]) -> Dict[str, str]:
mapping_dict = {}

for i in ids:
organism = ontology.get_organism_from_feature_id(i)
organism = gencode.get_organism_from_feature_id(i)
mapping_dict[i] = self.validator.gene_checkers[organism].get_symbol(i)

return mapping_dict

def _get_mapping_dict_feature_reference(self, ids: List[str]) -> Dict[str, Optional[ontology.SupportedOrganisms]]:
def _get_mapping_dict_feature_reference(self, ids: List[str]) -> Dict[str, Optional[gencode.SupportedOrganisms]]:
"""
Creates a mapping dictionary of gene/feature IDs and NCBITaxon curies
Expand All @@ -187,7 +187,7 @@ def _get_mapping_dict_feature_reference(self, ids: List[str]) -> Dict[str, Optio
mapping_dict = {}

for i in ids:
organism = ontology.get_organism_from_feature_id(i)
organism = gencode.get_organism_from_feature_id(i)
mapping_dict[i] = organism.value

return mapping_dict
Expand Down Expand Up @@ -227,7 +227,7 @@ def _get_mapping_dict_feature_length(self, ids: List[str]) -> Dict[str, int]:

for i in ids:
if i.startswith("ENS"):
organism = ontology.get_organism_from_feature_id(i)
organism = gencode.get_organism_from_feature_id(i)
mapping_dict[i] = self.validator.gene_checkers[organism].get_length(i)
else:
mapping_dict[i] = 0
Expand Down
2 changes: 1 addition & 1 deletion cellxgene_schema_cli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ cellxgene-ontology-guide==0.4.0
click==8.1.3
Cython==0.29.34
numpy==1.23.2
owlready2==0.40.0
pandas==1.4.4
PyYaml==6.0
wheel==0.40.0
scipy<1.13.0 # scipy 1.13.0 is not compatible with anndata <=0.10.6, revisit before next release
semver==3.0.0
xxhash==3.3.0
matplotlib==3.7.3
Loading

0 comments on commit 36e0ec6

Please sign in to comment.