Skip to content

Commit

Permalink
C4-294 Named Genotype Label Fields (#200)
Browse files Browse the repository at this point in the history
* C4-196 intermediate update

* Schema and types file

* test inserts

* add demo_inserts

* less gene lists for demo

* update uuid

* update mapping

* add default filter set for case

* add gene list digester

* update delimiters and attachment type

* add ensemble look up

* add xls xlsx

* small changes in support of moving to es6

* add poetry.lock back

* C4-196 ES6 compatible cgap

* C4-294 add sample_id to genotype labels

* C4-294 clean up, spec out calc prop

* C4-294 first pass, always proband centric view

* add omitted items() call

* add tests

* update versions and add pandas

* add more tests

* Commented out test for genelist processing

* default values in calc prop

* rip out es6 compat changes from this branch

* add revlink and test

* add demo insert

* remove duplicates

* fix type

* remove ingester

* C4-294 mapping table update + genotype label facets

* remove pandas

* resolve lock file

* C4-294 scrub label

* fix cmphet, fix test

Co-authored-by: Koray Kırlı <koray_kirli@hms.harvard.edu>
  • Loading branch information
willronchetti and KorayKirli committed Sep 4, 2020
1 parent 02e10d8 commit f352425
Show file tree
Hide file tree
Showing 29 changed files with 1,295 additions and 882 deletions.
509 changes: 243 additions & 266 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[tool.poetry]
# Note: Various modules refer to this system as "encoded", not "cgap-portal".
name = "encoded"
version = "2.4.20"
version = "2.5.0"
description = "Clinical Genomics Analysis Platform"
authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
license = "MIT"
Expand Down Expand Up @@ -35,7 +35,7 @@ classifiers = [
]

[tool.poetry.dependencies]
python = ">=3.6,<3.7"
python = ">=3.6.1,<3.7"
boto3 = "^1.10.46"
botocore = "^1.13.46"
certifi = ">=2020.4.5.2"
Expand Down Expand Up @@ -80,7 +80,7 @@ pyramid-multiauth = "0.8.0"
pyramid-retry = "1.0"
pyramid-tm = "2.2.1"
pyramid-translogger = "^0.1"
python-dateutil = "2.5.3"
python-dateutil = "2.7.3"
# python-magic is presently pinned to 0.4.15 in lockstep with dcicsnovault's requirements. See explanation there.
python-magic = "0.4.15"
pytz = ">=2020.1"
Expand Down
606 changes: 304 additions & 302 deletions src/encoded/annotations/variant_table_v0.4.8.csv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/encoded/commands/create_mapping_on_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
'FileReference',
'Image',
'Gene',
'GeneList',
'Phenotype',
'Disorder',
'Individual',
Expand Down
5 changes: 4 additions & 1 deletion src/encoded/commands/ingest_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,10 @@ def create_sample_variant_from_record(self, record):
for group in comhet:
annotations = {}
for field_name, value in zip(field_names, group.split('|')):
annotations[field_name] = self.fix_encoding(value)
if field_name == 'comhet_transcript': # array field
annotations[field_name] = self.fix_encoding(value).split('~')
else:
annotations[field_name] = self.fix_encoding(value)
s['cmphet'].append(annotations)

self.parse_samples(s, sample) # add sample fields, already formatted
Expand Down
24 changes: 24 additions & 0 deletions src/encoded/commands/variant_table_intake.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,29 @@ def add_extra_variant_sample_columns(cols):
'title': 'Variant'
}

@staticmethod
def add_extra_variant_sample_facets(facs):
facs['associated_genotype_labels.proband_genotype_label'] = {
'title': 'Proband Genotype',
'order': 12,
'grouping': 'Genotype'
}
facs['associated_genotype_labels.mother_genotype_label'] = {
'title': 'Mother Genotype',
'order': 13,
'grouping': 'Genotype'
}
facs['associated_genotype_labels.father_genotype_label'] = {
'title': 'Father Genotype',
'order': 14,
'grouping': 'Genotype'
}
facs['inheritance_modes'] = {
'title': 'Inheritance Modes',
'order': 15,
'grouping': 'Genotype'
}

def generate_variant_sample_schema(self, sample_props, cols, facs, variant_cols, variant_facs):
""" Builds the variant_sample.json schema based on sample_props. Will also add variant columns and
facets since this information is embedded
Expand Down Expand Up @@ -548,6 +571,7 @@ def format_variant_cols_or_facs(d):
cols.update(variant_cols) # add variant stuff since we are embedding this info
facs.update(variant_facs)
self.add_extra_variant_sample_columns(cols)
self.add_extra_variant_sample_facets(facs)
schema['columns'] = cols
schema['facets'] = facs
schema['facets'] = self.sort_schema_properties(schema, key='facets')
Expand Down
14 changes: 8 additions & 6 deletions src/encoded/inheritance_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class InheritanceModeError(Exception):

class InheritanceMode:

MISSING = '.' # XXX: is this really what this is? Should it be called 'dot'?
MISSING = '.'

AUTOSOME = 'autosome'
CHROMOSOMES = [
Expand All @@ -25,7 +25,7 @@ class InheritanceMode:

MOTHER = 'mother'
FATHER = 'father'
SELF = 'proband' # XXX: this is what the data actually looks like, NOT 'self'
SELF = 'proband'
TRIO = [MOTHER, FATHER, SELF]

# Genotype labels
Expand Down Expand Up @@ -308,7 +308,7 @@ def compute_cmphet_inheritance_modes(cmphet):
return inheritance_modes

@staticmethod
def build_genotype_label_structure(genotype_labels):
def build_genotype_label_structure(genotype_labels, sample_ids):
""" Converts the genotype_labels structure into a consistent structure that can be used
in our item ecosystem.
Expand All @@ -323,7 +323,8 @@ def build_genotype_label_structure(genotype_labels):
for role, labels in genotype_labels.items():
structured_labels.append({
'role': role,
'labels': labels
'labels': labels,
'sample_id': sample_ids.get(role)
})
return structured_labels

Expand All @@ -338,6 +339,7 @@ def compute_inheritance_modes(cls, variant_sample, chrom=None):
"""
sample_geno = variant_sample.get('samplegeno', [])
try:
sample_ids = {s["samplegeno_role"]: s["samplegeno_sampleid"] for s in sample_geno}
genotypes = {s["samplegeno_role"]: s["samplegeno_numgt"] for s in sample_geno}
sexes = {s["samplegeno_role"]: s["samplegeno_sex"] for s in sample_geno}
chrom = chrom if chrom else variant_sample.get('variant', {}).get('CHROM') # attempt to get from variant
Expand All @@ -349,7 +351,7 @@ def compute_inheritance_modes(cls, variant_sample, chrom=None):
return {}

if chrom not in ['X', 'Y']:
chrom = cls.AUTOSOME # XXX: so chrom is one of ['X', 'Y', 'autosome'] ?
chrom = cls.AUTOSOME

if cls.SELF not in genotypes:
raise InheritanceModeError('Role "proband" not present in genotypes: %s' % genotypes)
Expand All @@ -363,7 +365,7 @@ def compute_inheritance_modes(cls, variant_sample, chrom=None):
inheritance_modes = cls.inheritance_modes_other_labels(genotypes, genotype_labels)

new_fields = {
'genotype_labels': cls.build_genotype_label_structure(genotype_labels),
'genotype_labels': cls.build_genotype_label_structure(genotype_labels, sample_ids),
'inheritance_modes': inheritance_modes
}

Expand Down
59 changes: 59 additions & 0 deletions src/encoded/schemas/gene_list.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"title": "Gene List",
"description": "Groups of genes that are relevant for a disease or condition",
"id": "/profiles/gene_list.json",
"$schema": "http://json-schema.org/draft-04/schema#",
"type": "object",
"required": ["institution", "project", "title"],
"identifyingProperties": ["uuid", "aliases"],
"additionalProperties": false,
"mixinProperties": [
{ "$ref": "mixins.json#/schema_version" },
{ "$ref": "mixins.json#/uuid" },
{ "$ref": "mixins.json#/status" },
{ "$ref": "mixins.json#/aliases" },
{ "$ref": "mixins.json#/attribution" },
{ "$ref": "mixins.json#/submitted" },
{ "$ref": "mixins.json#/modified" },
{ "$ref": "mixins.json#/tags" },
{ "$ref": "mixins.json#/notes" }
],
"properties": {
"schema_version": {
"default": "1"
},
"title": {
"title": "Title",
"description": "Title for this Gene List",
"uniqueItems": true,
"type": "string"
},
"genes" : {
"title" : "Genes",
"type" : "array",
"uniqueItems": true,
"items" : {
"title" : "Gene",
"type" : "string",
"linkTo" : "Gene"
}
},
"disorders" : {
"title" : "Related Disorders",
"description": "Disorders that are related to this gene list",
"type" : "array",
"uniqueItems": true,
"items" : {
"title" : "Disorder",
"type" : "string",
"linkTo" : "Disorder"
}
},
"source_file": {
"title" : "Source File",
"description": "The original gene list file used for generating this gene list item",
"type" : "string",
"linkTo": "Document"
}
}
}
Loading

0 comments on commit f352425

Please sign in to comment.