# How To Represent Copy Number Variants (CNVs)

In [7]:
from ga4gh.vrs import models
from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs.dataproxy import SeqRepoRESTDataProxy

import json
from IPython.display import Image



In [2]:
# A basic example: the gene APOE (apolipoprotein E) is present in at least three copies.

# "At least three" is an open-ended count, so it is expressed as an
# IndefiniteRange with a ">=" comparator rather than as a fixed number.
indefrange = models.IndefiniteRange(comparator=">=", value=3)

# The subject of the copy-number statement is the gene itself.
apoe_gene = models.Gene(gene_id="ncbigene:348")

# Combine the count and the subject into the CopyNumber object.
apoe_cn = models.CopyNumber(copies=indefrange, subject=apoe_gene)



In [3]:
# Convert the CopyNumber object to a plain dict and pretty-print it as JSON.
print(json.dumps(apoe_cn.as_dict(), indent=1))

{
 "type": "CopyNumber",
 "subject": {
  "type": "Gene",
  "gene_id": "ncbigene:348"
 },
 "copies": {
  "type": "IndefiniteRange",
  "value": 3,
  "comparator": ">="
 }
}


## Example BRCA1 exon duplication
![BRCA1 exon duplication](images/BRCA1_exon_dup.png)


In [4]:
# Model an exon that is present in three copies where the exact breakpoints
# are unknown: they lie somewhere in the intron, which did not get sequenced.
# NC_000017.10:g.41209048-?_41209172+?dup

# Step 1: the chromosome and the sequence interval that got duplicated.
# NOTE(review): VRS intervals use inter-residue coordinates -- confirm whether
# the start should be 41209047 for an HGVS start position of 41209048.
interval = models.SequenceInterval(
    start=models.Number(value=41209048),
    end=models.Number(value=41209172),
)
location = models.SequenceLocation(
    interval=interval,
    sequence_id="refseq:NC_000017.10",
)

# Step 2: for a CNV, declare the sequence as *derived from* this location.
# A DerivedSequenceExpression says the derived sequence is approximately
# equivalent to the referenced one; it is typically used for large regions
# where the exact sequence is inconsequential.
# If we KNEW the duplication was in tandem we would use a
# RepeatedSequenceExpression instead, but in this case we do not know where
# in the genome the duplicated copy was inserted.
derivedseq = models.DerivedSequenceExpression(
    location=location,
    reverse_complement=False,
)

# Step 3: how many copies of the derived sequence are present.
# At least 3 copies are known, but more cannot be ruled out, hence an
# IndefiniteRange with a ">=" comparator.
copies = models.IndefiniteRange(value=3, comparator=">=")

# Step 4: bring everything together as the CopyNumber object and show it.
cn = models.CopyNumber(copies=copies, subject=derivedseq)

print(json.dumps(cn.as_dict(), indent=1))

{
 "type": "CopyNumber",
 "subject": {
  "type": "DerivedSequenceExpression",
  "location": {
   "type": "SequenceLocation",
   "sequence_id": "refseq:NC_000017.10",
   "interval": {
    "type": "SequenceInterval",
    "start": {
     "type": "Number",
     "value": 41209048
    },
    "end": {
     "type": "Number",
     "value": 41209172
    }
   }
  },
  "reverse_complement": false
 },
 "copies": {
  "type": "IndefiniteRange",
  "value": 3,
  "comparator": ">="
 }
}


## Example MME exon tandem duplication
![MME exon tandem duplication](images/MME_exon_tandem_dup.png)


Here is a different example: this CNV event is known to be a tandem duplication.


In [5]:
# Start again by expressing the location that got duplicated, this time on
# refseq:NC_000003.11.
# FIXME(review): the end coordinate (41209172) is smaller than the start
# (154886500) and is identical to the end used in the BRCA1 example above --
# this looks like a copy-paste leftover. Confirm the real end coordinate of
# the duplicated MME exon and re-run the cells below to refresh their output.
interval = models.SequenceInterval(start=models.Number(value=154886500), end=models.Number(value=41209172))
location = models.SequenceLocation(interval=interval,
                                  sequence_id="refseq:NC_000003.11")

In [6]:
# Unlike the previous example, here we ARE confident that the event is a
# tandem duplication, so the derived sequence is wrapped in a
# RepeatedSequenceExpression.
derivedseq = models.DerivedSequenceExpression(
    location=location,
    reverse_complement=False,
)

# At least four copies were observed.
cnv_count = models.IndefiniteRange(comparator=">=", value=4)

# NOTE(review): the same range is used both as the repeat `count` here and as
# `copies` of the CopyNumber below -- confirm this double use is intended.
repeatedexp = models.RepeatedSequenceExpression(
    seq_expr=derivedseq,
    count=cnv_count,
)

# Bring everything together as the CopyNumber object and show it.
tandem_cn = models.CopyNumber(copies=cnv_count, subject=repeatedexp)

print(json.dumps(tandem_cn.as_dict(), indent=1))



{
 "type": "CopyNumber",
 "subject": {
  "type": "RepeatedSequenceExpression",
  "seq_expr": {
   "type": "DerivedSequenceExpression",
   "location": {
    "type": "SequenceLocation",
    "sequence_id": "refseq:NC_000003.11",
    "interval": {
     "type": "SequenceInterval",
     "start": {
      "type": "Number",
      "value": 154886500
     },
     "end": {
      "type": "Number",
      "value": 41209172
     }
    }
   },
   "reverse_complement": false
  },
  "count": {
   "type": "IndefiniteRange",
   "value": 4,
   "comparator": ">="
  }
 },
 "copies": {
  "type": "IndefiniteRange",
  "value": 4,
  "comparator": ">="
 }
}
