# Fetching gene sequence

In [2]:
from Bio import Entrez
from Bio import SeqIO
# Contact address configured on Biopython's Entrez module for the requests
# below. The value here is a placeholder (example.com) -- TODO replace it
# with a real address before running this notebook against NCBI.
Entrez.email = "A.N.Other@example.com"

In [15]:
# Ask NCBI for one record from the 'nucleotide' database, returned as
# GenBank-formatted plain text. The response is left unread in `handle`.
gene_id = '1007358254'
handle = Entrez.efetch(
    db='nucleotide',
    id=gene_id,
    rettype='gb',
    retmode='text',
)

In [16]:
# Parse the GenBank text held by the efetch handle into a record object.
res = SeqIO.read(handle, format='gb')
# The response has been consumed by the parser; close the handle so the
# underlying connection is released instead of lingering in the kernel.
handle.close()

In [18]:
# Remember the record's accession list (used later to build the ClinVar
# search term), then show every attribute of the parsed record.
acc_ids = res.annotations['accessions']
vars(res)


{'_per_letter_annotations': {},
 '_seq': Seq('GGGATCAGAAAACTCGCTGGTTAATGAGTGGCCAAGAACATATCAGCTTAAGTT...CAA', IUPACAmbiguousDNA()),
 'annotations': {'accessions': ['NM_001321072', 'XM_011529784'],
  'comment': "REVIEWED REFSEQ: This record has been curated by NCBI staff. The\nreference sequence was derived from AP001630.1.\nOn Mar 18, 2016 this sequence version replaced XM_011529784.1.\nSummary: The protein encoded by this gene acts as a homotetramer to\ncatalyze the conversion of homocysteine to cystathionine, the first\nstep in the transsulfuration pathway. The encoded protein is\nallosterically activated by adenosyl-methionine and uses pyridoxal\nphosphate as a cofactor. Defects in this gene can cause\ncystathionine beta-synthase deficiency (CBSD), which can lead to\nhomocystinuria. This gene is a major contributor to cellular\nhydrogen sulfide production. Multiple alternatively spliced\ntranscript variants have been found for this gene. [provided by\nRefSeq, Feb 2016].\nTranscript V

In [70]:
# Group the record's features by their type (e.g. 'CDS'), preserving the
# record's own ordering inside each group.
#
# NOTE: each stored value is the feature's live attribute dict, not a copy,
# so any key added to these dicts later also becomes an attribute of the
# corresponding feature object in res.features.
features = {}
for f in res.features:
    # Append in place: concatenating `setdefault(...) + [...]` would rebuild
    # the whole per-type list for every single feature.
    features.setdefault(f.type, []).append(f.__dict__)

In [71]:
# Assign every feature a display row and store its end points as
#   f['coordinates'] = [(start, row), (end, row)]
#
# Each feature type gets its own band of rows. A band is one row high unless
# the type contains features that start exactly where the previous one (in
# end-position order) stopped; such abutting features alternate between the
# two rows of a two-row band so that they stay visually distinguishable.
#
# Results:
#   row_type -- maps each display row index to the feature type drawn on it
#   row      -- ends up as the total number of rows used
row = 0
row_type = {}
for f_type in features:
    # Scan features in order of end position so "previous end" is meaningful.
    # The public .start/.end properties are used instead of the private
    # _start/_end attributes, which are an implementation detail and are not
    # defined on compound (joined) locations.
    features[f_type].sort(key=lambda feat: feat['location'].end)
    prev_end = -1      # sentinel: no real start coordinate equals -1
    contiguous = 0     # length of the current run of abutting features
    row_span = 1       # rows occupied by this feature type (1 or 2)
    for f in features[f_type]:
        start, end = f['location'].start, f['location'].end
        if start == prev_end:
            contiguous += 1
            row_span = 2
        else:
            contiguous = 0
        # Even positions in a run sit on the band's first row, odd on the second.
        feature_row = row + (contiguous % 2)
        f['coordinates'] = [(int(start), feature_row), (int(end), feature_row)]
        prev_end = end
    row_type[row] = f_type
    if row_span == 2:
        row_type[row+1] = f_type
    row += row_span


In [72]:
# Display the grouped features; each entry now also carries the
# 'coordinates' key assigned by the row-layout loop.
features

{'CDS': [{'coordinates': [(626, 3), (1967, 3)],
   'id': '<unknown id>',
   'location': FeatureLocation(ExactPosition(626), ExactPosition(1967), strand=1),
   'qualifiers': OrderedDict([('gene', ['CBS']),
                ('gene_synonym', ['HIP4']),
                ('EC_number', ['4.2.1.22']),
                ('note',
                 ['isoform 2 is encoded by transcript variant 5; serine sulfhydrase; beta-thionase; methylcysteine synthase']),
                ('codon_start', ['1']),
                ('product', ['cystathionine beta-synthase isoform 2']),
                ('protein_id', ['NP_001308001.1']),
                ('db_xref', ['GeneID:875', 'HGNC:HGNC:1550', 'MIM:613381']),
                ('translation',
                 ['MAKCEFFNAGGSVKDRISLRMIEDAERDGTLKPGDTIIEPTSGNTGIGLALAAAVRGYRCIIVMPEKMSSEKVDVLRALGAEIVRTPTNARFDSPESHVGVAWRLKNEIPNSHILDQYRNASNPLAHYDTTADEILQQCDGKLDMLVASVGTGGTITGIARKLKEKCPGCRIIGVDPEGSILAEPEELNQTEQTTYEVEGIGYDFIPTVLDRTVVDKWFKSNDEEAFTFARMLIAQEGLLCGGSAGSTVAVAVKAAQELQE

# Fetching variations from ClinVar

In [85]:
# Search ClinVar for entries that mention any of the record's accessions:
# one "<accession>[Nucleotide/Protein Accession]" clause per accession,
# combined with OR.
handle = Entrez.esearch(
    db='clinvar',
    term=' OR '.join(
        '%s[Nucleotide/Protein Accession]' % acc_id for acc_id in acc_ids
    ),
)

In [86]:
# Parse the esearch reply (with Biopython's DTD validation switched off) and
# keep the list of matching ClinVar ids.
# NOTE: this rebinds `res`, which previously held the parsed sequence record.
res = Entrez.read(handle, validate=False)
# The reply is fully parsed; close the handle to release the connection.
handle.close()
var_ids = res['IdList']
var_ids

['519588', '370129', '340091', '264609', '236937', '212883', '212874', '189185', '188989', '188988', '136677']

In [103]:
# Download the ClinVar 'variation' reports for all ids found by the search.
# Reference request URL kept from the original cell (its id parameter
# decodes to "11,9", i.e. several ids are sent comma-separated):
# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=clinvar&id=11%2C9&rettype=variation&tool=biopython
handle = Entrez.efetch(
    db='clinvar',
    id=var_ids,
    rettype='variation',
)

In [102]:
# Read the whole efetch reply once and keep it in a variable: the handle is
# a one-shot stream, so a bare handle.read() would leave nothing behind for
# any later parsing step.
variation_xml = handle.read()
# Everything has been read; close the handle to release the connection.
handle.close()
variation_xml

'<?xml version="1.0" encoding="UTF-8" ?>\n                        \n<ClinVarResult-Set><VariationReport VariationID="519588" VariationName="NM_000071.2(CBS):c.297C&gt;T (p.Phe99=)" DateCreated="2018-04-12" VariationType="Simple" DateLastUpdated="2018-04-14" SubmitterCount="1">\n  <Species TaxonomyId="9606">human</Species>\n  <GeneList GeneCount="1">\n    <Gene GeneID="875" Symbol="CBS" FullName="cystathionine-beta-synthase" HGNCID="HGNC:1550" strand="-" Type="submitted" RelationshipType="within single gene">\n      <OMIM>613381</OMIM>\n    </Gene>\n  </GeneList>\n  <Allele AlleleID="510862">\n    <Name>NM_000071.2(CBS):c.297C&gt;T (p.Phe99=)</Name>\n    <VariantType>single nucleotide variant</VariantType>\n    <CytogeneticLocation>21q22.3</CytogeneticLocation>\n    <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.33" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="43068528" stop="43068528" display_start="43068528" display_stop="43068528" vari

In [96]:
# NOTE(review): `res` was last assigned by the esearch parsing cell, so this
# displays that search result. The TypeError saved as this cell's output
# looks stale (probably left over from an earlier version of the cell), and
# the execution counts in this section are out of order -- TODO confirm by
# restarting the kernel and running all cells.
res

TypeError: 'str' object is not an iterator