## Searching for IRS1 sequence

In [45]:
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 
from Bio import SeqIO

In [34]:
# Search the NCBI nucleotide database for human IRS1 entries on chromosome 2,
# excluding predicted and unverified records, and keep the matching ids.
database = 'nucleotide'
word = 'irs1 and homo sapiens and Chromosome 2 and not predicted and not unverified '
res= '15'  # passed as retmax: upper bound on the number of ids returned
# Contact address sent to NCBI with every request; kept identical to the one
# used by the download cell further down.
email= 'karynalysenko@ua.pt'
Entrez.email= email
handle_search=Entrez.esearch(db = database, term=word, retmax= res)
try:
    record=Entrez.read(handle_search)
finally:
    # Release the connection even when parsing the response fails.
    handle_search.close()
idlist= record['IdList']

In [38]:
# Fetch the GenBank records for every id found by the search and list them
# (accession, sequence length, description) so one can be picked by hand.
# retmode is stated explicitly, as in the single-record download below.
handle = Entrez.efetch(db=database, id=idlist, rettype="gb", retmode="text")
try:
    records = list(SeqIO.parse(handle,"gb"))
finally:
    # Close the network handle even if a record fails to parse.
    handle.close()
for info in records:
    print(info.id, '-length of seq:', len(info.seq), '-', info.description)

NM_001330156.1 -length of seq: 2450 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 3, mRNA
NM_005544.3 -length of seq: 9771 - Homo sapiens insulin receptor substrate 1 (IRS1), mRNA
NG_015830.1 -length of seq: 74474 - Homo sapiens insulin receptor substrate 1 (IRS1), RefSeqGene on chromosome 2
NM_001100818.2 -length of seq: 2557 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 2, mRNA
NM_001330158.2 -length of seq: 2664 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 5, mRNA
NM_017933.5 -length of seq: 2768 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 1, mRNA
NM_001330157.2 -length of seq: 2410 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 4, mRNA
CM000253.1 -length of seq: 236827137 - Homo sapiens chromosome 2, whole genome shotgun sequence
CH471063.1 -length of seq: 4762804

The id has to be selected manually, because the titles of the records returned by the query follow no consistent pattern.\
The id __NG_015830.1__ is the only IRS1 entry that is a genomic sequence, is not the whole chromosome, and is a curated RefSeq record (RefSeqGene). This means that the sequence is used as a reference standard for a well-characterized gene. So id __NG_015830.1__ will be used from now on.

In [44]:
#The correspondent information of NG_015830.1 was downloaded to a file
# The file acts as a local cache: the record is only fetched when the file
# does not exist yet.
import os
Entrez.email = "karynalysenko@ua.pt"
filename = "NG_015830_1_teste.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NG_015830.1", rettype="gb", retmode="text")
    try:
        # Read the whole record before touching the disk, so a failed download
        # cannot leave behind an empty file that later runs would mistake for
        # a valid cached copy.
        genbank_text = net_handle.read()
    finally:
        net_handle.close()
    with open(filename, "w") as out_handle:
        out_handle.write(genbank_text)

In [70]:
# Parse the cached GenBank file and summarise it: sequence length, the NCBI
# comment and the CDS annotations.
# The file is opened in a with-block so the handle is closed after parsing.
with open("NG_015830_1_teste.gb") as genbank_handle:
    record = SeqIO.read(genbank_handle, format="genbank")
# Indices (within record.features) of every feature annotated as a CDS.
cds_positions = [index for index, feature in enumerate(record.features) if feature.type == "CDS"]
count = len(cds_positions)
# As before, position is the index of the last CDS found, or 0 when there is none.
position = cds_positions[-1] if cds_positions else 0
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
print("Number of annotated CDS: {} in position:{}\n".format(count,position))

The length of the sequence: 74474

Comment from NCBI: REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from AC010735.11.
This sequence is a reference standard in the RefSeqGene project.
Summary: This gene encodes a protein which is phosphorylated by
insulin receptor tyrosine kinase. Mutations in this gene are
associated with type II diabetes and susceptibility to insulin
resistance. [provided by RefSeq, Nov 2009].

Number of annotated CDS: 1 in position:6



In [75]:
#checking the location of the CDS on the original sequence
# The CDS feature is looked up by its type instead of a fixed list index, so
# the cell keeps pointing at the CDS if the feature order of the record changes.
cds_feature = next(feature for feature in record.features if feature.type == "CDS")
print(cds_feature.location)

[5052:8781](+)


Seq('ATGGCGAGCCCTCCGGAGAGCGATGGCTTCTCGGACGTGCGCAAGGTGGGCTAC...TAG')

In [76]:
# Cut the coding sequence out of the record through the CDS annotation itself
# rather than hard-coded coordinates; extract() follows the feature's location
# and strand.
cds_feature = next(feature for feature in record.features if feature.type == "CDS")
CDS_nuc_seq=cds_feature.extract(record.seq)

In [None]:

# For every entry in `file`, request the top reviewed UniProtKB hit as TSV,
# echo the response and append it to the results file.
# NOTE(review): `file`, `get_url`, `WEBSITE_API` and `fields` are not defined in
# this part of the notebook - confirm they are set by an earlier cell.
query_template = "{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv"
with open('uniprot_result_CDS_filtered.txt', 'w',encoding='utf-8') as out_file:
    for entry in file:
        response = get_url(query_template.format(WEBSITE_API, entry, fields))
        print(response.text)
        # One blank-line-terminated block per queried entry.
        out_file.write(response.text)
        out_file.write('\n')