In [1]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.Data import CodonTable
from Bio.SeqRecord import SeqRecord
from Bio import SeqFeature

from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW 

# ---------------------------------------------

Gene = IRS1 variante rs2972144
Fazendo pesquisa no NCBI: https://www.ncbi.nlm.nih.gov/snp/rs2972144#history


In [23]:
# Load the single record of the incomplete IRS1 variant from its FASTA file.
# NOTE(review): this path has no ".fasta" extension, while the BLAST cell
# further down reads "irs1_variant_incomplete.fasta" — confirm which file
# name actually exists on disk.
info = SeqIO.read("irs1_variant_incomplete", format="fasta")

# Keep only the nucleotide sequence of the record.
seq_irs1_incomplete = info.seq

Pesquisando a seq incompleta da variante IRS1, encontramos: https://www.ncbi.nlm.nih.gov/nuccore/AC062015.7?report=fasta  que não é a mesma variante

In [28]:
# Load the single record stored in the complete-variant FASTA file.
# Note: the value is whatever SeqIO.read returns for the file (a record
# object), not just the bare sequence.
seq_irs1_complete = SeqIO.read("irs1_variante.fasta", format="fasta")

Info encontrada para essa variante: https://www.ncbi.nlm.nih.gov/pmc/?term=rs2972144%20irs1

"One of the few T2D loci associated with insulin resistance (IR) encodes insulin receptor substrate 1 (IRS1), a key protein central to the insulin signaling pathway (3). A common genetic variant (rs2943641) in the neighborhood of IRS1 is associated with T2D, IR, and hyperinsulinemia in GWAS of European populations, and this variant may disrupt the insulin signaling pathway."

Conclusion: "Participants with different genotypes of IRS1 rs2943641 exhibit differential benefit from high circulating 25(OH)D for the reduction of insulin resistance and T2D risk."\
artigo: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4026060/

(Mais um artigo: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3657179/)

Portanto, podemos fazer um alinhamento múltiplo dessa variante de IRS1 com outras mais comuns. Pesquisa no PubMed com as keywords: ((rs2943641) AND (irs1)) AND (diabetis): https://pubmed.ncbi.nlm.nih.gov/?term=%28%28rs2943641%29+AND+%28irs1%29%29+AND+%28diabetis%29&sort=

https://www.ebi.ac.uk/Tools/msa/clustalo/

# ---------------------------------------------

Pesquisando "IRS1" no NCBI, apenas encontramos esta sequência para Homo sapiens: NG_015830.1

In [2]:
# Load the single record stored in the NG_015830.1 FASTA file.
seq_irs1 = SeqIO.read("NG_015830_1.fasta", format="fasta")

Corremo-la no blastn para encontrar sequências homólogas -> ficheiro my_NG_015830.1.xml

In [3]:
# Read the NG_015830.1 record and submit it, FASTA-formatted, to NCBI's
# online BLAST service (program "blastn", database "nt"), restricting the
# search with an Entrez organism query.
seq = SeqIO.read("NG_015830_1.fasta", format="fasta")
result_seq = NCBIWWW.qblast(
    "blastn",
    "nt",
    seq.format("fasta"),
    entrez_query="Homo Sapiens[organism]",
)


In [17]:
# This script cannot be run because the NG_015830_1 sequence is too large.

# save_file_seq = open("my_HS_NG_015830_1.xml", "w")
# save_file_seq.write(result_seq.read()) 
# save_file_seq.close() 
# result_seq.close()

In [18]:
# Disabled: would parse "my_HS_NG_015830_1.xml" and show its first alignment.
# NOTE(review): presumably left commented out because the step that writes
# that XML file is itself commented out — confirm before re-enabling.
# result_seq = open("my_HS_NG_015830_1.xml")
# blast_results = NCBIXML.read(result_seq)
# blast_results.alignments[0]


In [12]:
# Run an online blastn search (database "nt", restricted by an Entrez
# organism query) for the incomplete IRS1 variant and save the raw XML
# result to disk so later cells can parse it without querying NCBI again.
seq = SeqIO.read("irs1_variant_incomplete.fasta", "fasta")
result_seq = NCBIWWW.qblast(
    "blastn",
    "nt",
    seq.format("fasta"),
    entrez_query="Homo Sapiens[organism]",
)
try:
    # `with` guarantees the output file is flushed and closed even if
    # reading the result or writing it fails midway.
    with open("blast_irs1_variant_incomplete.xml", "w") as save_file_seq:
        save_file_seq.write(result_seq.read())
finally:
    # Always release the BLAST result handle, including on error.
    result_seq.close()

In [13]:
# Parse the saved BLAST XML (a single-query result) and print every
# alignment it contains. The file is opened in a `with` block so the
# handle is closed once parsing is done (the original never closed it).
with open("blast_irs1_variant_incomplete.xml") as teste:
    teste_ler = NCBIXML.read(teste)

for alignment in teste_ler.alignments:
    print(alignment)

gi|13677110|gb|AC062015.7| Homo sapiens BAC clone CTD-2031E17 from 2, complete sequence
           Length = 144223

gi|1083260056|ref|NG_028216.2| Homo sapiens anoctamin 10 (ANO10), RefSeqGene on chromosome 3
           Length = 332269

gi|20340478|gb|AC105903.2| Homo sapiens chromosome 3 clone RP11-606H24, complete sequence
           Length = 186951

gi|293337318|gb|AC240509.1| Homo sapiens FOSMID clone ABC14-50083900N23 from chromosome unknown, complete sequence
           Length = 37784

gi|154091212|gb|AC203630.3| Homo sapiens FOSMID clone ABC10-45505500C8 from chromosome 9, complete sequence
           Length = 38940

gi|14715668|emb|AL355975.10| Human DNA sequence from clone RP11-468C2 on chromosome 9, complete sequence
           Length = 131705

gi|1543379894|ref|NG_029119.2| Homo sapiens semaphorin 6D (SEMA6D), RefSeqGene on chromosome 15
           Length = 597018

gi|2033711141|gb|CP068263.2| Homo sapiens isolate CHM13 chromosome 15
           Length = 99753195

gi|19099424

In [20]:
# Run an online blastp search (database "nr", restricted by an Entrez
# organism query) for the record in cds.fasta and save the raw XML result
# to "cds.xml" so the following cells can parse it offline.
cds = SeqIO.read("cds.fasta", "fasta")
result_cds = NCBIWWW.qblast(
    "blastp",
    "nr",
    cds.format("fasta"),
    entrez_query="Homo Sapiens[organism]",
)
try:
    # `with` guarantees the output file is flushed and closed even if
    # reading the result or writing it fails midway.
    with open("cds.xml", "w") as save_file_cds:
        save_file_cds.write(result_cds.read())
finally:
    # Always release the BLAST result handle, including on error.
    result_cds.close()

In [10]:
# Print the global search parameters (database, scoring matrix, gap
# penalties) of every BLAST record stored in cds.xml.
# The `with` block closes the file even if parsing raises; the loop
# variable is no longer called `info`, so it does not overwrite the record
# bound to `info` by an earlier cell.
with open("cds.xml") as cds_xml:
    # NCBIXML.parse is consumed inside the `with` block, while the file
    # is still open.
    cds_homo_sap = NCBIXML.parse(cds_xml)
    for blast_record in cds_homo_sap:
        print('base de dados usada: ', blast_record.database)
        print('matriz: ', blast_record.matrix)
        print('parâmetros para os espaçamentos: ', blast_record.gap_penalties)

base de dados usada:  nr
matriz:  BLOSUM62
parâmetros para os espaçamentos:  (11, 1)


In [43]:
# For every alignment of every BLAST record in cds.xml, print the hit
# accession followed by identities, E-value and alignment length of each
# of its HSPs. The `with` block closes the file when the loop finishes or
# fails (the original left the handle open).
with open("cds.xml") as cds_xml:
    cds_homo_sap = NCBIXML.parse(cds_xml)
    for blast_record in cds_homo_sap:
        for alignment in blast_record.alignments:
            print("ID accession: ", alignment.accession)
            # print('descrição: ', alignment.hit_def)
            for hsp in alignment.hsps:
                print('identidade: ', hsp.identities)
                print('E-value: ', hsp.expect)
                print('comprimento dos alinhamentos: ', hsp.align_length)
                print()  # blank line between HSP entries

ID accession:  NP_005535
identidade:  1242
E-value:  0.0
comprimento dos alinhamentos:  1242

ID accession:  AAB21608
identidade:  1240
E-value:  0.0
comprimento dos alinhamentos:  1243

ID accession:  1QQG_A
identidade:  264
E-value:  0.0
comprimento dos alinhamentos:  264

ID accession:  AAK83053
identidade:  421
E-value:  3.34261e-159
comprimento dos alinhamentos:  971

identidade:  71
E-value:  8.10643e-06
comprimento dos alinhamentos:  219

ID accession:  AAK66751
identidade:  421
E-value:  2.63985e-158
comprimento dos alinhamentos:  971

identidade:  71
E-value:  8.82272e-06
comprimento dos alinhamentos:  219

ID accession:  NP_003740
identidade:  421
E-value:  2.69495e-158
comprimento dos alinhamentos:  971

identidade:  71
E-value:  8.82272e-06
comprimento dos alinhamentos:  219

ID accession:  AAK66750
identidade:  421
E-value:  2.69508e-158
comprimento dos alinhamentos:  971

identidade:  71
E-value:  8.82397e-06
comprimento dos alinhamentos:  219

ID accession:  KAI4063792
i

In [48]:
# For each BLAST record in cds.xml print: the number of alignments, the
# largest number of HSPs found in any single alignment, and the list of
# HSP counts per alignment (in alignment order).
with open("cds.xml") as cds_xml:  # closed automatically; original leaked it
    cds_homo_sap = NCBIXML.parse(cds_xml)
    for parametros in cds_homo_sap:
        # Build the per-alignment HSP counts once instead of twice.
        hsps_per_alignment = [
            len(alignment.hsps) for alignment in parametros.alignments
        ]
        print(len(parametros.alignments))
        # default=0 avoids a ValueError when a record has no alignments.
        print(max(hsps_per_alignment, default=0))
        print(hsps_per_alignment)

30
2
[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
