## Searching for IRS1 sequence

In [2]:
from Bio import Entrez, SeqIO, Medline
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import Phylo
import requests, sys, json
import re

In [16]:
#pubmed search
database = 'PubMed'
word = 'irs1 insulin'
res= 15 
email= "karynalysenko@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text")
records = list(Medline.parse(handle))


for record in records[0:5]:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("Title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("Authors:", record.get("AU", "-"))
    print("Source:", record.get("SO", "-"))
    print("")

Title: Effect of Codonopsis Radix and Polygonati Rhizoma on the regulation of the IRS1/PI3K/AKT signaling pathway in type 2 diabetic mice.
Authors: ['Mao YP', 'Song YM', 'Pan SW', 'Li N', 'Wang WX', 'Feng BB', 'Zhang JH']
Source: Front Endocrinol (Lausanne). 2022 Dec 14;13:1068555. doi: 10.3389/fendo.2022.1068555. eCollection 2022.

Title: The identities of insulin signaling pathway are affected by overexpression of Tau and its phosphorylation form.
Authors: ['Ma N', 'Liang Y', 'Yue L', 'Liu P', 'Xu Y', 'Zhu C']
Source: Front Aging Neurosci. 2022 Dec 16;14:1057281. doi: 10.3389/fnagi.2022.1057281. eCollection 2022.

Title: Insulin ameliorates dim blue light at night-induced apoptosis in hippocampal neurons via the IR/IRS1/AKT/GSK3beta/beta-catenin signaling pathway.
Authors: ['Liu Q', 'Wang Z', 'Cao J', 'Dong Y', 'Chen Y']
Source: Ecotoxicol Environ Saf. 2022 Dec 29;250:114488. doi: 10.1016/j.ecoenv.2022.114488.

Title: Sodium oxamate reduces lactate production to improve the glucose h

In [23]:
#ncbi search to find the sequence of IRS1
database = 'nucleotide'
word = 'irs1 and homo sapiens and Chromosome 2 and not predicted and not unverified '
res= '15'
email= 'karyanlysenko@ua.pt'
Entrez.email= email
handle_search=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle_search)
handle_search.close()
idlist= record['IdList']
handle = Entrez.efetch(db=database, id=idlist, rettype="gb") 
records = list(SeqIO.parse(handle,"gb"))
handle.close()
for info in records:
    print(info.id, '-', info.description)
    #print('length of seq:', len(info.seq)) #to check the length of the sequences

NM_005544.3 - Homo sapiens insulin receptor substrate 1 (IRS1), mRNA
NM_001100818.2 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 2, mRNA
NM_001330158.2 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 5, mRNA
NM_001330157.2 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 4, mRNA
NM_017933.5 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 1, mRNA
NM_001330156.1 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 3, mRNA
NG_015830.1 - Homo sapiens insulin receptor substrate 1 (IRS1), RefSeqGene on chromosome 2
CM000253.1 - Homo sapiens chromosome 2, whole genome shotgun sequence
CH471063.1 - Homo sapiens 211000035834619 genomic scaffold, whole genome shotgun sequence


The selection of the id has to be done manually as there is no pattern in writing the titles of the queries.\
The id __NG_015830.1__ is the only one where the annotated sequence is not a whole genome of the chromossome, is not a mRNA and actually is RefSeq. This means that the sequence is being used as a standard for well-characterized genes. So id __NG_015830.1__ will be used from now on.

In [7]:
#The correspondent information of NG_015830.1 was downloaded to a file
import os
Entrez.email = "karynalysenko@ua.pt"
filename = "NG_015830_1.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NG_015830.1", rettype="gb", retmode="text")
    out_handle = open(filename, "w")
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()

In [25]:
record = SeqIO.read(open("NG_015830_1.gb"), format="genbank")
position=0
record_types={}
for x in record.features:
    record_types[x.type]=record_types.get(x.type,0)+1
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        position=i
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
#checking the location of the CDS on the original sequence
print("Location of the CDS on the original sequence: {}".format(record.features[position].location))

The length of the sequence: 74474

Type of features: {'source': 1, 'gene': 2, 'mRNA': 2, 'exon': 2, 'CDS': 1, 'misc_feature': 46}

Comment from NCBI: REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from AC010735.11.
This sequence is a reference standard in the RefSeqGene project.
Summary: This gene encodes a protein which is phosphorylated by
insulin receptor tyrosine kinase. Mutations in this gene are
associated with type II diabetes and susceptibility to insulin
resistance. [provided by RefSeq, Nov 2009].

Location of the CDS on the original sequence: [5052:8781](+)


In [34]:
CDS_nuc_seq_location=record.seq[int(record.features[position].location.start):int(record.features[position].location.end)]

In [33]:
#saving the CDS_nucleotides and CD_aminoacid seqs in files
filename = "NG_015830_1.gb"
filename_CDS_nucl = "CDS_nucleot_seq-test.fasta"
filename_CDS_prot = "CDS_prot_seq.fasta"
input_handle  = open(filename, "r")
output_handle_nucl = open(filename_CDS_nucl, "w")
output_handle_prot = open(filename_CDS_prot, "w")
for seq_record in SeqIO.parse(input_handle, "genbank"):
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq_location))
    output_handle_prot.write(">\n%s" % ("".join(record.features[position].qualifiers['translation']))) #without join, output is a list
    
output_handle_nucl.close()
output_handle_prot.close()
input_handle.close()

## BLASTN - for all organisms

In [4]:
record_blastn = SeqIO.read(open("CDS_nucleot_seq.fasta"), format="fasta") 
print(len(record_blastn.seq))

3729


In [41]:
Blastn = NCBIWWW.qblast("blastn", "nt", record_blastn.seq) #not filtered for Homo sapiens
with open('blastn_CDS_nucleot_seq.xml', "w") as out_handle:
        out_handle.write(Blastn.read())
Blastn.close()

In [46]:
results_Blastn = open("blastn_CDS_nucleot_seq.xml")
blastn_records = NCBIXML.read(results_Blastn)
for parameter in blastn_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)

Number of alignments: 50

Accession:  XM_047444224
Definition:  PREDICTED: Homo sapiens insulin receptor substrate 1 (IRS1), transcript variant X2, mRNA
E-value:  0.0

Accession:  XM_047444223
Definition:  PREDICTED: Homo sapiens insulin receptor substrate 1 (IRS1), transcript variant X1, mRNA
E-value:  0.0

Accession:  NM_005544
Definition:  Homo sapiens insulin receptor substrate 1 (IRS1), mRNA
E-value:  0.0

Accession:  NG_015830
Definition:  Homo sapiens insulin receptor substrate 1 (IRS1), RefSeqGene on chromosome 2
E-value:  0.0

Accession:  AC010735
Definition:  Homo sapiens BAC clone RP11-395N3 from 2, complete sequence
E-value:  0.0


In [49]:
#filtering the "predicted" alignments
results_Blastn= open('blastn_CDS_nucleot_seq.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                # print( existe[0] )
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
                    #print(id)
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(list_filtered_alignments)
print('Total {} PREDICTED seqs found and remaining {} ids are from Homo sapiens'.format(count_preditc, count_homo))
#counting of Homo sapiens it's not conclusive, just to have an ideia, because the titles don't follow any pattern

['NM_005544', 'NG_015830', 'AC010735', 'S62539', 'BC053895', 'LT743046', 'KJ891488', 'AB384351', 'EU831611', 'EU831698', 'S85963', 'U43502']
Total 38 PREDICTED seqs found and remaining 8 ids are from Homo sapiens


In [None]:
with open('CDS_nucleotide_result_blast.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## BLASTP

In [14]:
record_blastp = SeqIO.read(open("CDS_prot_seq.fasta"), format="fasta") 
print(len(record_blastp.seq))

1242


In [15]:
Blastp = NCBIWWW.qblast("blastp", "swissprot", record_blastp.seq) #not filtered for Homo sapiens
with open('blastp_CDS_prot_seq.xml', "w") as out_handle:
        out_handle.write(Blastp.read())
Blastp.close()

In [61]:
results_Blastp = open("blastp_CDS_prot_seq.xml")
blastp_records = NCBIXML.read(results_Blastp)
for parameter in blastp_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  P35568
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1 [Homo sapiens]
E-value:  0.0

Accession:  Q28224
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1 [Chlorocebus aethiops]
E-value:  0.0

Accession:  P35570
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1; AltName: Full=pp185 [Rattus norvegicus]
E-value:  0.0

Accession:  P35569
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1 [Mus musculus]
E-value:  0.0

Accession:  P84770
Definition:  RecName: Full=Insulin receptor substrate 1-B; Short=IRS1-B; Short=xIRS-1-B; AltName: Full=XIRS-L' [Xenopus laevis]
E-value:  0.0


In [3]:
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records = NCBIXML.read(results_Blastp)
E_VALUE_THRESH = 0.001
list_filtered_alignments,list_species=[],[]
for alignment in  blastp_records.alignments:
    for hsp in alignment.hsps:
#         print(hsp.identities)    # maybe add more 
        if hsp.expect < E_VALUE_THRESH:
            list_filtered_alignments.append(alignment.accession)
            title_organism=re.search(r'\[.+\s.+\]', alignment.title)
            if title_organism:
                m = re.match(r'\[.+\s.+\]', title_organism[0] )
                specie = m.group(0)
                #print(specie)
                list_species.append(specie)
for x in sorted(set(list_species)):
    print("number of times: {} that appeared specie: {}".format(list_species.count(x),x))

print(set(list_filtered_alignments))
#print(len(list_filtered_alignments))

number of times: 1 that appeared specie: [Bos taurus]
number of times: 1 that appeared specie: [Chlorocebus aethiops]
number of times: 1 that appeared specie: [Drosophila ananassae]
number of times: 1 that appeared specie: [Drosophila erecta]
number of times: 1 that appeared specie: [Drosophila melanogaster]
number of times: 1 that appeared specie: [Drosophila sechellia]
number of times: 1 that appeared specie: [Drosophila yakuba]
number of times: 9 that appeared specie: [Homo sapiens]
number of times: 1 that appeared specie: [Mesocricetus auratus]
number of times: 6 that appeared specie: [Mus musculus]
number of times: 2 that appeared specie: [Rattus norvegicus]
number of times: 4 that appeared specie: [Xenopus laevis]
number of times: 2 that appeared specie: [Xenopus tropicalis]
{'B4NZ70', 'Q9Z0Y7', 'P35570', 'Q9XTN2', 'Q9Y4H2', 'Q5RJW5', 'Q99PF6', 'P35569', 'P81122', 'Q91615', 'B3MPN6', 'Q6P4Y6', 'P84770', 'B4HWI2', 'Q99KE3', 'A6QLU3', 'Q9DF49', 'Q9Z1S8', 'O14654', 'Q8WWW8', 'Q13480

In [5]:
#filter by identities and coverage - not working when blastp alignment has several hsps
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records_r = NCBIXML.read(results_Blastp)
first=blastp_records_r.alignments[0]
len_max_seq=first.hsps[0].align_length

In [6]:
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records = NCBIXML.parse(results_Blastp)
list_accession,list_identities,list_coverage_start, list_coverage_stop,list_alignLen=[],[],[],[],[]

for alignment in  blastp_records:
    for a in alignment.alignments:
        for b in a.hsps:
            list_accession.append(a.accession)
            list_coverage_start.append(b.sbjct_start)
            list_coverage_stop.append(b.sbjct_end)
            list_identities.append(b.identities)
            list_alignLen.append(b.align_length)
            #print(a.accession, b.align_length, b.identities, b.expect, b.sbjct_start, b.sbjct_end)

#calculation of %identity                        
def identity(list_i, list_a):
    perc_list_identities=[]
    for index in range(len(list_i)):
        perc_list_identities.append(round(list_i[index]*100/list_a[index]))
    return perc_list_identities

#calculation of the %coverage - for all hsps of all accessions
def coverage(list_end,list_start):
    perc_list_coverage=[]
    for index in range(len(list_identities)):
        perc_list_coverage.append(round((1+list_end[index]-list_start[index])/len_max_seq*100))
    return perc_list_coverage

#selecting the max coverage and max identity per hsps/accession
def max_values_by_id(ids, covs, idents): 
    unique_ids = []
    max_coverage = []
    max_identity = []
    for id, cov, ident in zip(ids, covs,idents):
        if id in unique_ids:
            idx = unique_ids.index(id)
            max_coverage[idx] = max(max_coverage[idx], cov)
            max_identity[idx] = max(max_identity[idx], ident)
        else:
            unique_ids.append(id)
            max_coverage.append(cov)
            max_identity.append(ident)
    return unique_ids, max_coverage, max_identity

beta_max=max_values_by_id(list_accession, coverage(list_coverage_stop, list_coverage_start),identity(list_identities,list_alignLen))


In [11]:
print('{:>5}{:>14}{:>11}'.format('ID','%coverage','%identity'))
thresold_coverage=30
thresold_identity=30

for id, cov, ident in zip(beta_max[0], beta_max[1], beta_max[2]):
#     print('{:>5}{:>9}{:>12}'.format(id,cov, ident))
    if cov>thresold_coverage and ident>thresold_identity:
        print('{:>5}{:>9}{:>12}'.format(id, cov, ident))

   ID     %coverage  %identity
P35568      100         100
Q28224      101          97
P35570       99          89
P35569       99          88
P84770       87          58
Q91615       71          55
Q9DF49       78          38
Q5RJW5       78          38
P81122       71          43
Q9Y4H2       72          43


The output of Blastp gave more hits. So the list of ids of the last script will be saved and used on Uniprot search.

In [57]:
with open('CDS_protein_result_blastp.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## Uniprot search of Blastp results

In [None]:
#the seqs here are already aligned, they have '-' (from blastp)
# results_Blastp= open('blastp_CDS_prot_seq.xml')
# blastp_records = NCBIXML.read(results_Blastp)

# with open('allOrg_CDS_prot.fasta', 'w') as f:
#     for alignment in  blastp_records.alignments:
#         f.write(f">{alignment.title}\n")
#         for hsp in alignment.hsps:
#             f.write(f"{hsp.query}\n\n")

In [202]:
#saving sequences from Uniprot
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records = NCBIXML.read(results_Blastp)
titles_list=[]
for alignment in  blastp_records.alignments:   
    titles_list.append(alignment.title)

file= open("CDS_protein_result_blastp.txt", "r")
fields="sequence"
WEBSITE_API="https://rest.uniprot.org"
seqs=[]

for i in file:
    r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
    seqs.append(str(r.content))

with open('allOrg_CDS_prot_fromUniprot.fasta', 'w') as f:
    for index, seq in enumerate(seqs):
        existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
        if existe:
            m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
            f.write(f">{titles_list[index]}\n{m.group(1)}\n\n")                    
            #print(m.group(1))       

In [None]:
#another easier way to do this but doesn't work in all cases
# from Bio import SwissProt
# from Bio import ExPASy
# with ExPASy.get_sprot_raw("P35568") as handle:
#     seq_record = SeqIO.read(handle, "swiss")
#     print(seq_record.id)
#     #print(seq_record.entry_name, "\n")
#     #print(", ".join(seq_record.accessions), "\n")
#     #print(seq_record.keywords, "\n")
#     #print(seq_record.organism, "\n")
#     #print(len(seq_record.sequence), "aa", "\n")
#     print(seq_record.seq)

In [5]:
def get_url(url, **kwargs):
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

In [60]:
file= open("CDS_protein_result_blastp.txt", "r")
fields="accession,organism_name,protein_name,cc_subcellular_location,cc_function"
WEBSITE_API="https://rest.uniprot.org"
with open('uniprot_result_CDS_filtered.txt', 'w',encoding='utf-8') as f:
    for i in file:
        r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
        print(r.text)
        f.write(r.text)
        f.write('\n')

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
P35568	Homo sapiens (Human)	Insulin receptor substrate 1 (IRS-1)		FUNCTION: May mediate the control of various cellular processes by insulin. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains such as phosphatidylinositol 3-kinase p85 subunit or GRB2. Activates phosphatidylinositol 3-kinase when bound to the regulatory p85 subunit (By similarity). {ECO:0000250, ECO:0000269|PubMed:16878150}.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q28224	Chlorocebus aethiops (Green monkey) (Cercopithecus aethiops)	Insulin receptor substrate 1 (IRS-1)		FUNCTION: May mediate the control of various cellular processes by insulin. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains such as phosphatidylinositol 3-kinase p85 subunit or GRB2. Activates phosphatidylinositol 3-kinase when bo

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
B3N946	Drosophila erecta (Fruit fly)	Insulin receptor substrate 1 (Protein chico)		FUNCTION: Activates phosphatidylinositol 3-kinase when bound to the regulatory p85 subunit. May mediate the control of various cellular processes by insulin-like peptides. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains. Involved in control of cell proliferation, cell size, and body and organ growth throughout development. Also has a role in a signaling pathway controlling the physiological response required to endure periods of low nutrient conditions. Insulin/insulin-like growth factor (IGF) signaling pathway has a role in regulating aging and is necessary in the ovary for vitellogenic maturation (By similarity). {ECO:0000250|UniProtKB:P35570, ECO:0000250|UniProtKB:Q9XTN2}.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
B4NZ70	Drosophila yakuba (

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q13480	Homo sapiens (Human)	GRB2-associated-binding protein 1 (GRB2-associated binder 1) (Growth factor receptor bound protein 2-associated protein 1)		FUNCTION: Adapter protein that plays a role in intracellular signaling cascades triggered by activated receptor-type kinases. Plays a role in FGFR1 signaling. Probably involved in signaling by the epidermal growth factor receptor (EGFR) and the insulin receptor (INSR). Involved in the MET/HGF-signaling pathway (PubMed:29408807). {ECO:0000269|PubMed:29408807}.

Entry	Organism	Protein names	Subcellular location [CC]	Function [CC]
Q9QYY0	Mus musculus (Mouse)	GRB2-associated-binding protein 1 (GRB2-associated binder 1) (Growth factor receptor bound protein 2-associated protein 1)		FUNCTION: Adapter protein that plays a role in intracellular signaling cascades triggered by activated receptor-type kinases. Plays a role in FGFR1 signaling. Probably involved in signaling by th

## Alignment and Phylo

In [93]:
#Clustalw
dir = r'C:\Program Files (x86)\ClustalW2\clustalw2'
in_file = r'allOrg_CDS_prot_fromUniprot.fasta'

clustalw_cline = ClustalwCommandline(dir, infile=in_file)
cline = ClustalwCommandline("clustalw", infile="allOrg_CDS_prot_fromUniprot.fasta", outfile="allOrg_CDS_prot_fromUniprot.aln")

In [118]:
align = AlignIO.read("allOrg_CDS_prot_fromUniprot.aln", "clustal")
print("Número de linhas: %i" % len(align)) 
print(format(align, "clustal"))

# for record in align:
#     print("%s - %s" % (record.seq, record.id)) 

#ver com que frequência as letras são substituidas
subs = align.substitutions
# print(subs)

Número de linhas: 31
CLUSTAL 2.1 multiple sequence alignment


sp|B4HWI2.1|                        --------------------------------------------------
sp|Q9XTN2.1|                        --------------------------------------------------
sp|B3MPN6.1|                        --------------------------------------------------
sp|Q9Z1S8.2|                        --------------------------------------------------
sp|Q9EQH1.2|                        --------------------------------------------------
sp|B3N946.1|                        MASCSFTRDQATRRLRG-AAAAAAAALAAVVTTPLLSSGTPTALIGTGSS
sp|B4NZ70.1|                        MASCSFSGHQALRRLRASAAAAASAALAAVATTPLLSSGTRTALIGTGSS
sp|P35568.1|                        ---------------------------MASPPE-----------------
sp|Q28224.1|                        ---------------------------MASPPE-----------------
sp|P35570.1|                        ---------------------------MASPPD-----------------
sp|P35569.1|                        ---------------------------MASP

In [123]:
count = AlignIO.convert("allOrg_CDS_prot_fromUniprot.aln", "clustal","allOrg_CDS_prot_fromUniprot.sth", "stockholm")

In [135]:
alignment = AlignIO.read("allOrg_CDS_prot_fromUniprot.sth", "stockholm")
calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(alignment)
# print(dm)

In [134]:
constructor = DistanceTreeConstructor() 
upgmatree = constructor.upgma(dm)
# print(upgmatree)

In [133]:
njtree = constructor.nj(dm)
# print(njtree)

In [130]:
Phylo.write([upgmatree, njtree],"phylotree_IRS1.nhx","newick")

2

In [131]:
Phylo.draw_ascii(njtree)

                                                 , sp|A7MBB8.1|
          _______________________________________|
         |                                       | sp|Q5RA30.1|
         |
         |               ___________________________________ sp|Q99KE3.1|
         |              |
         |              |                                    _ sp|Q2WGN9.1|
         |              |                                  _|
         |  ____________|                                 | |_ sp|Q9QYY0.2|
        _| |            |      ___________________________|
       | | |            |     |                           | __ sp|Q8WWW8.1|
       | | |            |     |                           ||
       | | |            |_____|                            |___ sp|Q13480.2|
       | | |                  |
       | | |                  |              _________________ sp|Q8TEW6.2|
       | | |                  |_____________|
       | | |                                |               __ sp|A6

In [15]:
# Phylo.draw_ascii(upgmatree)

In [1]:
# string link: https://string-db.org/cgi/network?taskId=bbOoq41xuF5p&sessionId=byBYhzUYg9Mw