## Searching for IRS1 sequence

In [3]:
from Bio import Entrez, SeqIO, Medline
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio import AlignIO
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
from Bio import Phylo
import requests, sys, json
import re

In [4]:
#pubmed search
database = 'PubMed'
word = 'irs1 insulin'
res= 15 
email= "karynalysenko@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text")
records = list(Medline.parse(handle))


for record in records[0:5]:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("Title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("Authors:", record.get("AU", "-"))
    print("Source:", record.get("SO", "-"))
    print("")

Title: Chordin like-1 regulates osteoblast and adipocyte differentiation through stabilizing insulin-like growth factor binding protein 3.
Authors: ['Sun H', 'Wang S', 'Yang Z', 'Tian L', 'Li X', 'Zhou J', 'Wang B']
Source: Stem Cells. 2023 Jan 22:sxad009. doi: 10.1093/stmcls/sxad009.

Title: HIIT Ameliorates Inflammation and Lipid Metabolism by Regulating Macrophage Polarization and Mitochondrial Dynamics in the Liver of Type 2 Diabetes Mellitus Mice.
Authors: ['Wang Y', 'Guo Y', 'Xu Y', 'Wang W', 'Zhuang S', 'Wang R', 'Xiao W']
Source: Metabolites. 2022 Dec 21;13(1):14. doi: 10.3390/metabo13010014.

Title: Sediment pollutant exposures caused hepatotoxicity and disturbed glycogenesis.
Authors: ['Lin MW', 'Yu XR', 'Chen JY', 'Wei YS', 'Chen HY', 'Tsai YT', 'Lin LH', 'Liao EC', 'Kung HY', 'Young SS', 'Chan HL', 'Chou HC']
Source: Ecotoxicol Environ Saf. 2023 Jan 19;251:114559. doi: 10.1016/j.ecoenv.2023.114559.

Title: Alpha-Mangosteen lessens high-fat/high-glucose diet and low-dose str

In [5]:
#pubmed search
database = 'PMC'
word = 'insulin signaling homo sapiens'
res= 10 
email= "karynalysenko@ua.pt"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text")
records = list(Medline.parse(handle))


for record in records:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("Title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("Authors:", record.get("AU", "-"))
    print("Source:", record.get("SO", "-"))
    print("")

Title: Potential mechanisms involved in regulating muscle protein turnover after acute exercise: A brief review.
Authors: ['Hajj-Boutros G', 'Karelis AD', 'Cefis M', 'Morais JA', 'Casgrain J', 'Gouspillou G', 'Sonjak V']
Source: Front Physiol. 2023 Jan 09;13:. doi:10.3389/fphys.2022.1106425.

Title: Regulation of autophagy, lipid metabolism, and neurodegenerative pathology by heparan sulfate proteoglycans.
Authors: ['Schultheis N', 'Becker R', 'Berhanu G', 'Kapral A', 'Roseman M', 'Shah S', 'Connell A', 'Selleck S']
Source: Front Genet. 2023 Jan 09;13:. doi:10.3389/fgene.2022.1012706.

Title: A host–gut microbial amino acid co-metabolite, p-cresol glucuronide, promotes blood–brain barrier integrity in vivo.
Authors: ['Stachulski AV', 'Knausenberger TBA', 'Shah SN', 'Hoyles L', 'McArthur S']
Source: Tissue Barriers. ;11(1):. doi:10.1080/21688370.2022.2073175.

Title: Resistin-like molecules: a marker, mediator and therapeutic target for multiple diseases.
Authors: ['Shi Y', 'Zhu N', 'Qi

In [6]:
#ncbi search to find the sequence of IRS1
database = 'nucleotide'
word = 'irs1 and homo sapiens and Chromosome 2 and not predicted and not unverified '
res= '15'
email= 'karyanlysenko@ua.pt'
Entrez.email= email
handle_search=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle_search)
handle_search.close()
idlist= record['IdList']
handle = Entrez.efetch(db=database, id=idlist, rettype="gb") 
records = list(SeqIO.parse(handle,"gb"))
handle.close()
for info in records:
    print(info.id, '-', info.description)
    #print('length of seq:', len(info.seq)) #to check the length of the sequences

NM_005544.3 - Homo sapiens insulin receptor substrate 1 (IRS1), mRNA
NM_001100818.2 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 2, mRNA
NM_001330158.2 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 5, mRNA
NM_001330157.2 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 4, mRNA
NM_017933.5 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 1, mRNA
NM_001330156.1 - Homo sapiens phosphotyrosine interaction domain containing 1 (PID1), transcript variant 3, mRNA
NG_015830.1 - Homo sapiens insulin receptor substrate 1 (IRS1), RefSeqGene on chromosome 2
CM000253.1 - Homo sapiens chromosome 2, whole genome shotgun sequence
CH471063.1 - Homo sapiens 211000035834619 genomic scaffold, whole genome shotgun sequence


The selection of the id has to be done manually as there is no pattern in writing the titles of the queries.\
The id __NG_015830.1__ is the only one where the annotated sequence is not a whole genome of the chromossome, is not a mRNA and actually is RefSeq. This means that the sequence is being used as a standard for well-characterized genes. So id __NG_015830.1__ will be used from now on.

In [7]:
#The correspondent information of NG_015830.1 was downloaded to a file 
import os
Entrez.email = "karynalysenko@ua.pt"
filename = "NG_015830_1.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NG_015830.1", rettype="gb", retmode="text")
    out_handle = open(filename, "w")
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()

In [8]:
record = SeqIO.read(open("NG_015830_1.gb"), format="genbank")
position=0
record_types={}
for x in record.features:
    record_types[x.type]=record_types.get(x.type,0)+1
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        position=i
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
#checking the location of the CDS on the original sequence
print("Location of the CDS on the original sequence: {}\n".format(record.features[position].location))
print("Associated Genbank protein info of CDS: {}".format("".join(record.features[position].qualifiers['protein_id'])))

The length of the sequence: 74474

Type of features: {'source': 1, 'gene': 2, 'mRNA': 2, 'exon': 2, 'CDS': 1, 'misc_feature': 46}

Comment from NCBI: REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from AC010735.11.
This sequence is a reference standard in the RefSeqGene project.
Summary: This gene encodes a protein which is phosphorylated by
insulin receptor tyrosine kinase. Mutations in this gene are
associated with type II diabetes and susceptibility to insulin
resistance. [provided by RefSeq, Nov 2009].

Location of the CDS on the original sequence: [5052:8781](+)

Associated Genbank protein info of CDS: NP_005535.1


In [9]:
CDS_nuc_seq_location=record.seq[int(record.features[position].location.start):int(record.features[position].location.end)]

In [10]:
#saving the CDS_nucleotides and CD_aminoacid seqs in files
filename = "NG_015830_1.gb"
filename_CDS_nucl = "CDS_nucleot_seq.fasta"
filename_CDS_prot = "CDS_prot_seq.fasta"
input_handle  = open(filename, "r")
output_handle_nucl = open(filename_CDS_nucl, "w")
output_handle_prot = open(filename_CDS_prot, "w")
for seq_record in SeqIO.parse(input_handle, "genbank"):
    protein_id="".join(record.features[position].qualifiers['protein_id'])
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq_location))
    output_handle_prot.write(">%s\n%s" % (protein_id,"".join(record.features[position].qualifiers['translation']))) #without join, output is a list
    
output_handle_nucl.close()
output_handle_prot.close()
input_handle.close()

## BLASTN - for all organisms

In [11]:
record_blastn = SeqIO.read(open("CDS_nucleot_seq.fasta"), format="fasta") 
print(len(record_blastn.seq))

3729


In [41]:
Blastn = NCBIWWW.qblast("blastn", "nt", record_blastn.seq) #not filtered for Homo sapiens
with open('blastn_CDS_nucleot_seq.xml', "w") as out_handle:
        out_handle.write(Blastn.read())
Blastn.close()

In [12]:
results_Blastn = open("blastn_CDS_nucleot_seq.xml")
blastn_records = NCBIXML.read(results_Blastn)
for parameter in blastn_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  XM_047444224
Definition:  PREDICTED: Homo sapiens insulin receptor substrate 1 (IRS1), transcript variant X2, mRNA
E-value:  0.0

Accession:  XM_047444223
Definition:  PREDICTED: Homo sapiens insulin receptor substrate 1 (IRS1), transcript variant X1, mRNA
E-value:  0.0

Accession:  NM_005544
Definition:  Homo sapiens insulin receptor substrate 1 (IRS1), mRNA
E-value:  0.0

Accession:  NG_015830
Definition:  Homo sapiens insulin receptor substrate 1 (IRS1), RefSeqGene on chromosome 2
E-value:  0.0

Accession:  AC010735
Definition:  Homo sapiens BAC clone RP11-395N3 from 2, complete sequence
E-value:  0.0


In [13]:
#filtering the "predicted" alignments
results_Blastn= open('blastn_CDS_nucleot_seq.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                # print( existe[0] )
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
                    #print(id)
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(list_filtered_alignments)
print('Total {} PREDICTED seqs found and remaining {} ids are from Homo sapiens'.format(count_preditc, count_homo))
#counting of Homo sapiens it's not conclusive, just to have an ideia, because the titles don't follow any pattern

['NM_005544', 'NG_015830', 'AC010735', 'S62539', 'BC053895', 'LT743046', 'KJ891488', 'AB384351', 'EU831611', 'EU831698', 'S85963', 'U43502']
Total 38 PREDICTED seqs found and remaining 8 ids are from Homo sapiens


In [12]:
with open('CDS_nucleotide_result_blast.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## BLASTP

In [14]:
record_blastp = SeqIO.read(open("CDS_prot_seq.fasta"), format="fasta") 
print(len(record_blastp.seq))

1242


In [15]:
Blastp = NCBIWWW.qblast("blastp", "swissprot", record_blastp.seq) #not filtered for Homo sapiens
with open('blastp_CDS_prot_seq.xml', "w") as out_handle:
        out_handle.write(Blastp.read())
Blastp.close()

In [15]:
results_Blastp = open("blastp_CDS_prot_seq.xml")
blastp_records = NCBIXML.read(results_Blastp)
for parameter in blastp_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  P35568
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1 [Homo sapiens]
E-value:  0.0

Accession:  Q28224
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1 [Chlorocebus aethiops]
E-value:  0.0

Accession:  P35570
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1; AltName: Full=pp185 [Rattus norvegicus]
E-value:  0.0

Accession:  P35569
Definition:  RecName: Full=Insulin receptor substrate 1; Short=IRS-1 [Mus musculus]
E-value:  0.0

Accession:  P84770
Definition:  RecName: Full=Insulin receptor substrate 1-B; Short=IRS1-B; Short=xIRS-1-B; AltName: Full=XIRS-L' [Xenopus laevis]
E-value:  0.0


In [3]:
#not necessary
#this part was adapted below
# results_Blastp= open('blastp_CDS_prot_seq.xml')
# blastp_records = NCBIXML.read(results_Blastp)
# E_VALUE_THRESH = 0.001
# list_filtered_alignments,list_species=[],[]
# for alignment in  blastp_records.alignments:
#     for hsp in alignment.hsps:
# #         print(hsp.identities)    # maybe add more 
#         if hsp.expect < E_VALUE_THRESH:
#             list_filtered_alignments.append(alignment.accession)
#             title_organism=re.search(r'\[.+\s.+\]', alignment.title)
#             if title_organism:
#                 m = re.match(r'\[.+\s.+\]', title_organism[0] )
#                 specie = m.group(0)
#                 #print(specie)
#                 list_species.append(specie)
# for x in sorted(set(list_species)):
#     print("number of times: {} that appeared specie: {}".format(list_species.count(x),x))

# print(set(list_filtered_alignments))
#print(len(list_filtered_alignments))

number of times: 1 that appeared specie: [Bos taurus]
number of times: 1 that appeared specie: [Chlorocebus aethiops]
number of times: 1 that appeared specie: [Drosophila ananassae]
number of times: 1 that appeared specie: [Drosophila erecta]
number of times: 1 that appeared specie: [Drosophila melanogaster]
number of times: 1 that appeared specie: [Drosophila sechellia]
number of times: 1 that appeared specie: [Drosophila yakuba]
number of times: 9 that appeared specie: [Homo sapiens]
number of times: 1 that appeared specie: [Mesocricetus auratus]
number of times: 6 that appeared specie: [Mus musculus]
number of times: 2 that appeared specie: [Rattus norvegicus]
number of times: 4 that appeared specie: [Xenopus laevis]
number of times: 2 that appeared specie: [Xenopus tropicalis]
{'B4NZ70', 'Q9Z0Y7', 'P35570', 'Q9XTN2', 'Q9Y4H2', 'Q5RJW5', 'Q99PF6', 'P35569', 'P81122', 'Q91615', 'B3MPN6', 'Q6P4Y6', 'P84770', 'B4HWI2', 'Q99KE3', 'A6QLU3', 'Q9DF49', 'Q9Z1S8', 'O14654', 'Q8WWW8', 'Q13480

In [16]:
#FINAL
#has to be apart because of NCBIXML.read() and not parse
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records_r = NCBIXML.read(results_Blastp)
first=blastp_records_r.alignments[0]
len_max_seq=first.hsps[0].align_length
print(len_max_seq)

1242


In [14]:
#função não aplicável por causa do resto do código, que está construído para listas (coverage e identity)
# def read_xml(filename):
#     results_Blastp= open(filename)
#     blastp_records = NCBIXML.parse(results_Blastp)
#     alltog=[]

#     for alignment in  blastp_records:
#         for a in alignment.alignments:
#             for b in a.hsps:
#                 list_accession,list_identities,list_coverage_start, list_coverage_stop,list_alignLen=[],[],[],[],[]
#                 list_accession.append(a.accession)
#                 list_coverage_start.append(b.sbjct_start)
#                 list_coverage_stop.append(b.sbjct_end)
#                 list_identities.append(b.identities)
#                 list_alignLen.append(b.align_length)
#             alltog.append(list_accession+list_coverage_start+list_coverage_start+list_coverage_stop+list_identities+list_alignLen)
#     return alltog 
# print(read_xml('blastp_CDS_prot_seq.xml')[0][0])

In [17]:
#FINAL
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records = NCBIXML.parse(results_Blastp)
list_accession,list_identities,list_coverage_start, list_coverage_stop,list_alignLen=[],[],[],[],[]
list_organism=[]
E_VALUE_THRESH = 0.001
for alignment in  blastp_records:
    for a in alignment.alignments:
        for b in a.hsps:
            if b.expect < E_VALUE_THRESH:
                title_organism=re.search(r'\[.+\s.+\]', a.title)
                if title_organism:
                    m = re.match(r'\[(.+\s.+)\]', title_organism[0] )
                    list_accession.append(a.accession)
                    list_coverage_start.append(b.sbjct_start)
                    list_coverage_stop.append(b.sbjct_end)
                    list_identities.append(b.identities)
                    list_alignLen.append(b.align_length)
                    list_organism.append(m.group(1))
            #print(a.accession, b.align_length, b.identities, b.expect, b.sbjct_start, b.sbjct_end)
            
#calculation of %identity                        
def identity(list_i, list_a):
    perc_list_identities=[]
    for index in range(len(list_i)):
        perc_list_identities.append(round(list_i[index]*100/list_a[index]))
    return perc_list_identities

#calculation of the %coverage - for all hsps of all accessions
def coverage(list_end,list_start):
    perc_list_coverage=[]
    for index in range(len(list_identities)):
        perc_list_coverage.append(round((1+list_end[index]-list_start[index])/len_max_seq*100))
    return perc_list_coverage

#selecting the max coverage and max identity per hsps/accession
def max_values_by_id(ids, covs, idents): 
    unique_ids = []
    max_coverage = []
    max_identity = []
    for id, cov, ident in zip(ids, covs,idents):
        if id in unique_ids:
            idx = unique_ids.index(id)
            max_coverage[idx] = max(max_coverage[idx], cov)
            max_identity[idx] = max(max_identity[idx], ident)
        else:
            unique_ids.append(id)
            max_coverage.append(cov)
            max_identity.append(ident)
    return unique_ids, max_coverage, max_identity

beta_max=max_values_by_id(list_accession, coverage(list_coverage_stop, list_coverage_start),identity(list_identities,list_alignLen))


In [18]:
#FINAL
thresold_coverage=50
thresold_identity=50
print('___Thresholds___')
print('coverage: {}%   |'.format(thresold_coverage))
print('identity: {}%   |'.format(thresold_identity))
print('e-value: {}% |\n'.format(E_VALUE_THRESH))
print('{:>5}{:>14}{:>11}{:>15}'.format('ID','%coverage','%identity', 'Organism'))
count=0
filtragem= []
for id, cov, ident, specie in zip(beta_max[0], beta_max[1], beta_max[2], list_organism):
    if int(cov)>thresold_coverage and ident>thresold_identity:
        if int(cov)>100:
            print('|{:>5}|{:>9}*{:>9}{:>24}'.format(id, cov, ident, specie))
            count+=1
        else:
            print('|{:>5}|{:>9}{:>10}{:>21}'.format(id, cov, ident, specie))
        filtragem.append(f"{id} {specie}")
  
if count>0:
    print('\n* means that the subject sequence is longer than the query sequence')

___Thresholds___
coverage: 50%   |
identity: 50%   |
e-value: 0.001% |

   ID     %coverage  %identity       Organism
|P35568|      100       100         Homo sapiens
|Q28224|      101*       97    Chlorocebus aethiops
|P35570|       99        89    Rattus norvegicus
|P35569|       99        88         Mus musculus
|P84770|       87        58       Xenopus laevis
|Q91615|       71        55       Xenopus laevis

* means that the subject sequence is longer than the query sequence


In [19]:
#saving previous list but filtered, without repeting same organisms
lista_sem_repetidos= []
seen = set()
for x in filtragem:
    parts = x.split()
    organi = parts[1] + '_' + parts[2]    
    if organi in seen:
        filtragem.remove(x)
    else:
        seen.add(organi)
        lista_sem_repetidos.append(x)      
# print(lista_sem_repetidos)

lista_ids = []
with open('CDS_protein_result_blastp.txt', 'w') as f:
    for x in lista_sem_repetidos:
        IDS = x.split()
        lista_ids.append(IDS[0])
        f.write(f"{IDS[0]}\n")
# print(lista_ids)

In [20]:
print(lista_sem_repetidos)
print(filtragem)

['P35568 Homo sapiens', 'Q28224 Chlorocebus aethiops', 'P35570 Rattus norvegicus', 'P35569 Mus musculus', 'P84770 Xenopus laevis']
['P35568 Homo sapiens', 'Q28224 Chlorocebus aethiops', 'P35570 Rattus norvegicus', 'P35569 Mus musculus', 'P84770 Xenopus laevis']


The output of Blastp gave more hits. So the list of ids of the last script will be saved and used on Uniprot search.

## Uniprot search of Blastp results

In [21]:
def get_url(url, **kwargs):
    response = requests.get(url, **kwargs);
    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()
    return response

In [48]:
#ORIGINAL -- WORKING ON THIS BELOW
#saving sequences from Uniprot
# results_Blastp= open('blastp_CDS_prot_seq.xml')
# blastp_records = NCBIXML.read(results_Blastp)
# titles_list=[]
# for alignment in  blastp_records.alignments:   
#     titles_list.append(alignment.title)

# file= open("CDS_protein_result_blastp.txt", "r")
# fields="sequence"
# WEBSITE_API="https://rest.uniprot.org"
# seqs=[]

# for i in file:
#     r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
#     seqs.append(str(r.content))

# with open('allOrg_CDS_prot_fromUniprot.fasta', 'w') as f:
#     for index, seq in enumerate(seqs):
#         existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
#         if existe:
#             m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
#             f.write(f">{titles_list[index]}\n{m.group(1)}\n\n")                    
#             print(m.group(1))       

In [71]:
#OLD
# file= open("CDS_protein_result_blastp.txt", "r")
# fields="accession,organism_name,protein_name,cc_subcellular_location,cc_function"
# WEBSITE_API="https://rest.uniprot.org"
# with open('uniprot_result_CDS_filtered.txt', 'w',encoding='utf-8') as f:
#     for i in file:
#         r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
#         print(r.text)
#         f.write(r.text)
#         f.write('\n')

In [22]:
a=", ".join(lista_sem_repetidos)
b=a.replace(" ","_")
c=b.split(",")
print(c[0])

P35568_Homo_sapiens


In [23]:
fileUniprot= open("CDS_protein_result_blastp.txt", "r")
fields="sequence"
WEBSITE_API="https://rest.uniprot.org"
seqs=[]

for i in fileUniprot:
    r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
    seqs.append(str(r.content))

with open('allOrg_CDS_prot_fromUniprot.fasta', 'w') as f:
    for index, seq in enumerate(seqs):
        existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
        if existe:
            m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
            f.write(f">{c[index]}\n{m.group(1)}\n\n")                    
#             print(m.group(1))

In [24]:
fileUniprot= open("CDS_protein_result_blastp.txt", "r")
fields=['accession','organism_name','protein_name','cc_function','cc_subcellular_location']
WEBSITE_API="https://rest.uniprot.org"

def get_field_for_id(ID_PROT, field): 
    response = get_url("{}/uniprotkb/search?query={}&fields={}&size=1&format=tsv".format(WEBSITE_API,ID_PROT,field))
    return str(response.content)

def get_info_uniprot(fileUniprot):
    results = []
    result = []
    tmp = []
    uniprot_final_list=[]
    
    for first_index in fileUniprot:
        tmp= []
        for field in fields:
            result = get_field_for_id(first_index, field)
            tmp.append(result)
        results.append(tmp)
#     print(results)
    for ind in results:
        uniprot_list=[]
        for i in ind:
            entry = re.search(r'b\'Entry\\n(.+?(?=\\n\'))', str(i), re.DOTALL)
            organism = re.search(r'b\'Organism\\n(.+?(?=\\n\'))', str(i), re.DOTALL)
            protein = re.search(r'Protein names\\n(.+?(?=\\n))', str(i), re.DOTALL)
            function = re.search(r'Function \[CC\]\\n.{9} (.+?(?=\\n))', str(i), re.DOTALL )
            location_exist = re.search(r'b\'Subcellular location \[CC\]\\nSUBCELLULAR LOCATION: (.+?(?=\\n\'))', str(i), re.DOTALL )
            location_notexist = re.search( r'b\'Subcellular location \[CC\]\\n\\n\'',str(i), re.DOTALL )   
            if entry:
                uniprot_list.append(entry.group(1))
            if organism:
                uniprot_list.append(organism.group(1))
            if protein:
                uniprot_list.append(protein.group(1))
            if location_exist:
                uniprot_list.append(+location_exist.group(1))
            if location_notexist:
                uniprot_list.append('N/A')
            if function:
                uniprot_list.append(function.group(1))
        uniprot_final_list.append(uniprot_list)
    return uniprot_final_list

In [73]:
get_info_uniprot(fileUniprot)

[['P35568',
  'Homo sapiens (Human)',
  'Insulin receptor substrate 1 (IRS-1)',
  'May mediate the control of various cellular processes by insulin. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains such as phosphatidylinositol 3-kinase p85 subunit or GRB2. Activates phosphatidylinositol 3-kinase when bound to the regulatory p85 subunit (By similarity). {ECO:0000250, ECO:0000269|PubMed:16878150}.',
  'N/A'],
 ['Q28224',
  'Chlorocebus aethiops (Green monkey) (Cercopithecus aethiops)',
  'Insulin receptor substrate 1 (IRS-1)',
  'May mediate the control of various cellular processes by insulin. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains such as phosphatidylinositol 3-kinase p85 subunit or GRB2. Activates phosphatidylinositol 3-kinase when bound to the regulatory p85 subunit (By similarity). {ECO:0000250}.',
  'N/A'],
 ['P35570',
  'Rattus norvegicus (R

In [25]:
fileUniprot= open("CDS_protein_result_blastp.txt", "r")
allInfo=get_info_uniprot(fileUniprot)
for index in allInfo:
    print('Id: {}\nOrganism: {}\nProtein name: {}\nSubcelular location: {}\nFunction:  {}\n\n'.format(index[0], index[1], index[2], index[4], index[3]))
    

Id: P35568
Organism: Homo sapiens (Human)
Protein name: Insulin receptor substrate 1 (IRS-1)
Subcelular location: N/A
Function:  May mediate the control of various cellular processes by insulin. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains such as phosphatidylinositol 3-kinase p85 subunit or GRB2. Activates phosphatidylinositol 3-kinase when bound to the regulatory p85 subunit (By similarity). {ECO:0000250, ECO:0000269|PubMed:16878150}.


Id: Q28224
Organism: Chlorocebus aethiops (Green monkey) (Cercopithecus aethiops)
Protein name: Insulin receptor substrate 1 (IRS-1)
Subcelular location: N/A
Function:  May mediate the control of various cellular processes by insulin. When phosphorylated by the insulin receptor binds specifically to various cellular proteins containing SH2 domains such as phosphatidylinositol 3-kinase p85 subunit or GRB2. Activates phosphatidylinositol 3-kinase when bound to the regulatory p85 subun

## Alignment and Phylo

In [67]:
#Clustalw
dir = r'C:\Program Files (x86)\ClustalW2\clustalw2'
in_file = r'allOrg_CDS_prot_fromUniprot.fasta'

clustalw_cline = ClustalwCommandline(dir, infile=in_file)
clustalw_cline()

('\n\n\n CLUSTAL 2.1 Multiple Sequence Alignments\n\n\nSequence format is Pearson\nSequence 1: P35568_Homo_sapiens          1242 aa\nSequence 2: Q28224_Chlorocebus_aethiops  1251 aa\nSequence 3: P35570_Rattus_norvegicus     1235 aa\nSequence 4: P35569_Mus_musculus          2335 aa\nStart of Pairwise alignments\nAligning...\n\nSequences (1:2) Aligned. Score:  97\nSequences (1:3) Aligned. Score:  90\nSequences (1:4) Aligned. Score:  89\nSequences (2:3) Aligned. Score:  90\nSequences (2:4) Aligned. Score:  88\nSequences (3:4) Aligned. Score:  97\nGuide tree file created:   [allOrg_CDS_prot_fromUniprot.dnd]\n\nThere are 3 groups\nStart of Multiple Alignment\n\nAligning...\nGroup 1: Sequences:   2      Score:26736\nGroup 2: Sequences:   2      Score:26436\nGroup 3: Sequences:   4      Score:25319\nAlignment Score 43037\n\nCLUSTAL-Alignment file created  [allOrg_CDS_prot_fromUniprot.aln]\n\n',
 '')

In [70]:
cline = ClustalwCommandline("clustalw", infile="allOrg_CDS_prot_fromUniprot.fasta", outfile="allOrg_CDS_prot_fromUniprot.aln")

In [69]:
align = AlignIO.read("allOrg_CDS_prot_fromUniprot_teste.aln", "clustal")
print("Número de linhas: %i" % len(align)) 
print(format(align, "clustal"))

subs = align.substitutions
# print(subs)

FileNotFoundError: [Errno 2] No such file or directory: 'allOrg_CDS_prot_fromUniprot_teste.aln'

In [496]:
count = AlignIO.convert("allOrg_CDS_prot_fromUniprot.aln", "clustal","allOrg_CDS_prot_fromUniprot.sth", "stockholm")

In [497]:
alignment = AlignIO.read("allOrg_CDS_prot_fromUniprot.sth", "stockholm")
calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(alignment)
# print(dm)

In [498]:
constructor = DistanceTreeConstructor() 
upgmatree = constructor.upgma(dm)
# print(upgmatree)

In [499]:
njtree = constructor.nj(dm)
# print(njtree)

In [500]:
Phylo.write([upgmatree, njtree],"phylotree_IRS1.nhx","newick")

2

In [501]:
Phylo.draw_ascii(njtree)

         _ sp|P35568.1|
  ______|
 |      |_ sp|Q28224.1|
 |
_|       _ sp|P35569.1|
 |______|
 |      |_ sp|P35570.1|
 |
 |_______________________________________________________________ sp|P84770.1|



In [15]:
# Phylo.draw_ascii(upgmatree)

In [1]:
# string link: https://string-db.org/cgi/network?taskId=bbOoq41xuF5p&sessionId=byBYhzUYg9Mw