In [2]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
import requests, sys, json
import re
from Bio import SeqIO
from Bio import Entrez
from Bio import Medline
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SearchIO
from Bio.SwissProt import KeyWList
from Bio import SwissProt
from Bio import ExPASy
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

## Searching for literature for GLI2 gene

In [7]:
database = "PubMed"
word = 'GLI2'
res= int('30')
email= "rodrigoce9@gmail.com"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, --retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text")
records = Medline.parse(handle)


for record in records:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("title:", record.get("TI", "-"))
    #print('abstract:', record.get('AB', '-'))
    print("authors:", record.get("AU", "-"))
    print("source:", record.get("SO", "-"))
    print("")

title: c-Jun phosphorylated by JNK is required for protecting Gli2 from proteasomal-ubiquitin degradation by PGE2-JNK signaling axis.
authors: ['Yang J', 'Wang J', 'Zhang Y', 'Huang W', 'Zhang S', 'Yin P', 'Tan W']
source: Biochim Biophys Acta Mol Cell Res. 2022 Dec 26:119418. doi: 10.1016/j.bbamcr.2022.119418.

title: Expression of Indian hedgehog signaling in murine oviductal infundibulum and its relationship with epithelial homeostasis.
authors: ['Hosotani M', 'Ichii O', 'Namba T', 'Masum MA', 'Nakamura T', 'Hasegawa Y', 'Watanabe T', 'Kon Y']
source: Cell Tissue Res. 2022 Dec 29. doi: 10.1007/s00441-022-03722-w.

title: LOXL2 reduces 5-FU sensitivity through the Hedgehog/BCL2 signaling pathway in colorectal cancer.
authors: ['Qiu Z', 'Qiu S', 'Mao W', 'Lin W', 'Peng Q', 'Chang H']
source: Exp Biol Med (Maywood). 2022 Dec 27:15353702221139203. doi: 10.1177/15353702221139203.

title: Icariin Treatment Rescues Diabetes Induced Bone Loss via Scavenging ROS and Activating Primary Cilia/

## Searching for GLI2 sequence

In [3]:
database = 'nucleotide'
word = 'GLI2 and homo sapiens and Chromosome 2 and not predicted and not unverified '
res= '15'
email= 'rodrigoce9@gmail.com'
Entrez.email= email
handle_search=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle_search)
handle_search.close()
idlist= record['IdList']

In [4]:
handle = Entrez.efetch(db=database, id=idlist, rettype="gb") 
records = list(SeqIO.parse(handle,"gb"))
handle.close()
for info in records:
    print(info.id, '-', info.description)
    #print('length of seq:', len(info.seq)) #to check the length of the sequences

NM_001374354.1 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 4, mRNA
NM_001374353.1 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 3, mRNA
NM_001371271.1 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 1, mRNA
NM_005270.5 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 2, mRNA
NM_003743.5 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 1, mRNA
NM_001362950.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 4, mRNA
NM_001362952.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 5, mRNA
NM_147223.3 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 2, mRNA
NM_001362954.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 6, mRNA
NM_001362955.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 7, mRNA
NM_147233.2 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), 

The selection of the id has to be done manually as there is no pattern in writing the titles of the queries.\
The id __NG_009030.2__ is the only one where the annotated sequence is in chromosome 2, is not a mRNA and actually is RefSeq. This means that the sequence is being used as a standard for well-characterized genes. So id __NG_009030.2__ will be used from now on.

In [6]:
#The correspondent information of NG_015830.1 was downloaded to a file
import os
Entrez.email = "rodrigoce9@gmail.com"
filename = "NG_009030.2.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NG_009030.2", rettype="gb", retmode="text")
    out_handle = open(filename, "w")
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()

In [7]:
record = SeqIO.read(open("NG_009030.2.gb"), format="genbank")
position=0
record_types={}
for x in record.features:
    x.
    record_types[x.type]=record_types.get(x.type,0)+1
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        position=i
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
#checking the location of the CDS on the original sequence
print("Location of the CDS on the original sequence: {}".format(record.features[position].location))

The length of the sequence: 263786

Type of features: {'source': 1, 'gene': 1, 'mRNA': 1, 'exon': 14, 'CDS': 1, 'misc_feature': 17}

Comment from NCBI: REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from AC018866.9, AC017033.5,
KF510752.1, KF510212.1 and AC016764.8.
This sequence is a reference standard in the RefSeqGene project.
On Sep 14, 2022 this sequence version replaced NG_009030.1.
Summary: This gene encodes a protein which belongs to the C2H2-type
zinc finger protein subclass of the Gli family. Members of this
subclass are characterized as transcription factors which bind DNA
through zinc finger motifs. These motifs contain conserved H-C
links. Gli family zinc finger proteins are mediators of Sonic
hedgehog (Shh) signaling and they are implicated as potent
oncogenes in the embryonal carcinoma cell. The protein encoded by
this gene localizes to the cytoplasm and activates patched
Drosophila homolog (PTCH) gene expression. It is al

In [8]:
CDS_nuc_seq=record.seq[250000:260000]

In [9]:
#saving the CDS_nucleotides and CD_aminoacid seqs in files
filename_CDS_nucl = "CDS_nucleot_GLI2_seq.fasta"
filename_CDS_prot = "CDS_prot_GLI2_seq.fasta"
input_handle  = open(filename, "r")
output_handle_nucl = open(filename_CDS_nucl, "w")
output_handle_prot = open(filename_CDS_prot, "w")
for seq_record in SeqIO.parse(input_handle, "genbank") :
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq))
    output_handle_prot.write(">\n%s" % ("".join(record.features[position].qualifiers['translation']))) #without join, output is a list
    
output_handle_nucl.close()
output_handle_prot.close()
input_handle.close()

## BLASTN - for all organisms

In [None]:
record = SeqIO.read(open("CDS_nucleot_GLI2_seq.fasta"), format="fasta") 
print (len(record.seq))

In [None]:
Blast = NCBIWWW.qblast("blastn", "nt", record.seq)
with open('blast_DNA_GLI2_file.xml', "w") as out_handle:
        out_handle.write(Blast.read())
Blast.close()

In [15]:
results_Blastn = open("blast_DNA_GLI2_file.xml")
blastn_records = NCBIXML.read(results_Blastn)
for parameter in blastn_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  NG_009030
Definition:  Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
E-value:  0.0
E-value:  1.23303e-94
E-value:  2.71593e-90
E-value:  4.03079e-88
E-value:  1.71394e-86
E-value:  5.98222e-86
E-value:  2.54371e-84
E-value:  8.87841e-84
E-value:  8.87841e-84
E-value:  8.87841e-84
E-value:  3.09887e-83
E-value:  3.77519e-82
E-value:  4.59913e-81
E-value:  4.59913e-81
E-value:  1.60525e-80
E-value:  5.60289e-80
E-value:  2.38241e-78
E-value:  1.23412e-75
E-value:  4.30749e-75
E-value:  4.30749e-75
E-value:  1.50346e-74
E-value:  1.50346e-74
E-value:  1.50346e-74
E-value:  5.2476e-74
E-value:  6.39289e-73
E-value:  6.39289e-73
E-value:  2.23134e-72
E-value:  2.23134e-72
E-value:  9.48789e-71
E-value:  1.40813e-68
E-value:  3.78187e-44
E-value:  3.78187e-44
E-value:  5.61279e-42
E-value:  5.61279e-42
E-value:  3.54206e-38
E-value:  5.9981e-29
E-value:  3.54519e-19
E-value:  4.31892e-18
E-value:  4.04506e-12
E-value:  4.92789e-11
E-value:  6.0034e-10


In [11]:
E_VALUE_THRESH = 0.001
results_Blast= open('blast_DNA_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    print (alignment)
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print('****Alignment****')
            print('sequence: ', alignment.title)
            print('lenght:', alignment.length)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')
            print()
            break

gi|2301838502|ref|NG_009030.2| Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
           Length = 263786

****Alignment****
sequence:  gi|2301838502|ref|NG_009030.2| Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
lenght: 263786
TTGATCCATTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAG...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
TTGATCCATTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAG...

gi|18097434|gb|AC016764.8| Homo sapiens BAC clone RP11-549G13 from 2, complete sequence
           Length = 155868

****Alignment****
sequence:  gi|18097434|gb|AC016764.8| Homo sapiens BAC clone RP11-549G13 from 2, complete sequence
lenght: 155868
TTGATCCATTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAG...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
TTGATCCATTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAG.

In [30]:
#filtering the "predicted" alignments
results_Blastn= open('blast_DNA_GLI2_file.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                # print( existe[0] )
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
                    #print(id)
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(set(list_filtered_alignments))
print('Total {} PREDICTED seqs found.'.format(count_preditc))
#counting of Homo sapiens it's not conclusive, just to have an ideia, because the titles don't follow any pattern

{'AB007295', 'NG_009030', 'AB007296', 'AC108019', 'NG_046987', 'AC004262', 'NM_001354259', 'NG_096463', 'AC008870', 'CP068260', 'AB209354', 'AC015878', 'AB007297', 'CP068259', 'D14828', 'AC279654', 'NM_001371271', 'NM_001374354', 'AC016764', 'BX295541', 'CP068262', 'AP002453', 'AP023476', 'AH001472', 'NG_047146', 'AL356212', 'D14827', 'AC008983', 'NM_005270', 'AC011774', 'BC111410', 'NM_001374353', 'AL611925', 'NG_016599', 'NG_011776', 'DQ086814', 'AP023478', 'AP023479', 'AL121658', 'AC012317', 'AB007298', 'AC090427', 'AP001972', 'DQ314865', 'NG_028118', 'AC279566'}
Total 16 PREDICTED seqs found.


In [18]:
with open('CDS_GLI2_nucleotide_result_blast.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## BLASTP

In [None]:
record = SeqIO.read(open("10001bp_sequence_GLI2.gb"), format="gb") 
print (len(record.seq))

In [13]:
Trans=[]
record = SeqIO.read(open("10001bp_sequence_GLI2.gb"), format="gb") 
for feat in record.features:
    if feat.type == "CDS":
        Trans.append(str(feat.qualifiers['translation']))
        print(feat.qualifiers['translation'])

['METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHNVNPGPLPPCADRRGLRLQSHPST

In [None]:
result_handle = NCBIWWW.qblast("blastp", "swissprot", Trans)

In [None]:
with open('blast_PROT_GLI2_file.xml', "w") as out_handle:
        out_handle.write(result_handle.read())
result_handle.close()

In [2]:
results_Blastp = open("blast_PROT_GLI2_file.xml")
blastp_records = NCBIXML.read(results_Blastp)
for parameter in blastp_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  P10070
Definition:  RecName: Full=Zinc finger protein GLI2; AltName: Full=GLI family zinc finger protein 2; AltName: Full=Tax helper protein [Homo sapiens]
E-value:  0.0

Accession:  Q0VGT2
Definition:  RecName: Full=Zinc finger protein GLI2; AltName: Full=Tax helper protein [Mus musculus]
E-value:  0.0

Accession:  Q91661
Definition:  RecName: Full=Zinc finger protein GLI4; AltName: Full=Neural-specific DNA-binding protein xGLI4; Short=xGLI-4 [Xenopus laevis]
E-value:  0.0

Accession:  Q9IA31
Definition:  RecName: Full=Transcriptional activator GLI3; AltName: Full=GLI3 full-length protein; Short=GLI3FL; Contains: RecName: Full=Transcriptional repressor GLI3R; AltName: Full=GLI3 C-terminally truncated form [Gallus gallus]
E-value:  0.0
E-value:  1.2649e-18

Accession:  P55879
Definition:  RecName: Full=Zinc finger protein GLI2 [Gallus gallus]
E-value:  0.0


In [19]:
results_Blastp= open('blast_PROT_GLI2_file.xml')
blastp_records = NCBIXML.read(results_Blastp)
E_VALUE_THRESH = 0.001
list_filtered_alignments,list_species=[],[]
for alignment in  blastp_records.alignments:
    for hsp in alignment.hsps:
#         print(hsp.identities)    # maybe add more 
        if hsp.expect < E_VALUE_THRESH:
            list_filtered_alignments.append(alignment.accession)
            title_organism=re.search(r'\[.+\s.+\]', alignment.title)
            if title_organism:
                m = re.match(r'\[.+\s.+\]', title_organism[0] )
                specie = m.group(0)
                #print(specie)
                list_species.append(specie)
for x in sorted(set(list_species)):
    print("number of times: {} that appeared specie: {}".format(list_species.count(x),x))

print(list_filtered_alignments)

number of times: 5 that appeared specie: [Bos taurus]
number of times: 1 that appeared specie: [Caenorhabditis briggsae]
number of times: 1 that appeared specie: [Caenorhabditis elegans]
number of times: 4 that appeared specie: [Danio rerio]
number of times: 3 that appeared specie: [Drosophila melanogaster]
number of times: 1 that appeared specie: [Drosophila yakuba]
number of times: 5 that appeared specie: [Gallus gallus]
number of times: 21 that appeared specie: [Homo sapiens]
number of times: 14 that appeared specie: [Mus musculus]
number of times: 2 that appeared specie: [Pan troglodytes]
number of times: 17 that appeared specie: [Xenopus laevis]
number of times: 6 that appeared specie: [Xenopus tropicalis]
['P10070', 'Q0VGT2', 'Q91661', 'Q9IA31', 'Q9IA31', 'P55879', 'Q91660', 'Q91660', 'P10071', 'P10071', 'Q5IS56', 'Q5IS56', 'Q61602', 'Q61602', 'Q91690', 'Q91690', 'P55878', 'P08151', 'P47806', 'P47806', 'P19538', 'Q8NEA6', 'Q8NEA6', 'Q6XP49', 'Q6XP49', 'Q8NBF1', 'Q8K1M4', 'P34708'

In [20]:
with open('CDS_GLI2_protein_result_blastp.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## Uniprot search of Blastp results

In [None]:
def get_url(url, **kwargs):
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

In [16]:
lista=[]
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    print (a)
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SeqIO.read(handle, "swiss")
        lista.append(len(seq_record.seq))
print(min(lista))
print (lista)

334
[1586, 1544, 1361, 1544, 663, 1569, 1580, 1580, 1583, 1360, 556, 1106, 1111, 1397, 775, 780, 620, 789, 1110, 1165, 492, 524, 521, 384, 341, 515, 403, 334, 622, 530, 663, 467, 497, 503, 532, 530, 466, 444, 443, 447, 447, 609, 441, 441, 613, 565, 638, 567, 623, 753]


In [20]:
lista=[]
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SwissProt.read(handle)
        b= (seq_record.entry_name)
        lista.append(b)
print(lista)
    

['GLI2_HUMAN', 'GLI2_MOUSE', 'GLI4_XENLA', 'GLI3_CHICK', 'GLI2_CHICK', 'GLI3_XENLA', 'GLI3_HUMAN', 'GLI3_PANTR', 'GLI3_MOUSE', 'GLI1_XENLA', 'GLI1_CHICK', 'GLI1_HUMAN', 'GLI1_MOUSE', 'CI_DROME', 'GLIS3_HUMAN', 'GLIS3_MOUSE', 'GLIS1_HUMAN', 'GLIS1_MOUSE', 'TRA1_CAEEL', 'TRA1_CAEBR', 'GLIS2_XENLA', 'GLIS2_HUMAN', 'GLIS2_MOUSE', 'GLIS2_DROME', 'ZIC4_MOUSE', 'ZIC5_XENLA', 'CI_DROYA', 'ZIC4_HUMAN', 'ZIC5_MOUSE', 'ZIC4_XENLA', 'ZIC5_HUMAN', 'ZIC3_HUMAN', 'ZIC2B_XENLA', 'ZIC2A_XENLA', 'ZIC2_HUMAN', 'ZIC2_MOUSE', 'ZIC3_MOUSE', 'ZIC1_CHICK', 'ZIC1_XENLA', 'ZIC1_HUMAN', 'ZIC1_MOUSE', 'OPA_DROME', 'ZIC3_XENTR', 'ZIC3_XENLA', 'ZN143_BOVIN', 'ZN143_XENLA', 'ZN143_HUMAN', 'ZN143_XENTR', 'ZN143_DANRE', 'MTF1_HUMAN']


In [22]:
lista=[]
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    lista.append(a)
print(lista)

['P10070', 'Q0VGT2', 'Q91661', 'Q9IA31', 'P55879', 'Q91660', 'P10071', 'Q5IS56', 'Q61602', 'Q91690', 'P55878', 'P08151', 'P47806', 'P19538', 'Q8NEA6', 'Q6XP49', 'Q8NBF1', 'Q8K1M4', 'P34708', 'Q17308', 'Q98T94', 'Q9BZE0', 'Q8VDL9', 'Q7K0S9', 'Q61467', 'Q9IB89', 'O77027', 'Q8N9L1', 'Q7TQ40', 'A0JC51', 'Q96T25', 'O60481', 'Q9YIB7', 'Q91689', 'O95409', 'Q62520', 'Q62521', 'Q8JJC0', 'O73689', 'Q15915', 'P46684', 'P39768', 'Q6DJQ6', 'O57311', 'A6QQW0', 'Q91853', 'P52747', 'Q58DZ6', 'Q1LYE3', 'Q14872']


In [5]:
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
with open('BLASTp_new.fasta', 'w') as f:
    for alignment in  blast_records.alignments:
        #print (alignment.hit_id)
        m = alignment.hit_id.replace("sp|","")
        a=m[0:6]
        #print (a)
        with ExPASy.get_sprot_raw(a) as handle:
            seq_record = SeqIO.read(handle, "swiss")
            #print(f'>{seq_record.id}')
            #print(seq_record.seq)
            f.write(f'>{seq_record.id}\n{seq_record.seq}\n\n')
            

           

>P10070
METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQGQVSGHGSCGCALPLSQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHN

>P10071
MEAQSHSSTTTEKKKVENSIVKCSTRTDVSEKAVASSTTSNEDESPGQTYHRERRNAITMQPQNVQGLSKVSEEPSTSSDERASLIKKEIHGSLPHVAEPSVPYRGTVFAMDPRNGYMEPHYHPPHLFPAFHPPVPIDARHHEGRYHYDPSPIPPLHMTSALSSSPTYPDLPFIRISPHRNPTAASESPFSPPHPYINPYMDYIRSLHSSPSLSMISATRGLSPTDAPHAGVSPAEYYHQMALLTGQRSPYADIIPSAATAGTGAIHMEYLHAMDSTRFSSPRLSARPSRKRTLSISPLSDHSFDLQTMIRTSPNSLVTILNNSRSSSSASGSYGHLSASAISPALSFTYSSAPVSLHMHQQILSRQQSLGSAFGHSPPLIHPAPTFPTQRPIPGIPTVLNPVQVSSGPSESSQNKPTSESAVSSTGDPMHNKRSKIKPDEDLPSPGARGQQEQPEGTTLVKEEGDKDESKQEPEVIYETNCHWEGCAREFDTQEQLVHHINNDHIHGEKKEFVCRWLDCSREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCTKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYVCKIPGCTKRYTDPSSLRKHVKTVHGPEAHVTKKQRGDIHPRPPPPRDSGSHSQSRSPGRPTQGALGEQQDLSNTTSKREECLQVKTVKAEKPMTSQPSPGGQSSCSSQQSPISNYSNSGLELPLTDGGSIGDLSAIDETPIMDSTISTATTALALQARRNPAGTKWMEHVKLERLKQVNGMFPRLNPILPPKAPAVSPLIGNGTQSNNTCSLGGPMTLLPGRSDLSGVDVTMLNMLNRRDSSASTISSAYLSSRRSSGISPCFSSRRSSEASQAEGRPQNVSVADSYDPISTDASRRSSEASQSDGLPSLLSLTPAQQYRLKAKYAAATGGPPPTPLPNMERMSLKTRLALLGDALEPGVALPPVHAPRRCSDGGAHGYGRRHL

>P19538
MDAYALPTYFPLAYSELQFLASRRAAAVAAAATVLPGSPCINQHHPTDVSSSVTVPSIIPTGGTSDSIKTSIQPQICNENTLLGNAGHQHNHQPQHVHNINVTGQPHDFHPAYRIPGYMEQLYSLQRTNSASSFHDPYVNCASAFHLAGLGLGSADFLGSRGLSSLGELHNAAVAAAAAGSLASTDFHFSVDGNRRLGSPRPPGGSIRASISRKRALSSSPYSDSFDINSMIRFSPNSLATIMNGSRGSSAASGSYGHISATALNPMSHVHSTRLQQIQAHLLRASAGLLNPMTPQQVAASGFSIGHMPTSASLRVNDVHPNLSDSHIQITTSPTVTKDVSQVPAAAFSLKNLDDAREKKGPFKDVVPEQPSSTSGGVAQVEADSASSQLSDRCYNNVVNNITGIPGDVKVNSRLDEYINCGSISIPSNEYDCANADTTDIKDEPGDFIETNCHWRSCRIEFITQDELVKHINNDHIQTNKKAFVCRWEDCTRGEKPFKAQYMLVVHMRRHTGEKPHKCTFEGCFKAYSRLENLKTHLRSHTGEKPYTCEYPGCSKAFSNASDRAKHQNRTHSNEKPYICKAPGCTKRYTDPSSLRKHVKTVHGAEFYANKKHKGLPLNDANSRLQQNNSRHNLQEHNIDSSPCSEDSHLGKMLGTSSPSIKSESDISSSNHHLVNGVRASDSLLTYSPDDLAENLNLDDGWNCDDDVDVADLPIVLRAMVNIGNGNASASTIGGSVLARQRFRGRLQTKGINSSTIMLCNIPESNRTFGISELNQRITELKMEPGTDAEIKIPKLPNTTIGGYTEDPLQNQTSFRNTVSNKQGTVSGSIQGQFRRDSQNSTASTYYGSMQSRRSSQSSQVSSIPTMRPNPSCNSTASFYDPISPGCSRRSSQMSNGANCNSFTSTSGLPVLNKESNKSLNACINKPNIGVQGVGIYNSSLPPPPSSHLIATNLKRLQRKDSEYHNFTSGRFSVPSYMHSLHIKNNKPVGEN

>Q7K0S9
MDIIQKSIFNSGPHSRGIYEPPLGYFTPYNTPPYIAAYSDSGSWLADHHQHHQQQHQQHQQQMQHIRFPTPPITPPRPIAGYGYRQRTQSVIMKARGQQDELCRSPVEFPDDSKSCSSSSECGTASDFVCNWTDCDRVFDTLDALAQHVTQRHAIASLTDGLYYCRWRGCQRSERGFNARYKMLVHTRTHTKEKPHRCHLCEKSFSRAENLKIHIRSHSGEKPYKCSFEGCQKAYSNSSDRFKHTRTHSMEKPYMCKVAGCQKRYTDPSSLRKHVKTFKHSIHLIASQPLTLPSVPCLLEASSESAFTCLPAASSVESTSSSSSARYYDDSNNEPSDYSLKPKQDAEFSPSYWLGDRQHSYLHSEDFFVKMDVESPLDLRIHRI
>Q61467
MRLGRVCPRGPGKVRSPRHRFSCTLFVSTTGSSCGHHGPQLAASSNPSVLPGLHEQPPQASHSRPLNGLLRLGIPGDMYARSEPFAPGPMARSDTLATATALHGYGGMNLTMNLTAPHGPGAFFRYMRQPIKQELICKWLGDDSPMSPRPCSKTFSTMHELVTHVTVEHVGGPEQANHICFWEECPRQGKPFKAKYKLVNHIRVHTGEKPFPCPFPGCGKVFARSENLKIHKRTHTGEKPFRCEFEGCERRFANSSDRKKHSHVHTSDKPYMCKVRGCDKCYTHPSSLRKHMKVHGRSPPPSSGYDSAITSALASPSLESGREPSVACSAAVVVRGTDVSE
>Q9IB89
MFGKDGRGGKIRTVSVDGLDCALMEPPLSKRSQTLRLADLAATQAHPHHNMTGFPGLGSHQSHSLPAHMHPGELGSDPGVALTPFGPEHMAQATALKLSPSAHPEAQTAAAFASPATVSYPVAHPHSGYSTSRDFILRRELSTSAMLGEQHPAAGSPHHHHHHHPHSMFISSTGSYAHPEGVGHPLFPAIHEQAAAGVHHPLNGQMRLGLAGELYGRPEAFRAEHYAASSLHHSYNSMNLNVNIAAAHP

>P46684
MLLDAGPQYPAIGVTTFGASRHHSAGDVAERDVGLGINPFADGMGAFKLNPSSHELASAGQTAFTSQAPGYAAAAALGHHHHPGHVGSYSSAAFNSTRDFLFRNRGFGDAAAAASAQHSLFAASAGGFGGPHGHTDAAGHLLFSGLHEQAAGHASPNVVNGQMRLGFSGDMYPRPEQYGQVTSPRSEHYAAPQLHGYGPMNVNMAAHHGAGAFFRYMRQPIKQELICKWIEPEQLANPKKSCNKTFSTMHELVTHVTVEHVGGPEQSNHICFWEECPREGKPFKAKYKLVNHIRVHTGEKPFPCPFPGCGKVFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYLCKMCDKSYTHPSSLRKHMKVHESSSQGSQPSPAASSGYESSTPPTIVSPTTDNPTTSSMSPSSSAVHHTAGHSALSSNFNEWYV
>P39768
MMMNAFIEPAQHHLASYGLRMSPNTTASNSNAQQQQQQQLEMTQQQQQQQQQQQQQQQQDQESAAATAAAYQNSGYGHFNSYASRDFLLGRREAEYGVAGSAGQASAAADSMLFSGFPAQAAELGSGFGQHPFHSHHHHHQMRMGMADAYAAGHPYNHHGNFPTAAVHHPVVHHPSHHAMSAMHPAGAGAFLRYMRHQPASSASSVKQEMQCLWIDPDQPGLVPPGGRKTCNKVFHSMHEIVTHLTVEHVGGPECTTHACFWVGCSRNGRPFKAKYKLVNHIRVHTGEKPFACPHPGCGKVFARSENLKIHKRTHTGEKPFKCEHEGCDRRFANSSDRKKHSHVHTSDKPYNCRINGCDKSYTHPSSLRKHMKVHGNVDEKSPSHGYDSEGEESSSSSIITGGAQTPPSTRLDGSAGSSSGVSSLSGGSGIKSSPHSIKSEPNPMHSVHLGASSSGSSSTASSSASHLLQHQQHQHQQQQQQQQHQQQAQQQQQLTAHPSDPKSSPALQLMAASASAYLPPPLGPPPSHHHHPHHH

In [3]:
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    #print(f"id:{a}")
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SwissProt.read(handle)
        print(seq_record.organism)
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SeqIO.read(handle, "swiss")
        print(f'>{seq_record.id}')
        print(seq_record.seq)
        print()
        

Homo sapiens (Human).
>P10070
METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQGQVSGHGSCGCALPLSQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRAS

Homo sapiens (Human).
>P10071
MEAQSHSSTTTEKKKVENSIVKCSTRTDVSEKAVASSTTSNEDESPGQTYHRERRNAITMQPQNVQGLSKVSEEPSTSSDERASLIKKEIHGSLPHVAEPSVPYRGTVFAMDPRNGYMEPHYHPPHLFPAFHPPVPIDARHHEGRYHYDPSPIPPLHMTSALSSSPTYPDLPFIRISPHRNPTAASESPFSPPHPYINPYMDYIRSLHSSPSLSMISATRGLSPTDAPHAGVSPAEYYHQMALLTGQRSPYADIIPSAATAGTGAIHMEYLHAMDSTRFSSPRLSARPSRKRTLSISPLSDHSFDLQTMIRTSPNSLVTILNNSRSSSSASGSYGHLSASAISPALSFTYSSAPVSLHMHQQILSRQQSLGSAFGHSPPLIHPAPTFPTQRPIPGIPTVLNPVQVSSGPSESSQNKPTSESAVSSTGDPMHNKRSKIKPDEDLPSPGARGQQEQPEGTTLVKEEGDKDESKQEPEVIYETNCHWEGCAREFDTQEQLVHHINNDHIHGEKKEFVCRWLDCSREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCTKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYVCKIPGCTKRYTDPSSLRKHVKTVHGPEAHVTKKQRGDIHPRPPPPRDSGSHSQSRSPGRPTQGALGEQQDLSNTTSKREECLQVKTVKAEKPMTSQPSPGGQSSCSSQQSPISNYSNSGLELPLTDGGSIGDLSAIDETPIMDSTISTATTALALQARRNPAGTKWMEHVKLERLKQVNGMFPRLNPILPPKAPAVSPLIGNGTQSNNTCSLGGPMTLLPGRSDLSGVDVTMLNMLNRRDSSASTISSAYLSSRRSSGISPCFSSRRSSEASQAEGRPQNVSVADSYDPISTDASRRSSEASQSDGLPSLLSLTPAQQYRLKAKYAAATGGPPPTPLPNMERMSLKTRLALLGDALEPGVAL

Drosophila melanogaster (Fruit fly).
>P19538
MDAYALPTYFPLAYSELQFLASRRAAAVAAAATVLPGSPCINQHHPTDVSSSVTVPSIIPTGGTSDSIKTSIQPQICNENTLLGNAGHQHNHQPQHVHNINVTGQPHDFHPAYRIPGYMEQLYSLQRTNSASSFHDPYVNCASAFHLAGLGLGSADFLGSRGLSSLGELHNAAVAAAAAGSLASTDFHFSVDGNRRLGSPRPPGGSIRASISRKRALSSSPYSDSFDINSMIRFSPNSLATIMNGSRGSSAASGSYGHISATALNPMSHVHSTRLQQIQAHLLRASAGLLNPMTPQQVAASGFSIGHMPTSASLRVNDVHPNLSDSHIQITTSPTVTKDVSQVPAAAFSLKNLDDAREKKGPFKDVVPEQPSSTSGGVAQVEADSASSQLSDRCYNNVVNNITGIPGDVKVNSRLDEYINCGSISIPSNEYDCANADTTDIKDEPGDFIETNCHWRSCRIEFITQDELVKHINNDHIQTNKKAFVCRWEDCTRGEKPFKAQYMLVVHMRRHTGEKPHKCTFEGCFKAYSRLENLKTHLRSHTGEKPYTCEYPGCSKAFSNASDRAKHQNRTHSNEKPYICKAPGCTKRYTDPSSLRKHVKTVHGAEFYANKKHKGLPLNDANSRLQQNNSRHNLQEHNIDSSPCSEDSHLGKMLGTSSPSIKSESDISSSNHHLVNGVRASDSLLTYSPDDLAENLNLDDGWNCDDDVDVADLPIVLRAMVNIGNGNASASTIGGSVLARQRFRGRLQTKGINSSTIMLCNIPESNRTFGISELNQRITELKMEPGTDAEIKIPKLPNTTIGGYTEDPLQNQTSFRNTVSNKQGTVSGSIQGQFRRDSQNSTASTYYGSMQSRRSSQSSQVSSIPTMRPNPSCNSTASFYDPISPGCSRRSSQMSNGANCNSFTSTSGLPVLNKESNKSLNACINKPNIGVQGVGIYNSSLPPPPSSHLIATNLK

Drosophila melanogaster (Fruit fly).
>Q7K0S9
MDIIQKSIFNSGPHSRGIYEPPLGYFTPYNTPPYIAAYSDSGSWLADHHQHHQQQHQQHQQQMQHIRFPTPPITPPRPIAGYGYRQRTQSVIMKARGQQDELCRSPVEFPDDSKSCSSSSECGTASDFVCNWTDCDRVFDTLDALAQHVTQRHAIASLTDGLYYCRWRGCQRSERGFNARYKMLVHTRTHTKEKPHRCHLCEKSFSRAENLKIHIRSHSGEKPYKCSFEGCQKAYSNSSDRFKHTRTHSMEKPYMCKVAGCQKRYTDPSSLRKHVKTFKHSIHLIASQPLTLPSVPCLLEASSESAFTCLPAASSVESTSSSSSARYYDDSNNEPSDYSLKPKQDAEFSPSYWLGDRQHSYLHSEDFFVKMDVESPLDLRIHRI

Mus musculus (Mouse).
>Q61467
MRLGRVCPRGPGKVRSPRHRFSCTLFVSTTGSSCGHHGPQLAASSNPSVLPGLHEQPPQASHSRPLNGLLRLGIPGDMYARSEPFAPGPMARSDTLATATALHGYGGMNLTMNLTAPHGPGAFFRYMRQPIKQELICKWLGDDSPMSPRPCSKTFSTMHELVTHVTVEHVGGPEQANHICFWEECPRQGKPFKAKYKLVNHIRVHTGEKPFPCPFPGCGKVFARSENLKIHKRTHTGEKPFRCEFEGCERRFANSSDRKKHSHVHTSDKPYMCKVRGCDKCYTHPSSLRKHMKVHGRSPPPSSGYDSAITSALASPSLESGREPSVACSAAVVVRGTDVSE

Xenopus laevis (African clawed frog).
>Q9IB89
MFGKDGRGGKIRTVSVDGLDCALMEPPLSKRSQTLRLADLAATQAHPHHNMTGFPGLGSHQSHSLPAHMHPGELGSDPGVALTPFGPEHMAQATALKLSPSAHPEAQTAAAFASPATVSYPVAHPHSGYSTSRDFILRRELSTSAMLGEQ

Homo sapiens (Human).
>Q15915
MLLDAGPQYPAIGVTTFGASRHHSAGDVAERDVGLGINPFADGMGAFKLNPSSHELASAGQTAFTSQAPGYAAAAALGHHHHPGHVGSYSSAAFNSTRDFLFRNRGFGDAAAAASAQHSLFAASAGGFGGPHGHTDAAGHLLFPGLHEQAAGHASPNVVNGQMRLGFSGDMYPRPEQYGQVTSPRSEHYAAPQLHGYGPMNVNMAAHHGAGAFFRYMRQPIKQELICKWIEPEQLANPKKSCNKTFSTMHELVTHVTVEHVGGPEQSNHICFWEECPREGKPFKAKYKLVNHIRVHTGEKPFPCPFPGCGKVFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYLCKMCDKSYTHPSSLRKHMKVHESSSQGSQPSPAASSGYESSTPPTIVSPSTDNPTTSSLSPSSSAVHHTAGHSALSSNFNEWYV

Mus musculus (Mouse).
>P46684
MLLDAGPQYPAIGVTTFGASRHHSAGDVAERDVGLGINPFADGMGAFKLNPSSHELASAGQTAFTSQAPGYAAAAALGHHHHPGHVGSYSSAAFNSTRDFLFRNRGFGDAAAAASAQHSLFAASAGGFGGPHGHTDAAGHLLFSGLHEQAAGHASPNVVNGQMRLGFSGDMYPRPEQYGQVTSPRSEHYAAPQLHGYGPMNVNMAAHHGAGAFFRYMRQPIKQELICKWIEPEQLANPKKSCNKTFSTMHELVTHVTVEHVGGPEQSNHICFWEECPREGKPFKAKYKLVNHIRVHTGEKPFPCPFPGCGKVFARSENLKIHKRTHTGEKPFKCEFEGCDRRFANSSDRKKHMHVHTSDKPYLCKMCDKSYTHPSSLRKHMKVHESSSQGSQPSPAASSGYESSTPPTIVSPTTDNPTTSSMSPSSSAVHHTAGHSALSSNFNEWYV

Drosophila melanogaster (Fruit fly).
>P397

In [48]:
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    print(f"id:{a}")
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SwissProt.read(handle)
        print(">",seq_record.entry_name,"\n",seq_record.description, "\n", seq_record.organism_classification)
        print(f"keywords:{seq_record.keywords}")
        print()
        

id:P10070
> GLI2_HUMAN 
 RecName: Full=Zinc finger protein GLI2 {ECO:0000305}; AltName: Full=GLI family zinc finger protein 2 {ECO:0000312|HGNC:HGNC:4318}; AltName: Full=Tax helper protein {ECO:0000303|PubMed:9557682}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
keywords:['Acetylation', 'Activator', 'Alternative splicing', 'Cell projection', 'Cilium', 'Cytoplasm', 'Developmental protein', 'Disease variant', 'DNA-binding', 'Holoprosencephaly', 'Isopeptide bond', 'Metal-binding', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Repressor', 'Transcription', 'Transcription regulation', 'Ubl conjugation', 'Zinc', 'Zinc-finger']

id:Q0VGT2
> GLI2_MOUSE 
 RecName: Full=Zinc finger protein GLI2 {ECO:0000305}; AltName: Full=Tax helper protein {ECO:0000250|UniProtKB:P10070}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Eu

> CI_DROME 
 RecName: Full=Transcriptional activator cubitus interruptus; Short=Transcriptional activator ci; AltName: Full=ci form of 155 kDa; Short=ci-155; AltName: Full=ci full-length protein; Short=ciFL; Contains: RecName: Full=Transcriptional repressor cubitus interruptus; Short=Transcriptional repressor ci; AltName: Full=ci C-terminally truncated form; AltName: Full=ci form of 75 kDa; Short=ci-75; 
 ['Eukaryota', 'Metazoa', 'Ecdysozoa', 'Arthropoda', 'Hexapoda', 'Insecta', 'Pterygota', 'Neoptera', 'Endopterygota', 'Diptera', 'Brachycera', 'Muscomorpha', 'Ephydroidea', 'Drosophilidae', 'Drosophila', 'Sophophora']
keywords:['3D-structure', 'Activator', 'Developmental protein', 'DNA-binding', 'Metal-binding', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Repressor', 'Segmentation polarity protein', 'Transcription', 'Transcription regulation', 'Ubl conjugation', 'Zinc', 'Zinc-finger']

id:Q8NEA6
> GLIS3_HUMAN 
 RecName: Full=Zinc finger protein GLIS3; AltName: Full=GL

> ZIC5_MOUSE 
 RecName: Full=Zinc finger protein ZIC 5; AltName: Full=Odd paired-related protein; Short=Opa-related protein; AltName: Full=Zinc finger protein of the cerebellum 5; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Glires', 'Rodentia', 'Myomorpha', 'Muroidea', 'Muridae', 'Murinae', 'Mus', 'Mus']
keywords:['Developmental protein', 'Differentiation', 'DNA-binding', 'Metal-binding', 'Neurogenesis', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Zinc', 'Zinc-finger']

id:A0JC51
> ZIC4_XENLA 
 RecName: Full=Zinc finger protein ZIC 4 {ECO:0000303|PubMed:16871625}; Short=XlZic4 {ECO:0000303|PubMed:16871625}; AltName: Full=Zinc finger protein of the cerebellum 4 {ECO:0000250|UniProtKB:Q8N9L1}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Amphibia', 'Batrachia', 'Anura', 'Pipoidea', 'Pipidae', 'Xenopodinae', 'Xenopus', 'Xenopus']
keywords:['Developmental

> ZIC3_XENTR 
 RecName: Full=Zinc finger protein ZIC 3 {ECO:0000250|UniProtKB:O57311}; AltName: Full=Zinc finger protein of the cerebellum 3 {ECO:0000250|UniProtKB:O57311}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Amphibia', 'Batrachia', 'Anura', 'Pipoidea', 'Pipidae', 'Xenopodinae', 'Xenopus', 'Silurana']
keywords:['Activator', 'Cytoplasm', 'Developmental protein', 'Differentiation', 'DNA-binding', 'Metal-binding', 'Neurogenesis', 'Nucleus', 'Reference proteome', 'Repeat', 'Transcription', 'Transcription regulation', 'Zinc', 'Zinc-finger']

id:O57311
> ZIC3_XENLA 
 RecName: Full=Zinc finger protein ZIC 3; Short=XZic3; Short=XlZic3; AltName: Full=Zinc finger protein Zic3-A; AltName: Full=Zinc finger protein of the cerebellum 3; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Amphibia', 'Batrachia', 'Anura', 'Pipoidea', 'Pipidae', 'Xenopodinae', 'Xenopus', 'Xenopus']
keywords:['Activator', 'Cytoplasm', 'Developme

## Alignment and Phylo

In [15]:
from Bio.Align.Applications import ClustalwCommandline

dir = r'C:\Program Files (x86)\ClustalW2\clustalw2'
in_file = r'blast_PROT_GLI2_SEQ_file.fasta'

clustalw_cline = ClustalwCommandline(dir, infile=in_file)
clustalw_cline()
print(clustalw_cline)


"C:\Program Files (x86)\ClustalW2\clustalw2" -infile=blast_PROT_GLI2_SEQ_file.fasta


In [18]:
cline = ClustalwCommandline("clustalw", infile="blast_PROT_GLI2_SEQ_file.fasta", outfile="blast_PROT_GLI2_SEQ_file.aln")
cline


ClustalwCommandline(cmd='clustalw', infile='blast_PROT_GLI2_SEQ_file.fasta', outfile='blast_PROT_GLI2_SEQ_file.aln')

In [19]:
from Bio import AlignIO
align = AlignIO.read("blast_PROT_GLI2_SEQ_file.aln", "clustal")

print(format(align, "clustal")) #formato string

CLUSTAL 2.1 multiple sequence alignment


Q61467                              --------------------------------------------------
Q8N9L1                              --------------------------------------------------
A0JC51                              --------------------------------------------------
Q7TQ40                              --------------------------------------------------
Q96T25                              --------------------------------------------------
Q9IB89                              --------------------------------------------------
O60481                              --------------------------------------------------
Q62521                              --------------------------------------------------
Q6DJQ6                              --------------------------------------------------
O57311                              --------------------------------------------------
Q9YIB7                              --------------------------------------------------
Q

In [20]:
from Bio import AlignIO
align = AlignIO.read("blast_PROT_GLI2_SEQ_file.aln", "clustal")
print(align)
print("Número de linhas: %i" % len(align)) 
print(format(align, "clustal"))

for record in align:
    print("%s - %s" % (record.seq, record.id)) 

#ver com que frequência as letras são substituidas
subs = align.substitutions
print(subs)

Alignment with 50 rows and 1840 columns
--------------------------------------------...--- Q61467
--------------------------------------------...--- Q8N9L1
--------------------------------------------...--- A0JC51
--------------------------------------------...--- Q7TQ40
--------------------------------------------...--- Q96T25
--------------------------------------------...--- Q9IB89
--------------------------------------------...--- O60481
--------------------------------------------...--- Q62521
--------------------------------------------...--- Q6DJQ6
--------------------------------------------...--- O57311
--------------------------------------------...--- Q9YIB7
--------------------------------------------...--- Q91689
--------------------------------------------...--- O95409
--------------------------------------------...--- Q62520
--------------------------------------------...--- Q15915
--------------------------------------------...--- P46684
--------------------------------

       A       C      D       E      F       G       H      I       K       L      M      N       P      Q       R       S      T      V      W      Y
A 9800.0   365.0 1436.0  1546.5  739.5  3932.0  1470.0 1029.0  1231.5  2534.0  820.0 1249.5  3797.5 2017.5  1674.0  6062.5 2752.0 1941.5   72.5  775.0
C  365.0 12518.0  193.0   195.0  143.5   481.0   228.5  156.0   103.0   270.0  122.0  114.5   388.0  189.0   212.0   568.5  250.5  248.5    3.5   95.0
D 1436.0   193.0 5283.0  2414.0  303.0  1937.0  1188.0  419.5   578.5   880.5  332.0 1177.5  1182.0 1051.0   829.5  2590.5 1276.5  567.0   13.5  240.5
E 1546.5   195.0 2414.0 11392.0  527.0  1757.5  1422.0  340.5   713.5  1264.5  246.0 1035.0  2238.0 1529.0  1112.0  2214.5 1161.0  696.5  136.0  340.5
F  739.5   143.5  303.0   527.0 7724.0   724.5  1101.5  392.5   383.0  1017.5  315.0  355.0   729.5  403.0   442.5   889.5  520.5  491.0   34.0 1867.0
G 3932.0   481.0 1937.0  1757.5  724.5 15053.0  1994.0  868.0   993.0  2179.0  712.0 1831.0  3

In [21]:
count = AlignIO.convert("blast_PROT_GLI2_SEQ_file.aln", "clustal","blast_PROT_GLI2_SEQ_file.sth", "stockholm") 
print ("Convertidos %i alinhamentos" % count )

Convertidos 1 alinhamentos


In [5]:
alignment = AlignIO.read("blast_PROT_GLI2_SEQ_file.sth", "stockholm")


In [6]:
calculator = DistanceCalculator('blosum62')
dm = calculator.get_distance(alignment)
print(dm)

Q61467	0
Q8N9L1	0.18705818379554107	0
A0JC51	0.3139658848614072	0.3128734383487235	0
Q7TQ40	0.4593867670790748	0.45658110322228296	0.5993103448275863	0
Q96T25	0.4522532188841202	0.45444626295690127	0.606070826306914	0.11332357247437774	0
Q9IB89	0.4326439351593069	0.4161455372370665	0.5222018348623854	0.30651197604790414	0.31131738664762587	0
O60481	0.469853340575774	0.46187845303867403	0.5495867768595042	0.5945616883116883	0.6083803384367445	0.5387665198237885	0
Q62521	0.4684439608269858	0.45932484781405647	0.5487603305785125	0.5913715913715913	0.6052525252525253	0.5370044052863436	0.011705033164260636	0
Q6DJQ6	0.4479338842975207	0.4269725797425853	0.5218135158254918	0.5789697743720732	0.5933277027027026	0.526268522676246	0.16129032258064513	0.15839536807278742	0
O57311	0.45364238410596025	0.43385650224215244	0.5273738237810094	0.5792163543441227	0.5927334178284749	0.522911051212938	0.16839056681836984	0.16549441456350844	0.01930977814297452	0
Q9YIB7	0.4654226125137212	0.44754464285714

In [7]:
constructor = DistanceTreeConstructor() 
upgmatree = constructor.upgma(dm)
print(upgmatree)

Tree(rooted=True)
    Clade(branch_length=0, name='Inner49')
        Clade(name='Inner47')
            Clade(name='Q14872')
            Clade(name='Inner26')
                Clade(name='Q1LYE3')
                Clade(name='Inner14')
                    Clade(name='Inner8')
                        Clade(name='Q58DZ6')
                        Clade(name='Q91853')
                    Clade(name='Inner5')
                        Clade(name='P52747')
                        Clade(name='A6QQW0')
        Clade(name='Inner48')
            Clade(name='Inner38')
                Clade(name='Q17308')
                Clade(name='P34708')
            Clade(name='Inner46')
                Clade(name='Inner41')
                    Clade(name='Inner19')
                        Clade(name='Q8K1M4')
                        Clade(name='Q8NBF1')
                    Clade(name='Inner16')
                        Clade(name='Q6XP49')
                        Clade(name='Q8NEA6')
                Clade(name='Inn

In [8]:
njtree = constructor.nj(dm)
print(njtree)

Tree(rooted=False)
    Clade(branch_length=0, name='Inner48')
        Clade(name='Inner42')
            Clade(name='Inner41')
                Clade(name='Inner40')
                    Clade(name='Inner36')
                        Clade(name='Inner35')
                            Clade(name='Q61467')
                            Clade(name='Q8N9L1')
                        Clade(name='A0JC51')
                    Clade(name='Inner32')
                        Clade(name='Q9IB89')
                        Clade(name='Inner19')
                            Clade(name='Q96T25')
                            Clade(name='Q7TQ40')
                Clade(name='Inner34')
                    Clade(name='Inner33')
                        Clade(name='Inner29')
                            Clade(name='O73689')
                            Clade(name='Inner28')
                                Clade(name='Q8JJC0')
                                Clade(name='Inner27')
                                    Clade(

In [9]:
Phylo.write([upgmatree, njtree],"phylotree_GLI2.nhx","newick")

2

In [11]:
Phylo.draw_ascii(njtree)

                            ___________ Q61467
                          _|
                 ________| |__________ Q8N9L1
                |        |
               _|        |_________________________ A0JC51
              | |
              | |               ________________ Q9IB89
              | |______________|
              |                |              ______ Q96T25
              |                |_____________|
              |                              |______ Q7TQ40
              |
              |                           ___ O73689
              |               ___________|
             _|              |           |___ Q8JJC0
            | |              |           |
            | |              |           |  , P46684
            | |              |           |__|
            | |             _|              | Q15915
            | |            | |
            | |            | |                  , Q62520
            | |            | |        __________|
            | |     

In [12]:
Phylo.draw_ascii(upgmatree)

    _______________________ Q14872
   |
  _|                _______ Q1LYE3
 | |               |
 | |_______________|         , Q58DZ6
 |                 |       __|
 |                 |      |  | Q91853
 |                 |______|
 |                        |  , P52747
 |                        |__|
 |                           | A6QQW0
 |
_|                _____________ Q17308
 |      _________|
 |     |         |_____________ P34708
 |     |
 |     |                                       ___ Q8K1M4
 |     |                        ______________|
 |     |                       |              |___ Q8NBF1
 |     |     __________________|
 |     |    |                  |               ___ Q6XP49
 |     |    |                  |______________|
 |_____|    |                                 |___ Q8NEA6
       |    |
       |    |                                    __ O77027
       |    |                ___________________|
       |    |               |                   |__ P19538
       |  