In [2]:
from Bio.Blast import NCBIXML 
from Bio.Blast import NCBIWWW
import requests, sys, json
import re
from Bio import SeqIO
from Bio import Entrez
from Bio import Medline
import pandas as pd
import numpy as np
from tqdm import tqdm
from Bio import SearchIO
from Bio.SwissProt import KeyWList
from Bio import SwissProt
from Bio import ExPASy
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor
from Bio import Phylo

## Searching for literature for GLI2 gene

In [3]:
database = "PubMed"
word = 'GLI2'
res= int('30')
email= "rodrigoce9@gmail.com"

Entrez.email= email
handle=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle)
handle.close()
idlist= record['IdList']

handle = Entrez.efetch(db=database, id=idlist, rettype="medline", retmode="text")
records = Medline.parse(handle)


for record in records:
    #print(record) 
    #print("PMID:",record.get("PMID","-"))
    print("title:", record.get("TI", "-"))
    print('abstract:', record.get('AB', '-'))
    print("authors:", record.get("AU", "-"))
    print("source:", record.get("SO", "-"))
    print("")

title: Reciprocal FGF19-GLI2 signaling induces epithelial-to-mesenchymal transition to promote lung squamous cell carcinoma metastasis.
abstract: PURPOSE: Metastatic lung squamous cell carcinoma (LUSC) is one of the most common causes of cancer death worldwide. As yet, however, the molecular mechanism underlying LUSC metastasis remains elusive. In this study, we report a novel mechanism involving signaling interactions between FGF19 and GLI2 that could drive the progression of LUSC. METHODS: The expression of FGF19 in human LUSC samples was assessed by immunohistochemistry. The concentration of FGF19 in serum samples was assessed by ELISA. RNA sequencing, scratch wound-healing, trans-well, GO analysis, GSEA, luciferase reporter, Western blotting, immunofluorescence and immunohistochemistry assays, as well as an animal model were used to investigate the molecular mechanism underlying FGF19 driven LUSC progression. The therapeutic effect of a GLI2 inhibitor was determined using both in v

title: Gypenosides ameliorate ductular reaction and liver fibrosis via inhibition of hedgehog signaling.
abstract: Backgroud and aims: Ductular reaction (DR) is a common pathological change and thought to have a key role in the pathogenesis and progression of liver fibrosis. Our previous study reported Gypenosides (GPs) ameliorated liver fibrosis, however, the anti-fibrotic mechanisms of GPs are still unclear. Methods: Liver fibrosis was induced in rats by carbon tetrachloride combining with 2-acerylaminofluorene (CCl(4)/2-AAF), and Mdr2 knockout (Mdr2 (-/-)) mice to evaluate the anti-fibrotic role of GPs. In vitro, WB-F344 cells, a hepatic progenitor cells (HPCs) line, with or without Gli1 overexpressing lentiviral vectors, were induced by sodium butyrate (SB) to validate the mechanism of GPs and NPLC0393, the main ingredient of GPs. Results: Both in CCl(4)/2-AAF-treated rats and Mdr2 (-/-) mice, GPs obviously reduced the deposition of collagen and hydroxyproline content, inhibited th

title: Hedgehog Signaling as a Therapeutic Target for Airway Remodeling and Inflammation in Allergic Asthma.
abstract: Genome-wide association studies (GWAS) have shown that variants of patched homolog 1 (PTCH1) are associated with lung function abnormalities in the general population. It has also been shown that sonic hedgehog (SHH), an important ligand for PTCH1, is upregulated in the airway epithelium of patients with asthma and is suggested to be involved in airway remodeling. The contribution of hedgehog signaling to airway remodeling and inflammation in asthma is poorly described. To determine the biological role of hedgehog signaling-associated genes in asthma, gene silencing, over-expression, and pharmacologic inhibition studies were conducted after stimulating human airway epithelial cells or not with transforming growth factor beta1 (TGFbeta1), an important fibrotic mediator in asthmatic airway remodeling that also interacts with SHH pathway. TGFbeta1 increased hedgehog-signa

## Searching for GLI2 sequence

In [3]:
database = 'nucleotide'
word = 'GLI2 and homo sapiens and Chromosome 2 and not predicted and not unverified '
res= '15'
email= 'rodrigoce9@gmail.com'
Entrez.email= email
handle_search=Entrez.esearch(db = database, term=word, retmax= res)
record=Entrez.read(handle_search)
handle_search.close()
idlist= record['IdList']

In [4]:
handle = Entrez.efetch(db=database, id=idlist, rettype="gb") 
records = list(SeqIO.parse(handle,"gb"))
handle.close()
for info in records:
    print(info.id, '-', info.description)
    #print('length of seq:', len(info.seq)) #to check the length of the sequences

NM_001374354.1 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 4, mRNA
NM_001374353.1 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 3, mRNA
NM_001371271.1 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 1, mRNA
NM_005270.5 - Homo sapiens GLI family zinc finger 2 (GLI2), transcript variant 2, mRNA
NM_003743.5 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 1, mRNA
NM_001362950.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 4, mRNA
NM_001362952.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 5, mRNA
NM_147223.3 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 2, mRNA
NM_001362954.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 6, mRNA
NM_001362955.1 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), transcript variant 7, mRNA
NM_147233.2 - Homo sapiens nuclear receptor coactivator 1 (NCOA1), 

The selection of the id has to be done manually as there is no pattern in writing the titles of the queries.\
The id __NG_009030.2__ is the only one where the annotated sequence is in chromosome 2, is not a mRNA and actually is RefSeq. This means that the sequence is being used as a standard for well-characterized genes. So id __NG_009030.2__ will be used from now on.

In [8]:
#The correspondent information of NG_015830.1 was downloaded to a file
import os
Entrez.email = "rodrigoce9@gmail.com"
filename = "NG_009030.2.gb"
if not os.path.isfile(filename):
    net_handle = Entrez.efetch(db="nucleotide", id="NG_009030.2", rettype="gb", retmode="text")
    out_handle = open(filename, "w")
    out_handle.write(net_handle.read())
    out_handle.close()
    net_handle.close()

In [9]:
record = SeqIO.read(open("NG_009030.2.gb"), format="genbank")
position=0
record_types={}
for x in record.features:
    record_types[x.type]=record_types.get(x.type,0)+1
for i in range(len(record.features)):
    if record.features[i].type == "CDS":
        position=i
print("The length of the sequence: {}\n".format(len(record.seq)))
print("Type of features: {}\n".format(record_types))
print("Comment from NCBI: {}\n".format(record.annotations["comment"]))
#checking the location of the CDS on the original sequence
print("Location of the CDS on the original sequence: {}".format(record.features[position].location))

The length of the sequence: 263786

Type of features: {'source': 1, 'gene': 1, 'mRNA': 1, 'exon': 14, 'CDS': 1, 'misc_feature': 17}

Comment from NCBI: REVIEWED REFSEQ: This record has been curated by NCBI staff. The
reference sequence was derived from AC018866.9, AC017033.5,
KF510752.1, KF510212.1 and AC016764.8.
This sequence is a reference standard in the RefSeqGene project.
On Sep 14, 2022 this sequence version replaced NG_009030.1.
Summary: This gene encodes a protein which belongs to the C2H2-type
zinc finger protein subclass of the Gli family. Members of this
subclass are characterized as transcription factors which bind DNA
through zinc finger motifs. These motifs contain conserved H-C
links. Gli family zinc finger proteins are mediators of Sonic
hedgehog (Shh) signaling and they are implicated as potent
oncogenes in the embryonal carcinoma cell. The protein encoded by
this gene localizes to the cytoplasm and activates patched
Drosophila homolog (PTCH) gene expression. It is al

In [11]:
CDS_nuc_seq=0
CDS_nuc_seq=range(int(record.features[position].location.start), int(record.features[position].location.end))

In [12]:
#saving the CDS_nucleotides and CD_aminoacid seqs in files
filename_CDS_nucl = "CDS_nucleot_GLI2_seq.fasta"
filename_CDS_prot = "CDS_prot_GLI2_seq.fasta"
input_handle  = open(filename, "r")
output_handle_nucl = open(filename_CDS_nucl, "w")
output_handle_prot = open(filename_CDS_prot, "w")
for seq_record in SeqIO.parse(input_handle, "genbank") :
    output_handle_nucl.write(">\n%s" % (CDS_nuc_seq))
    output_handle_prot.write(">\n%s" % ("".join(record.features[position].qualifiers['translation']))) #without join, output is a list
    
output_handle_nucl.close()
output_handle_prot.close()
input_handle.close()

## BLASTN - for all organisms

In [None]:
record = SeqIO.read(open("10001bp_sequence_GLI2.gb"), format="gb") 
print (len(record.seq))-

In [None]:
Blast = NCBIWWW.qblast("blastn", "nt", record.seq, entrez_query = "Homo Sapiens[organism]")

In [None]:
with open('blast_DNA_GLI2_file.xml', "w") as out_handle:
        out_handle.write(Blast.read())
Blast.close()

In [13]:
results_Blastn = open("blast_DNA_GLI2_file.xml")
blastn_records = NCBIXML.read(results_Blastn)
for parameter in blastn_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  NG_009030
Definition:  Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
E-value:  0.0
E-value:  1.23303e-94
E-value:  2.71593e-90
E-value:  4.03079e-88
E-value:  1.71394e-86
E-value:  5.98222e-86
E-value:  2.54371e-84
E-value:  8.87841e-84
E-value:  8.87841e-84
E-value:  8.87841e-84
E-value:  3.09887e-83
E-value:  3.77519e-82
E-value:  4.59913e-81
E-value:  4.59913e-81
E-value:  1.60525e-80
E-value:  5.60289e-80
E-value:  2.38241e-78
E-value:  1.23412e-75
E-value:  4.30749e-75
E-value:  4.30749e-75
E-value:  1.50346e-74
E-value:  1.50346e-74
E-value:  1.50346e-74
E-value:  5.2476e-74
E-value:  6.39289e-73
E-value:  6.39289e-73
E-value:  2.23134e-72
E-value:  2.23134e-72
E-value:  9.48789e-71
E-value:  1.40813e-68
E-value:  3.78187e-44
E-value:  3.78187e-44
E-value:  5.61279e-42
E-value:  5.61279e-42
E-value:  3.54206e-38
E-value:  5.9981e-29
E-value:  3.54519e-19
E-value:  4.31892e-18
E-value:  4.04506e-12
E-value:  4.92789e-11
E-value:  6.0034e-10


In [14]:
E_VALUE_THRESH = 0.001
results_Blast= open('blast_DNA_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    print (alignment)
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            print('****Alignment****')
            print('sequence: ', alignment.title)
            print('lenght:', alignment.length)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')
            print()
            

gi|2301838502|ref|NG_009030.2| Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
           Length = 263786

****Alignment****
sequence:  gi|2301838502|ref|NG_009030.2| Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
lenght: 263786
TTGATCCATTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAG...
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||...
TTGATCCATTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAG...

****Alignment****
sequence:  gi|2301838502|ref|NG_009030.2| Homo sapiens GLI family zinc finger 2 (GLI2), RefSeqGene on chromosome 2
lenght: 263786
TTAATTTTTTT-TTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTC...
||||||  ||| |||||||||||||||||||| |||  ||||||||||| ||||||| | | |||||| ||||||...
TTAATTAATTTATTTTTTTGAGACAGAGTCTCGCTCCATCACCCAGGCTGGAGTGCAGTGGCGCCATCATGGCTC...

****Alignment****
sequence:  gi|2301838502|ref|NG_009030.2| Homo sapiens GLI family zinc finger 

sequence:  gi|1909942459|dbj|AP023478.1| Homo sapiens DNA, chromosome 18, nearly complete genome
lenght: 77846715
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
||||||||||||||||||  |||| || |||| ||| |||||||| ||||||||| |||| ||||||||||||||...
TTTTTTTTTTTTTTGAGATGGAGTTTCACTCTTGTCGCCCAGGCTGGAGTGCAATGGTGCAATCTTGGCTCACTG...

****Alignment****
sequence:  gi|1909942459|dbj|AP023478.1| Homo sapiens DNA, chromosome 18, nearly complete genome
lenght: 77846715
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| ||||||| |||||||| ||||||| | | || |||| ||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|1909942459|dbj|AP023478.1| Homo sapiens DNA, chromosome 18, nearly complete genome
lenght: 77846715
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCA-TCTTGGCTCACTG...
|| |||||||||||||||||||||||| ||||| |||||||||| ||||||| | || ||| |||||

lenght: 77846715
GTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCT...
||||| ||| ||||||||||||||||||| ||  ||| ||  |||||||| || |||||| | |  ||||  |||...
GTTAACTTTCTTTTTTTTTGAGACAGAGTTTCGTTCTTGTTGCCCAGGCTGGACTGCAATGGCGTGATCTCAGCT...

****Alignment****
sequence:  gi|1909942459|dbj|AP023478.1| Homo sapiens DNA, chromosome 18, nearly complete genome
lenght: 77846715
TTAATTTTTTTTTTTTTT-GAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTC...
|||||||||||||||||| ||||| ||||||| |||| |  | |||||  ||||||| | | |  ||||||||||...
TTAATTTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTTTGGCTCAGGCCGGAGTGCAGTGGCGTGATCTTGGCTC...

****Alignment****
sequence:  gi|1909942459|dbj|AP023478.1| Homo sapiens DNA, chromosome 18, nearly complete genome
lenght: 77846715
TTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCA...
|||||||||  ||||| ||||||||| | ||||| |||||||| ||||||| | |||| ||||||||||||  ||...
TTTTTTTTTAATTGAGGCAGAGTCTCACCCTGTCTCCCAGGCTGGAGTGCAGTGGTGCTATCTTGGCTCACCACA...

****A

****Alignment****
sequence:  gi|2033714914|gb|CP068260.2| Homo sapiens isolate CHM13 chromosome 18
lenght: 80542538
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||||||||||| |||||||||||||||| ||||||| | |||| | || ||||||||||...
TTTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTCACCCAGGCTGGAGTGCAGTGGTGCAACCTCGGCTCACTGC...

****Alignment****
sequence:  gi|2033714914|gb|CP068260.2| Homo sapiens isolate CHM13 chromosome 18
lenght: 80542538
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| |||||| | ||||||| ||||||||| |||| |||||||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTTATCCAGGCTGGAGTGCAAT-GTGCAATCTTGGCTCACTGC...

****Alignment****
sequence:  gi|2033714914|gb|CP068260.2| Homo sapiens isolate CHM13 chromosome 18
lenght: 80542538
ATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||||||||||   ||||| |  |||||||||||||| ||||||||| |  | |||| |||||||||...
ATTTTTTTTTTTTTTGAG


****Alignment****
sequence:  gi|2033714914|gb|CP068260.2| Homo sapiens isolate CHM13 chromosome 18
lenght: 80542538
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||| ||||||||  ||||||| ||||||||||||| || ||||||||| |||  |||| ||||||||||...
TTTTTTTTTCTTTTGAGATGGAGTCTCACTCTGTCACCCAGCCTGGAGTGCAATGGTGTGATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|2033714914|gb|CP068260.2| Homo sapiens isolate CHM13 chromosome 18
lenght: 80542538
TTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAA...
||||||||||||||||| || | || | ||||| |||||||| ||||||||| |    ||||  |||||||||||...
TTTTTTTTTTTTGAGACGGAATGTCACCCTGTCGCCCAGGCTGGAGTGCAATGGCATGATCTCAGCTCACTGCAA...

****Alignment****
sequence:  gi|2033714914|gb|CP068260.2| Homo sapiens isolate CHM13 chromosome 18
lenght: 80542538
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
||||||||||||||||||| |||| || |||| |||||||||||| ||||||||| | || ||||  ||||||||...
TTTTTTTTTTTTTTGAG


****Alignment****
sequence:  gi|22532582|gb|AC011774.9| Homo sapiens chromosome 18, clone RP11-380C8, complete sequence
lenght: 208039
TTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCA...
|||||||||||||||||||||||||| ||||||| |||||||| ||||||| | | |||||||  ||||||||||...
TTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCCATCTCAGCTCACTGCA...

****Alignment****
sequence:  gi|22532582|gb|AC011774.9| Homo sapiens chromosome 18, clone RP11-380C8, complete sequence
lenght: 208039
TTTTTTTTTTT---TTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCA...
|||||||||||   |||||||| |||| || |||| ||  |||||||| ||||||||| |||| |||||||||||...
TTTTTTTTTTTAAATTTGAGACGGAGTTTCGCTCTTGTTGCCCAGGCTGGAGTGCAATGGTGCAATCTTGGCTCA...

****Alignment****
sequence:  gi|22532582|gb|AC011774.9| Homo sapiens chromosome 18, clone RP11-380C8, complete sequence
lenght: 208039
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||||| ||||| |||||||| ||

sequence:  gi|13488761|dbj|AP001972.4| Homo sapiens genomic DNA, chromosome 11q, clone:CTD-2562J17, complete sequences
lenght: 215225
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||||||||||  | | |||||||||||| ||| ||||| | ||||||| ||||||||||...
TTTTTTTTTTTTTTGAGACAGAGTCTTACCCCGTCACCCAGGCTGGAGGGCAATGGCGCCATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|13488761|dbj|AP001972.4| Homo sapiens genomic DNA, chromosome 11q, clone:CTD-2562J17, complete sequences
lenght: 215225
ATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||||||||||  ||||||| ||||||  |||||||| ||||||| | || | ||||  ||||||| ...
ATTTTTTTTTTTTTTGAGAGGGAGTCTCGCTCTGTTGCCCAGGCTGGAGTGCAGTGGTACGATCTCAGCTCACTA...

****Alignment****
sequence:  gi|13488761|dbj|AP001972.4| Homo sapiens genomic DNA, chromosome 11q, clone:CTD-2562J17, complete sequences
lenght: 215225
TTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAAC...
|||||||

|| ||||||||||| || ||||||  |||| |  |||||||| || |||||| |||| |||| ||||||||||||...
TTCTTTTTTTTTGAAACGGAGTCTAGCTCTATTGCCCAGGCTGGATTGCAATGGTGCGATCTCGGCTCACTGCAA...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTC...
|| || | |||||||||||||||||||| ||  ||| ||  |||||||| ||||||||| |||| |||| |||||...
TTTATGTATTTTTTTTTTGAGACAGAGTTTCATTCTTGTTGCCCAGGCTGGAGTGCAATGGTGCGATCTCGGCTC...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||||||||||| ||||||  |||||||| | ||||| | |  | |||||||||||||||...
TTTTTTTTTTTTTTGAGACAGAGTCTCACTCTGTTGCCCAGGCTGGTGTGCAGTGGCACAATCTTGGCTCACTGC...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTTTGAG

ATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
||||  |||||||||||||||||||||  ||||||| |||||||| |||||   | | || |||| |||||||||...
ATTTCCTTTTTTTTTGAGACAGAGTCTAGCTCTGTCGCCCAGGCTGGAGTGTGGTGGCGCGATCTCGGCTCACTG...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
|| |||||||||| ||||  ||||||| ||||||| |||||||| ||||||||| |||| |||| ||||||||||...
TTGTTTTTTTTTTCGAGATGGAGTCTCACTCTGTCGCCCAGGCTGGAGTGCAATGGTGCGATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAA...
||| ||||||||||||||||||||| |||||||||||||||| ||||||| | | || |||| ||||||||||||...
TTTATTTTTTTTGAGACAGAGTCTCACTCTGTCACCCAGGCTGGAGTGCAGTGGCGCAATCTCGGCTCACTGCAA...

****Alignment****
sequence:  gi|2033715243|gb|CP068259

sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
|||| |||||||||||||| ||||||| |||||||||||||| | ||||||| | | || ||||  |||||||||...
TTTTCTTTTTTTTTGAGACGGAGTCTCGCTCTGTCACCCAGGATGGAGTGCAGTGGCGCGATCTCAGCTCACTGC...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAA...
||| ||| ||| ||||||||||||  ||||||||| |||||| ||||||||| |    |||||||||||||||||...
TTTATTTATTTAGAGACAGAGTCTTACTCTGTCACTCAGGCTGGAGTGCAATGGCATGATCTTGGCTCACTGCAA...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTTTGAGACA-GAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACT...
||||||||||||| |||||| ||||||| |||| |||||||||||| ||| ||| | |  | |||| ||||||||...
TTTTTTTTTTTTTGGAGACAAGAGTCTCACTCTCGT

lenght: 61707364
TTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAAC...
||| ||||| |||| ||||||||| ||||||| |||||||| ||||||| | |||  ||||||||||||||||||...
TTTATTTTTGTGAGTCAGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGTGTGATCTTGGCTCACTGCAAC...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
ATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||||||||||| ||||||| ||||||| |||||||| ||||||| | |  | ||||||||||||||...
ATTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCACAATCTTGGCTCACTG...

****Alignment****
sequence:  gi|2033715243|gb|CP068259.2| Homo sapiens isolate CHM13 chromosome 19
lenght: 61707364
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| |||||||||||||||| ||||||| | |||| ||||  |||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCTCTCTGTCACCCAGGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGC...

****Alignment****
sequence:  gi|20337

sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| |||||||||||||||| ||||||| | | || |||| ||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCACTCTGTCACCCAGGCTGGAGTGCAGTGGCGCAATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||  ||||||| |||||||| ||||||||| |  | |||||||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTTGCTCTGTCGCCCAGGCTGGAGTGCAATGGCACAATCTTGGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||||||||||||||| || |||| ||  |||||||| |||||||||  ||  ||||||

sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||| |||||||| |||||||||| |||||||||||||||| ||||||| | | ||||||| |||||||| |...
TTTTTTTCTTTTTTGACACAGAGTCTCGCTCTGTCACCCAGGCTGGAGTGCAGTGGCGCCATCTCGGCTCACTAC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
|| ||| ||||||||||||||||| || |||||||||||||||| ||||||| | || | | ||  |||||||||...
TTGTTTGTTTTTTTGAGACAGAGTTTCTCTCTGTCACCCAGGCTGGAGTGCAGTGGTACGACCTCAGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||  ||||||| ||||||| |||||||| ||||||| | |||| |||| ||

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
||||||| |||| ||||||  ||| |  |||| ||| |||||||| ||||||||| |||  |||| |||||||||...
TTTTTTTCTTTTGTGAGACCAAGTTTTGCTCTTGTCCCCCAGGCTGGAGTGCAATGGTGTGATCTCGGCTCACTG...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||||||||||||||  || |||| ||  |||||||| ||||||||| | || |||| ||||||| |...
TTTTTTTTTTTTTTGAGACAGAGCTTCGCTCTTGTTGCCCAGGCTGGAGTGCAATGGCGCAATCTCGGCTCACCG...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||||||||||| ||||||  |||||||| ||||

TTTTTTTTTTTTTGAGACAGAGTTTTGCTCTTGTTGCCCAGGCTGGAATGCAGTGGCACAATCTCGGCTCACCGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||| |||||||| ||  ||||||||||||||| ||||||| | |||| ||||  |||||||||...
TTTTTTTTTTTTTTGCGACAGAGTTTCATTCTGTCACCCAGGCTGGAGTGCAGTGGTGCAATCTCAGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||| ||||||||||||||||||| ||||||    ||||||  |||||| | |||| ||||  |||||||||...
TTTTTTTCTTTTTTGAGACAGAGTCTCACTCTGTTGTGCAGGCTGCAGTGCAGTGGTGCGATCTCAGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTT--TTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTA

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| ||||||| |||||||| ||||||  | | || ||||  |||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCACTCTGTCGCCCAGGCTGGAGTGCGGTGGCGCGATCTCAGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||| || | |||||||   ||||| ||||||||| |||| |||||||||||||||...
TTTTTTTTTTTTTTGAGACGGAGACTGC-TCTGTCATTTAGGCTGGAGTGCAATGGTGCAATCTTGGCTCACTGC...

****Alignment****
sequence:  gi|1909942460|dbj|AP023479.1| Homo sapiens DNA, chromosome 19, nearly complete genome
lenght: 59105444
ATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
||||||||||||||||||| |||||||| ||||||  |||||||| |||

lenght: 44544
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCA-----ATAGTGCCATCTTGGCTC...
||||||||||||||  | |||||| |  || |||  |||||| | |||| ||     || | || ||||||||||...
TTTTTTTTTTTTTTTTGGCAGAGTATTGCTGTGTTGCCCAGGGTGGAGTACAGTGGCATGGGGCAATCTTGGCTC...

****Alignment****
sequence:  gi|2935596|gb|AC004262.1|AC004262 Homo sapiens chromosome 19, cosmid R29368, complete sequence
lenght: 44544
TTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAA...
||||||||||||||||||||||||  ||||||  || ||||| |||||  || | |  | |||||||||||||||...
TTTTTTTTTTTTGAGACAGAGTCTTGCTCTGTTGCCTAGGCTGGAGTGTGATGGCGTGACCTTGGCTCACTGCAA...

****Alignment****
sequence:  gi|2935596|gb|AC004262.1|AC004262 Homo sapiens chromosome 19, cosmid R29368, complete sequence
lenght: 44544
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||| ||||| ||||||| | ||  ||||||||||||| || ||||||| | |  ||||||  |||||||||...
TTTTTTTCTTTTTAGAGACAGGGCCTTGCTCTGTCACCCAGTCTGGAGTGCAGTGGCACCATCTCTGCTCACTGC.

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
|||| |||||||||||||| ||||||| |||| ||||||||||| ||||||| | |||| |||| ||||||||||...
TTTTCTTTTTTTTTGAGACGGAGTCTCGCTCTATCACCCAGGCTGGAGTGCAGTGGTGCAATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||  ||||||| ||||||  |||||||| ||||||||| |||| |||||||||||||||...
TTTTTTTTTTTTTTGAGATGGAGTCTCACTCTGTTGCCCAGGCTGGAGTGCAATGGTGCGATCTTGGCTCACTGC...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCA...
|||||||||||||||||  | ||||| |||| ||||||||||| ||||||||| | || ||||||||||||||||...
TTTTTTTTTTTTTGAGAT

sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTGGAGTTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCT...
||||||| |||  | ||||||||||||||  |||| |  |||||||||||||||| ||||||| |||||  ||||...
TTTGGAGGTAAAGTCTTTTTTTTTTGAGATGGAGTTTTGCTCTGTCACCCAGGCTGGAGTGCAGTAGTGTGATCT...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| ||||||| |||||||| ||||||| | | || |||| ||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
||||||||||||||||||  |||| |  |||| ||  |||||||| ||||||||| | || |||| ||||||| |...
TTTTTTTTTTTTTTGAGATGGAGTTTTGCTCTTGTT

lenght: 96330374
ATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||| ||||||||||||||  |||| || || ||| | ||||||| | | || |||| |||||||||...
ATTTTTTTTTTTCTTGAGACAGAGTCTTGCTCTATCGCCAAGGTTGGAGTGCAGTGGCGCGATCTCGGCTCACTG...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
|||||||||||||| |||  ||||||| ||||||  |||||||| ||||||| | |||  |||| ||||||||||...
TTTTTTTTTTTTTTAAGATGGAGTCTCACTCTGTTGCCCAGGCTGGAGTGCAGTGGTGTGATCTCGGCTCACTGC...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCC-ATCTTGGCTCACT...
||||||||||||||||||  ||||||| || | ||| |||||||| ||||||||  |   | |||| ||||||| ...
TTTTTTTTTTTTTTGAGATGGAGTCTCGCTGTTGTCGCCCAGGCTGGAGTGCAACGGGCTCGATCTCGGCTCACC...

****Alignment****
sequence:  gi|20337

sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCA...
||| || ||| ||| ||| |||||||||||| ||||||||||||||||||||||||   |||  |||| ||||||...
TTATTTATTTATTTATTTAAGACAGAGTCTCGCTCTGTCACCCAGGCTAGAGTGCAGAGGTGTGATCTCGGCTCA...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCA...
|||| |||||| |||||||||| |  |||| ||  | ||||||||||| |||| | |  ||||||||||||||||...
TTTTGTTTTTTAGAGACAGAGTTTTGCTCTTGTAGCTCAGGCTAGAGTACAATGGCGTGATCTTGGCTCACTGCA...

****Alignment****
sequence:  gi|2033714908|gb|CP068262.2| Homo sapiens isolate CHM13 chromosome 16
lenght: 96330374
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||| ||||||| |||||||| ||||||| | | |  |||| ||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTCGCTCTGTCG

lenght: 94690957
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||||||||||||||||||||||| || |||| ||  |||||||| ||||||||| |||| |||||||||||| |...
TTTTTTTTTTTTTTGAGACAGAGTTTCGCTCTTGTTGCCCAGGCTGGAGTGCAATGGTGCAATCTTGGCTCACCG...

****Alignment****
sequence:  gi|1909942457|dbj|AP023476.1| Homo sapiens DNA, chromosome 16, nearly complete genome
lenght: 94690957
TAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCA...
||||||||||||||||||||||||||| |  |||| ||  |||||||| |||| |||| |||| |||||||||||...
TAATTTTTTTTTTTTTTGAGACAGAGTTTTGCTCTTGTTGCCCAGGCTGGAGTACAATGGTGCGATCTTGGCTCA...

****Alignment****
sequence:  gi|1909942457|dbj|AP023476.1| Homo sapiens DNA, chromosome 16, nearly complete genome
lenght: 94690957
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||| ||||||  | ||||  |||||||| ||||||| | |||| |||||||||||||||...
TTTTTTTTTTTTTTGAGACGGAGTCTTGCCCTGTTGCCCAGGCTGGAGTGCAGTGGTGCGATCTTGGCTCACTGC...

****A

TTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAACA...
|||| |||||||||||||||||| |||| || |||||||| ||||||| | |  | |||| ||||||||||||| ...
TTTTGTTTTTGAGACAGAGTCTCGCTCTATCGCCCAGGCTGGAGTGCATTGGCACGATCTCGGCTCACTGCAACC...

****Alignment****
sequence:  gi|1909942457|dbj|AP023476.1| Homo sapiens DNA, chromosome 16, nearly complete genome
lenght: 94690957
TTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCA...
|||||||||||||||||   |||||   ||||||||  ||||| ||||||||  |  | ||||||||||||||||...
TTTTTTTTTTTTTGAGATGAAGTCTTGTTCTGTCACT-AGGCTGGAGTGCAACGGCACGATCTTGGCTCACTGCA...

****Alignment****
sequence:  gi|1909942457|dbj|AP023476.1| Homo sapiens DNA, chromosome 16, nearly complete genome
lenght: 94690957
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||| | ||||||| |||||||||||||||| ||||| | | | ||||||| ||||||||||...
TTTTTTTTTTTTTTGAGGCGGAGTCTCGCTCTGTCACCCAGGCTGGAGTGTAGTGGCGCCATCTCGGCTCACTGC...

****Alignment****
sequ

TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTG...
|||| ||||| |||  ||| ||||||| |||| ||  |||||||| ||||||||| | || | ||  ||||||||...
TTTTGTTTTTGTTTTTGACGGAGTCTCGCTCTTGTTCCCCAGGCTGGAGTGCAATGGCGCAACCTCAGCTCACTG...

****Alignment****
sequence:  gi|1909942457|dbj|AP023476.1| Homo sapiens DNA, chromosome 16, nearly complete genome
lenght: 94690957
TTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCA...
||| | |||||||||||||||||||  | |  |  ||||||||  |||||||| |||  ||||||||||||||||...
TTTGTATTTTTTTGAGACAGAGTCTTGCGCCATTGCCCAGGCTGCAGTGCAATGGTGTAATCTTGGCTCACTGCA...

****Alignment****
sequence:  gi|1909942457|dbj|AP023476.1| Homo sapiens DNA, chromosome 16, nearly complete genome
lenght: 94690957
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
|||||||||||||||||||| |||||| ||||||  |||||||| |||| |||| |||  |||| ||||||||||...
TTTTTTTTTTTTTTGAGACAAAGTCTCACTCTGTTGCCCAGGCTGGAGTACAATGGTGTGATCTGGGCTCACTGC...

****Alignment****
sequ


****Alignment****
sequence:  gi|16596520|gb|AC008870.8| Homo sapiens chromosome 16 clone CTD-2196E14, complete sequence
lenght: 150522
TTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCA-----AT-----AGTGCCATCTTG----...
||| ||| ||| |||||||||| ||| || ||||||||| |||||||     ||     | ||| || |||    ...
TTTCTTTCTGAAACAGAGTCTCACTCAGT-ACCCAGGCTGGAGTGCATGGGCATGACTCACTGC-ATTTTGAGGT...

****Alignment****
sequence:  gi|16596520|gb|AC008870.8| Homo sapiens chromosome 16 clone CTD-2196E14, complete sequence
lenght: 150522
TTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGC...
||||||||||||||||||||| ||||| || ||| |       || |||||| | |||  | | | |||||||||...
TTTTTTTTTTTTTTGAGACAGGGTCTCACTGTGTTA-------TACAGTGCAGTGGTGTGACCATAGCTCACTGC...

****Alignment****
sequence:  gi|16596520|gb|AC008870.8| Homo sapiens chromosome 16 clone CTD-2196E14, complete sequence
lenght: 150522
TTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCAT--CT-TGGCTCACT...
|||||  |||||||||| || |||||  |||||  ||||

TAATTTTTTTTTTTTTTGAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCAC...
||||||||||||||||| |||||   |||  ||||||| || ||||| ||||||| | |  | ||||||||||||...
TAATTTTTTTTTTTTTT-AGACAAGATCTTGCTCTGTCGCC-AGGCTGGAGTGCAGTGGCACGATCTTGGCTCAC...

****Alignment****
sequence:  gi|1024846813|ref|NG_046987.1| Homo sapiens DEAD-box helicase 10 (DDX10), RefSeqGene on chromosome 11
lenght: 282906
TTTTTTTTTGAGACAGAGTCTCCCTCT-GTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACTGCAACA...
|||||||||||||  |||| |  |||| ||  |||||||| ||||||||| |  | ||||  |||||| ||| | ...
TTTTTTTTTGAGATGGAGTTTTGCTCTTGTTGCCCAGGCTGGAGTGCAATGGCACGATCTCCGCTCACCGCAGCC...

****Alignment****
sequence:  gi|1024846813|ref|NG_046987.1| Homo sapiens DEAD-box helicase 10 (DDX10), RefSeqGene on chromosome 11
lenght: 282906
TTTTTTTTTTTTTT--GAGACAGAGTCTCCCTCTGTCACCCAGGCTAGAGTGCAATAGTGCCATCTTGGCTCACT...
||||||||||||||  ||||  ||||||| || |||| |||||||| ||||||| | | || | |||||||||||...
TTTTTTTTTTTTTTCCGAGATGGAGTCTCGCTGTGTCGCCCAGGCTGGAGTGCAGTGGCGCGAACTTGGCTCAC

In [17]:
#filtering the "predicted" alignments
results_Blastn= open('blast_DNA_GLI2_file.xml')
blastn_records = NCBIXML.read(results_Blastn)
E_VALUE_THRESH = 0.001
count_preditc, count_homo=0,0
list_filtered_alignments=[]
for alignment in  blastn_records.alignments:
    for hsp in alignment.hsps:
        if hsp.expect < E_VALUE_THRESH:
            exist = re.search(r'PREDICTED:\s', alignment.title)
            if exist:
                # print( existe[0] )
                pre = re.match(r'PREDICTED:\s', exist[0] )
                if pre:
                    count_preditc+=1
                    #print(id)
            else:
                list_filtered_alignments.append(alignment.accession)
                homo=re.search(r'Homo\ssapiens',alignment.title)
                if homo:
                    count_homo+=1
print(list_filtered_alignments)
print('Total {} PREDICTED seqs found and remaining {} ids are from Homo sapiens'.format(count_preditc, count_homo))
#counting of Homo sapiens it's not conclusive, just to have an ideia, because the titles don't follow any pattern

['NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'NG_009030', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'AC016764', 'NM_001374354', 'NM_001374354', 'NM_001374354', 'NM_001374354', 'NM_001374353', 'NM_001374353', 'NM_001374353', 'NM_001374353', 'NM_001371271', 'NM_001371271', 'NM_00

In [18]:
with open('CDS_GLI2_nucleotide_result_blast.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## BLASTP

In [None]:
record = SeqIO.read(open("10001bp_sequence_GLI2.gb"), format="gb") 
print (len(record.seq))

In [13]:
Trans=[]
record = SeqIO.read(open("10001bp_sequence_GLI2.gb"), format="gb") 
for feat in record.features:
    if feat.type == "CDS":
        Trans.append(str(feat.qualifiers['translation']))
        print(feat.qualifiers['translation'])

['METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHNVNPGPLPPCADRRGLRLQSHPST

In [None]:
result_handle = NCBIWWW.qblast("blastp", "swissprot", Trans)

In [None]:
with open('blast_PROT_GLI2_file.xml', "w") as out_handle:
        out_handle.write(result_handle.read())
result_handle.close()

In [2]:
results_Blastp = open("blast_PROT_GLI2_file.xml")
blastp_records = NCBIXML.read(results_Blastp)
for parameter in blastp_records.alignments[0:5]:
    print()
    print('Accession: ', parameter.accession)
    print('Definition: ', parameter.hit_def)    
    for e_v in parameter.hsps:
        print('E-value: ', e_v.expect)


Accession:  P10070
Definition:  RecName: Full=Zinc finger protein GLI2; AltName: Full=GLI family zinc finger protein 2; AltName: Full=Tax helper protein [Homo sapiens]
E-value:  0.0

Accession:  Q0VGT2
Definition:  RecName: Full=Zinc finger protein GLI2; AltName: Full=Tax helper protein [Mus musculus]
E-value:  0.0

Accession:  Q91661
Definition:  RecName: Full=Zinc finger protein GLI4; AltName: Full=Neural-specific DNA-binding protein xGLI4; Short=xGLI-4 [Xenopus laevis]
E-value:  0.0

Accession:  Q9IA31
Definition:  RecName: Full=Transcriptional activator GLI3; AltName: Full=GLI3 full-length protein; Short=GLI3FL; Contains: RecName: Full=Transcriptional repressor GLI3R; AltName: Full=GLI3 C-terminally truncated form [Gallus gallus]
E-value:  0.0
E-value:  1.2649e-18

Accession:  P55879
Definition:  RecName: Full=Zinc finger protein GLI2 [Gallus gallus]
E-value:  0.0


In [19]:
results_Blastp= open('blast_PROT_GLI2_file.xml')
blastp_records = NCBIXML.read(results_Blastp)
E_VALUE_THRESH = 0.001
list_filtered_alignments,list_species=[],[]
for alignment in  blastp_records.alignments:
    for hsp in alignment.hsps:
#         print(hsp.identities)    # maybe add more 
        if hsp.expect < E_VALUE_THRESH:
            list_filtered_alignments.append(alignment.accession)
            title_organism=re.search(r'\[.+\s.+\]', alignment.title)
            if title_organism:
                m = re.match(r'\[.+\s.+\]', title_organism[0] )
                specie = m.group(0)
                #print(specie)
                list_species.append(specie)
for x in sorted(set(list_species)):
    print("number of times: {} that appeared specie: {}".format(list_species.count(x),x))

print(list_filtered_alignments)

number of times: 5 that appeared specie: [Bos taurus]
number of times: 1 that appeared specie: [Caenorhabditis briggsae]
number of times: 1 that appeared specie: [Caenorhabditis elegans]
number of times: 4 that appeared specie: [Danio rerio]
number of times: 3 that appeared specie: [Drosophila melanogaster]
number of times: 1 that appeared specie: [Drosophila yakuba]
number of times: 5 that appeared specie: [Gallus gallus]
number of times: 21 that appeared specie: [Homo sapiens]
number of times: 14 that appeared specie: [Mus musculus]
number of times: 2 that appeared specie: [Pan troglodytes]
number of times: 17 that appeared specie: [Xenopus laevis]
number of times: 6 that appeared specie: [Xenopus tropicalis]
['P10070', 'Q0VGT2', 'Q91661', 'Q9IA31', 'Q9IA31', 'P55879', 'Q91660', 'Q91660', 'P10071', 'P10071', 'Q5IS56', 'Q5IS56', 'Q61602', 'Q61602', 'Q91690', 'Q91690', 'P55878', 'P08151', 'P47806', 'P47806', 'P19538', 'Q8NEA6', 'Q8NEA6', 'Q6XP49', 'Q6XP49', 'Q8NBF1', 'Q8K1M4', 'P34708'

In [20]:
with open('CDS_GLI2_protein_result_blastp.txt', 'w') as f:
    for line in list_filtered_alignments:
        f.write(f"{line}\n")

## Uniprot search of Blastp results

In [None]:
def get_url(url, **kwargs):
    response = requests.get(url, **kwargs);

    if not response.ok:
        print(response.text)
        response.raise_for_status()
        sys.exit()

    return response

In [None]:
#saving sequences
results_Blastp= open('blastp_CDS_prot_seq.xml')
blastp_records = NCBIXML.read(results_Blastp)
titles_list=[]
for alignment in  blastp_records.alignments:   
    titles_list.append(alignment.title)

file= open("CDS_protein_result_blastp.txt", "r")
fields="sequence"
WEBSITE_API="https://rest.uniprot.org"
seqs=[]

for i in file:
    r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
    seqs.append(str(r.content))

with open('allOrg_CDS_prot_new.fasta', 'w') as f:
    for index, seq in enumerate(seqs):
        existe = re.search(r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL)
        if existe:
            m = re.match( r'b\'Sequence\\n(.+?(?=\\n\'))', seq, re.DOTALL )
            f.write(f">{titles_list[index]}\n{m.group(1)}\n\n")                    
            #print(m.group(1) )       

In [None]:
file= open("CDS_protein_result_blastp.txt", "r")
fields="accession,organism_name,protein_name,cc_subcellular_location,cc_function"
WEBSITE_API="https://rest.uniprot.org"
with open('uniprot_result_CDS_filtered.txt', 'w',encoding='utf-8') as f:
    for i in file:
        r=get_url("{}/uniprotkb/search?query={} AND (reviewed:true)&fields={}&size=1&format=tsv".format(WEBSITE_API,i, fields))
        print(r.text)
        f.write(r.text)
        f.write('\n')

## Alignment and Phylo

In [16]:
lista=[]
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    print (a)
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SeqIO.read(handle, "swiss")
        lista.append(len(seq_record.seq))
print(min(lista))
print (lista)

334
[1586, 1544, 1361, 1544, 663, 1569, 1580, 1580, 1583, 1360, 556, 1106, 1111, 1397, 775, 780, 620, 789, 1110, 1165, 492, 524, 521, 384, 341, 515, 403, 334, 622, 530, 663, 467, 497, 503, 532, 530, 466, 444, 443, 447, 447, 609, 441, 441, 613, 565, 638, 567, 623, 753]


In [48]:
results_Blast= open('blast_PROT_GLI2_file.xml')
blast_records = NCBIXML.read(results_Blast)
for alignment in  blast_records.alignments:
    #print (alignment.hit_id)
    m = alignment.hit_id.replace("sp|","")
    a=m[0:6]
    print(f"id:{a}")
    with ExPASy.get_sprot_raw(a) as handle:
        seq_record = SwissProt.read(handle)
        print(">",seq_record.entry_name,"\n",seq_record.description, "\n", seq_record.organism_classification)
        print(f"keywords:{seq_record.keywords}")
        print()
        

id:P10070
> GLI2_HUMAN 
 RecName: Full=Zinc finger protein GLI2 {ECO:0000305}; AltName: Full=GLI family zinc finger protein 2 {ECO:0000312|HGNC:HGNC:4318}; AltName: Full=Tax helper protein {ECO:0000303|PubMed:9557682}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
keywords:['Acetylation', 'Activator', 'Alternative splicing', 'Cell projection', 'Cilium', 'Cytoplasm', 'Developmental protein', 'Disease variant', 'DNA-binding', 'Holoprosencephaly', 'Isopeptide bond', 'Metal-binding', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Repressor', 'Transcription', 'Transcription regulation', 'Ubl conjugation', 'Zinc', 'Zinc-finger']

id:Q0VGT2
> GLI2_MOUSE 
 RecName: Full=Zinc finger protein GLI2 {ECO:0000305}; AltName: Full=Tax helper protein {ECO:0000250|UniProtKB:P10070}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Eu

> CI_DROME 
 RecName: Full=Transcriptional activator cubitus interruptus; Short=Transcriptional activator ci; AltName: Full=ci form of 155 kDa; Short=ci-155; AltName: Full=ci full-length protein; Short=ciFL; Contains: RecName: Full=Transcriptional repressor cubitus interruptus; Short=Transcriptional repressor ci; AltName: Full=ci C-terminally truncated form; AltName: Full=ci form of 75 kDa; Short=ci-75; 
 ['Eukaryota', 'Metazoa', 'Ecdysozoa', 'Arthropoda', 'Hexapoda', 'Insecta', 'Pterygota', 'Neoptera', 'Endopterygota', 'Diptera', 'Brachycera', 'Muscomorpha', 'Ephydroidea', 'Drosophilidae', 'Drosophila', 'Sophophora']
keywords:['3D-structure', 'Activator', 'Developmental protein', 'DNA-binding', 'Metal-binding', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Repressor', 'Segmentation polarity protein', 'Transcription', 'Transcription regulation', 'Ubl conjugation', 'Zinc', 'Zinc-finger']

id:Q8NEA6
> GLIS3_HUMAN 
 RecName: Full=Zinc finger protein GLIS3; AltName: Full=GL

> ZIC5_MOUSE 
 RecName: Full=Zinc finger protein ZIC 5; AltName: Full=Odd paired-related protein; Short=Opa-related protein; AltName: Full=Zinc finger protein of the cerebellum 5; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Glires', 'Rodentia', 'Myomorpha', 'Muroidea', 'Muridae', 'Murinae', 'Mus', 'Mus']
keywords:['Developmental protein', 'Differentiation', 'DNA-binding', 'Metal-binding', 'Neurogenesis', 'Nucleus', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Zinc', 'Zinc-finger']

id:A0JC51
> ZIC4_XENLA 
 RecName: Full=Zinc finger protein ZIC 4 {ECO:0000303|PubMed:16871625}; Short=XlZic4 {ECO:0000303|PubMed:16871625}; AltName: Full=Zinc finger protein of the cerebellum 4 {ECO:0000250|UniProtKB:Q8N9L1}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Amphibia', 'Batrachia', 'Anura', 'Pipoidea', 'Pipidae', 'Xenopodinae', 'Xenopus', 'Xenopus']
keywords:['Developmental

> ZIC3_XENTR 
 RecName: Full=Zinc finger protein ZIC 3 {ECO:0000250|UniProtKB:O57311}; AltName: Full=Zinc finger protein of the cerebellum 3 {ECO:0000250|UniProtKB:O57311}; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Amphibia', 'Batrachia', 'Anura', 'Pipoidea', 'Pipidae', 'Xenopodinae', 'Xenopus', 'Silurana']
keywords:['Activator', 'Cytoplasm', 'Developmental protein', 'Differentiation', 'DNA-binding', 'Metal-binding', 'Neurogenesis', 'Nucleus', 'Reference proteome', 'Repeat', 'Transcription', 'Transcription regulation', 'Zinc', 'Zinc-finger']

id:O57311
> ZIC3_XENLA 
 RecName: Full=Zinc finger protein ZIC 3; Short=XZic3; Short=XlZic3; AltName: Full=Zinc finger protein Zic3-A; AltName: Full=Zinc finger protein of the cerebellum 3; 
 ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Amphibia', 'Batrachia', 'Anura', 'Pipoidea', 'Pipidae', 'Xenopodinae', 'Xenopus', 'Xenopus']
keywords:['Activator', 'Cytoplasm', 'Developme

In [23]:
alignments = AlignIO.parse("blast_PROT_GLI2_SEQ_file.fasta", "fasta") 
for alignment in alignments: 
    print (alignment)

Alignment with 50 rows and 334 columns
METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAA...TSI P10070
METSAPAPALEKKEAKSGLLEDSSFPDPGKKACPLAVAAAVAAH...TMP Q0VGT2
MEHYLRSVHNSPTLSMISAARGLSPAEVAHEHLKERGIYGLAPP...EQK Q91661
MEAQSHSSTTTEKKKVENSIVKCSNRTDVSEKAVASSTTSNEDE...GSY Q9IA31
LMAGHPNYGDILMQSGGAAGTAHLHEYLSPVDVSRFSSPRVTPR...CEH P55879
MEAQSRSTTASEKKKVENSIVKGHSRTEVSEKAVASSTTSNEDE...SAS Q91660
MEAQSHSSTTTEKKKVENSIVKCSTRTDVSEKAVASSTTSNEDE...GSY P10071
MEAQSHSSTTTEKKKVENSIVKCSTRTDVSEKAVASSTTSNEDE...GSY Q5IS56
MEAQAHSSTATERKKAENSIGKCPTRTDVSEKAVASSTTSNEDE...GSY Q61602
MASRQCPPAAVFNSMNPPVNSYVEHCYLRSPNVMAEGMNEMPYC...NLK Q91690
MFNPVTPQARPYAEHCCPRPLHGASAGTPGLQGLDFPVCHQPNL...THL P55878
MFNSMTPPPISSYGEPCCLRPLPSQGAPSVGTEGLSGPPFCHQA...MCE P08151
MFNPMTPPQVNSYSEPCCLRPLHSQGVPSMGTEGLSGLPFCHQA...KPY P47806
MDAYALPTYFPLAYSELQFLASRRAAAVAAAATVLPGSPCINQH...TSP P19538
MMVQRLGLISPPASQVSTACNQISPSLQRAMNAANLNIPPSDTR...LDD Q8NEA6
MMVQRLGPISPPASQVSTACKQISPSLPRAVNAANLNRPPSDTR...LEE Q6XP49
MAEARTSLSAHCRGPLATGLHPDLDLPGRSLAT

In [19]:
record = SeqIO.parse(open("blast_PROT_GLI2_SEQ_file.fasta"), format="fasta")

In [22]:
lista = []
for seq in record:
    #print(seq)
    lista.append(seq)

alinhamento = MultipleSeqAlignment(lista)


Alignment with 0 rows and 0 columns


In [27]:
file = AlignIO.write(alinhamento, "resultados_alinhamento_multiplo", "fasta")

In [28]:
record_seq = AlignIO.parse("resultados_alinhamento_multiplo", "fasta")

converter_stock = AlignIO.parse("resultados_alinhamento_multiplo", "fasta")
ficheirostock = AlignIO.write([c for c in converter_stock], "resultados_alinhamento_multiplo_stock.sth", "stockholm")

In [30]:
alinhamento_stock = AlignIO.read("resultados_alinhamento_multiplo_stock.sth", "stockholm")
print(alinhamento_stock)

Alignment with 49 rows and 334 columns
METSAPAPALEKKEAKSGLLEDSSFPDPGKKACPLAVAAAVAAH...TMP Q0VGT2
MEHYLRSVHNSPTLSMISAARGLSPAEVAHEHLKERGIYGLAPP...EQK Q91661
MEAQSHSSTTTEKKKVENSIVKCSNRTDVSEKAVASSTTSNEDE...GSY Q9IA31
LMAGHPNYGDILMQSGGAAGTAHLHEYLSPVDVSRFSSPRVTPR...CEH P55879
MEAQSRSTTASEKKKVENSIVKGHSRTEVSEKAVASSTTSNEDE...SAS Q91660
MEAQSHSSTTTEKKKVENSIVKCSTRTDVSEKAVASSTTSNEDE...GSY P10071
MEAQSHSSTTTEKKKVENSIVKCSTRTDVSEKAVASSTTSNEDE...GSY Q5IS56
MEAQAHSSTATERKKAENSIGKCPTRTDVSEKAVASSTTSNEDE...GSY Q61602
MASRQCPPAAVFNSMNPPVNSYVEHCYLRSPNVMAEGMNEMPYC...NLK Q91690
MFNPVTPQARPYAEHCCPRPLHGASAGTPGLQGLDFPVCHQPNL...THL P55878
MFNSMTPPPISSYGEPCCLRPLPSQGAPSVGTEGLSGPPFCHQA...MCE P08151
MFNPMTPPQVNSYSEPCCLRPLHSQGVPSMGTEGLSGLPFCHQA...KPY P47806
MDAYALPTYFPLAYSELQFLASRRAAAVAAAATVLPGSPCINQH...TSP P19538
MMVQRLGLISPPASQVSTACNQISPSLQRAMNAANLNIPPSDTR...LDD Q8NEA6
MMVQRLGPISPPASQVSTACKQISPSLPRAVNAANLNRPPSDTR...LEE Q6XP49
MAEARTSLSAHCRGPLATGLHPDLDLPGRSLATPAPSCYLLGSE...RYT Q8NBF1
MHCEVAEALSDKRPKEAPGAPGQGRGPVSLGAH

In [32]:
calculator = DistanceCalculator('blosum62')
dm1 = calculator.get_distance(alinhamento_stock)
print(dm1)

Q0VGT2	0
Q91661	1.1585227272727272	0
Q9IA31	1.1387151790790222	1.1403409090909091	0
P55879	1.1666666666666667	1.175055928411633	1.145973154362416	0
Q91660	1.0748999428244712	1.134659090909091	0.7686185332575327	1.175055928411633	0
P10071	1.1322690992018245	1.1278409090909092	0.08129619101762364	1.1515659955257271	0.7805017103762828	0
Q5IS56	1.133257403189066	1.128409090909091	0.07447413303013073	1.1498881431767338	0.7835990888382688	0.006833712984054663	0
Q61602	1.1392836839113132	1.1375	0.08698123934053437	1.1549217002237135	0.7942012507106311	0.0454803865832859	0.04377487208641273	0
Q91690	1.1770546056260343	1.110865968008825	1.1583011583011582	1.14726971869829	1.1798124655267512	1.1489244346387204	1.1494760066188638	1.150027578599007	0
P55878	1.1580680570801318	1.1838638858397366	1.1361141602634468	1.12403951701427	1.159165751920966	1.1333699231613612	1.1344676180021953	1.1383095499451152	1.1383095499451152	0
P08151	1.149481723949809	1.1587561374795416	1.1789416257501364	1.153300600

In [35]:
constructor = DistanceTreeConstructor()
upgmatree = constructor.upgma(dm1)

In [38]:
Phylo.draw_ascii(upgmatree)

                                                 _ Q8VDL9
         _______________________________________|
   _____|                                       |_ Q9BZE0
  |     |
 ,|     |_________________________________________ Q98T94
 ||
 ||  _____________________________________________ Q14872
 ||_|
 |  |_____________________________________________ Q8NBF1
 |
 |                                 __________________________ Q1LYE3
 |               _________________|
 |              |                 |                         , Q58DZ6
 |              |                 |_________________________|
 |      ________|                                           | Q91853
 |     |        |
 |     |        |        ____________________________________ P52747
 |   __|        |_______|
 |  |  |                |____________________________________ A6QQW0
_|  |  |
 |  |  |   ___________________________________________ Q17308
 | _|  |__|
 || |     |___________________________________________ P34708
 || |

In [39]:
njtree = constructor.nj(dm1)
Phylo.draw_ascii(njtree)

    _______________________________________________________________ Q61467
  _|
 | |        ______________________________________________________ P47806
 | |_______|
 |         |_______________________________________________________ P08151
 |
 |                                                   _____________ Q62520
 |     _____________________________________________|
 |    |                                             |____________ O95409
 |  __|
 | |  |          _________________________________________________ Q91689
 | |  |_________|
 | |            |________________________________________________ Q9YIB7
 | |
 | |                                                              , P46684
 | |         _____________________________________________________|
 |,|    ____|                                                     | Q15915
 |||   |    |
 |||   |    |______________________________________________________ O73689
 ||| __|
 ||||  |                                                    