In [1]:
# 1. imports 

# pip install biotite plotly nbformat 

import plotly.express as px 
from biotite.sequence.io.fasta import FastaFile

In [2]:
# 2. input file 

# uniprot "acylphosphatase family > with active site annotation > length less than 200 > to fasta"

# look at the file on disk 

input_file = "./fasta/raw.fa"

! head "$input_file"

>tr|A0A0Q0WE68|A0A0Q0WE68_9EURY acylphosphatase OS=Methanosaeta sp. SDB OX=1735328 GN=APR56_14035 PE=3 SV=1
MKRVDIIASGDVQKVGFRDVVQKIGRDLGLSGTVENREPYDVRIVAEGEEDGLKEFIEAL
KIKRGPIHVRELEVSWSEATGEFPYFKILRGDWQEELGERFDVAVGLLYRSIEIGEENLA
LGRENLALGKENLAVSKENLAIGKKMLEKQDTMIERQDETIGEIRGMRSDFQDHMEKRFT
KIEGEIAEIKAAIADIKGNA
>tr|A0A1S3CGZ8|A0A1S3CGZ8_CUCME acylphosphatase OS=Cucumis melo OX=3656 GN=LOC103500820 PE=3 SV=1
MASASSSSSAAFAVGPLIRNRSTRFLPCRNLHHKWSFKDAGSHSRLSLFNYYNSSPLLPS
LPIRHRLLLPHPPLHLLLRSRSPSFRYPLPLMASTVPHQAGPETPQSNPTKTVRVVIKGR
VQGVFYRDWTVENATELGLKGWVRNRRDGSVEALFSGRPESVTEMEQRCRRGPPAAMVTG
FQVFPSSDDPGPGFERLRTA


In [8]:
# 3. collect the accession, the sequence, and the sequence length of every record

# Parse the whole FASTA file; iterating `fn.items()` yields (header, sequence) pairs.
fn = FastaFile.read(input_file)

# Three parallel lists: entry i of each one describes the same record.
headers, sequences, lengths = [], [], []

for raw_header, raw_sequence in fn.items():
    # Keep only the second "|"-delimited field of the header,
    # e.g. "tr|A0A0Q0WE68|A0A0Q0WE68_9EURY ..." -> "A0A0Q0WE68".
    headers.append(raw_header.split("|")[1])
    sequences.append(str(raw_sequence))
    lengths.append(len(sequences[-1]))

print(f"Read {len(headers)} sequences")

# Display the first two sequences as a quick sanity check.
sequences[:2]

Read 28856 sequences


['MKRVDIIASGDVQKVGFRDVVQKIGRDLGLSGTVENREPYDVRIVAEGEEDGLKEFIEALKIKRGPIHVRELEVSWSEATGEFPYFKILRGDWQEELGERFDVAVGLLYRSIEIGEENLALGRENLALGKENLAVSKENLAIGKKMLEKQDTMIERQDETIGEIRGMRSDFQDHMEKRFTKIEGEIAEIKAAIADIKGNA',
 'MASASSSSSAAFAVGPLIRNRSTRFLPCRNLHHKWSFKDAGSHSRLSLFNYYNSSPLLPSLPIRHRLLLPHPPLHLLLRSRSPSFRYPLPLMASTVPHQAGPETPQSNPTKTVRVVIKGRVQGVFYRDWTVENATELGLKGWVRNRRDGSVEALFSGRPESVTEMEQRCRRGPPAAMVTGFQVFPSSDDPGPGFERLRTA']

In [4]:
# 4. examine the length distribution

# Title and axis label make the figure readable on its own. When `x` is an
# unnamed array, plotly express calls the column "x", which `labels` renames.
px.histogram(
    x=lengths,
    width=1024,
    title="Sequence length distribution",
    labels={"x": "Sequence length (residues)"},
)

In [5]:
# 5. construct a new FASTA with just the sequences we want

# Both bounds are exclusive: only lengths 65..127 pass the filter.
LENGTH_LOWER_BOUND = 64
LENGTH_UPPER_BOUND = 128

output_fasta = FastaFile()
count = 0

for header, sequence in zip(headers, sequences):
    keep = (
        LENGTH_LOWER_BOUND < len(sequence) < LENGTH_UPPER_BOUND
        and sequence.startswith("M")  # first residue must be "M"
        and "X" not in sequence       # drop sequences containing an "X" placeholder
    )
    if keep:
        count += 1
        output_fasta[header] = sequence

print(f"Selected {count} of {len(sequences)} sequences")

# Write the filtered records next to the raw input file.
output_fasta.write("fasta/hypf.fa")

Selected 26779 of 28856 sequences


In [6]:
# 6. next steps (TODO): examine the selected sequences bioinformatically

# TODO: Alignment?

# TODO: Embedding? (can we learn a good embedding?)

# TODO: Examine predicted structures in PyMOL