<a href="https://colab.research.google.com/github/ground-Jiang/DNA-sequences-translation/blob/main/DNA_sequences_into_protein_sequences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install biopython



In [2]:
#@title **input your DNA sequences in the right box**

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Form field: path of the nucleotide FASTA (.ffn) file whose records are translated
input_file = 'Pseudomonas_aeruginosa_PAO1_ORF.ffn' #@param {type:"string"}
# Destination of the translated protein FASTA file
output_file = '/content/output_file.fasta'

# Translate every DNA record (translate() is called with its default arguments)
# and wrap each result in a SeqRecord that keeps the id of the source record.
protein_records = [
    SeqRecord(record.seq.translate(), id=record.id, description="translated protein")
    for record in SeqIO.parse(input_file, "fasta")
]

# Write all translated records to the output file in a single call
SeqIO.write(protein_records, output_file, "fasta")

print(f"Protein sequences saved to {output_file}")




Protein sequences saved to /content/output_file.fasta


In [5]:
#@title **remove all the * in the sequences**

import os
from Bio import SeqIO
from Bio.Seq import Seq

def remove_asterisks(directory):
    """Strip every '*' from the sequences of all FASTA files in a directory.

    Each regular file directly inside ``directory`` whose name ends with
    ``.fasta`` is parsed, every ``*`` character is deleted from each record's
    sequence, and the records are written back to the same path, replacing
    the previous contents. Sub-directories are not searched, and a directory
    entry that merely carries a ``.fasta`` suffix is skipped.

    Args:
        directory: Path of the directory to scan.

    Returns:
        None. One ``Processed <filename>`` line is printed per rewritten file.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only regular files are candidates: a directory that happens to be
        # named "something.fasta" cannot be parsed and must not abort the run.
        if not (filename.endswith('.fasta') and os.path.isfile(file_path)):
            continue

        # Load all records first, because the same path is reopened for
        # writing (and therefore truncated) below.
        sequences = list(SeqIO.parse(file_path, 'fasta'))

        # Remove '*' from sequences
        for seq_record in sequences:
            seq_record.seq = Seq(str(seq_record.seq).replace('*', ''))

        # Write cleaned sequences back to the file
        with open(file_path, 'w') as output_file:
            SeqIO.write(sequences, output_file, 'fasta')
        print(f'Processed {filename}')


# Directory whose top-level .fasta files are cleaned
directory_path = '/content/'

# Rewrites each .fasta file found there in place, with '*' removed from every sequence
remove_asterisks(directory_path)

Processed output_file.fasta


In [6]:
import os
from Bio import SeqIO

# Input and output paths
input_file = 'output_file.fasta'  # Replace with the path to your input .fasta file
output_dir = '/content/output/'  # Replace with the path to your output directory

# Number of leading sequences to use; they are written out two per file
n_sequences = 500

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Read the sequences from the input .fasta file
sequences = list(SeqIO.parse(input_file, 'fasta'))

# Fail early when the input cannot supply the requested number of sequences
if len(sequences) < n_sequences:
    raise ValueError(f"The input file contains fewer than {n_sequences} sequences.")

# Keep only the leading n_sequences records
sequences = sequences[:n_sequences]

# Split the sequences into consecutive pairs and save each pair to its own file
n_files_written = 0
for i in range(0, n_sequences, 2):
    pair_sequences = sequences[i:i+2]
    # With an odd n_sequences the last slice holds a single record and is skipped
    if len(pair_sequences) == 2:
        # The file is named after the ids of the two records it contains
        file_name = f"{pair_sequences[0].id}_{pair_sequences[1].id}.fasta"
        pair_file_path = os.path.join(output_dir, file_name)
        SeqIO.write(pair_sequences, pair_file_path, 'fasta')
        n_files_written += 1

# Report the number of write operations performed instead of an assumed count
print(f'Created {n_files_written} .fasta files in the directory {output_dir}')


Created 250 .fasta files in the directory /content/output/


Set `fixed_sequence` to the protein sequence chosen for mining, and provide `output_file.fasta`, the file containing all of the translated protein sequences.
The script pairs `fixed_sequence` with each of the first 100 sequences in `output_file.fasta` and writes all of the pairs to a single JSON file in the AF3 server format.

In [11]:
import json
import os
from Bio import SeqIO

# Input and output paths
# NOTE(review): this relative name presumably refers to the protein FASTA that an
# earlier cell writes to /content/output_file.fasta; that only holds when the
# working directory is /content — confirm before running elsewhere.
input_file = 'output_file.fasta'  # Replace with the path to your input .fasta file
# JSON file that receives the list of jobs built further down in this cell
output_file = '1-100andAzeB_M1.json'  # Replace with the desired output path for the JSON file

# The fixed sequence to include in every pair
fixed_sequence = "RLPLSPYQRDIWVAAAQFPELDQYTIFSYDRFTGEVDTQALERALLQAARDTEAFRLRLGETDGTPYQWLDTDAEFEARHVDLRADRDPEAAVRSWLRDAFRHAYPLDGRSLVDLALLHSDQALYVYVRTHHIVSDAWGLQLFLSRVRAGYLGELGEPQAQMPTASLLAQLETDDYSGSEQYRGDRAYFAEALEGLEPALFTRRRPAGLRRTARHRLTLERTLLDAIRDRGESPFLFLSAAVALYLARIHQNDDVVLGVPVLNRADRAAKQVVGHFANTLPLRIRTAPEQTVDEFLAQLREATRTLLRHQKMPLGDLLRGASPLFDTTLSYMRWPAAQAIPNASVETVAQTHAHDPDALAIWVSEFDGHSDAQVDFEYACDVFDADFPMDAAARHIETFLRALVEGGERRLGELDPLSAAEREELIHTRNATDQAFPEQATLPTLFAEQVARTPQRTALLEADGGTLSYAELDAKVQAVADALRAAGVRTDERVALLVARGPHLLPAILGVQRAGGAYVPINPDHPLERVRLLLEDCGARVVLVDERAATLGESLGETRVLHLERLPQSTGDLPAANVAPGDLAYVIYTSGSTGMPKGVMVEHRSVVNRLNWMQRRYPIGERDVLLQKTPVTFDVSVWELFWWSFTGARLSLLPPGAEKDPREMLRSIQRDAVTVIHFVPSMLTPFLDLLDGDPTARAAASSLRLVFCSGEALAPLQVARFRRLFGDAVRLVNLYGPTEATVDVSDHECASDNPTRVPIGRPIDNLRLYVLDRALRPQPLGAVGELYIGGVGVARGYLNRPELNAERFLVDPFVAGGRLYRTGDLARWLADGNLEYLGRADDQVKIRGNRVEPDEVRDRLAALPGVRDAAVVARDSAVRGTHLVGYYVAAAELDPGQLRAGLSATLPDFMLPAFFVRIDSLPLSANGKLDRRQLPAPPEQVAAVAPRTATEAELAAVWADVLGVAEVGVHDDFYALGGDSILMLRIRAAAQRRGLGFELADLMRNPTVAGLAERL"

# Number of leading sequences to pair with the fixed sequence (one job per sequence)
n_jobs = 100

# Read the sequences from the input .fasta file
sequences = list(SeqIO.parse(input_file, 'fasta'))

# Ensure the input can supply the requested number of sequences
if len(sequences) < n_jobs:
    raise ValueError(f"The input file contains fewer than {n_jobs} sequences.")

# Take the first n_jobs sequences
sequences = sequences[:n_jobs]

# Prepare the JSON structure: a list holding one job per input sequence.
# Every job is named after the record id and contains two protein chains,
# the fixed sequence followed by the record's own sequence, one copy of each.
jobs = []
for seq_record in sequences:
    job_name = f"{seq_record.id}"
    job_data = {
        "name": job_name,
        "modelSeeds": [],
        "sequences": [
            {
                "proteinChain": {
                    "sequence": fixed_sequence,
                    "count": 1
                }
            },
            {
                "proteinChain": {
                    "sequence": str(seq_record.seq),
                    "count": 1
                }
            }
        ]
    }
    jobs.append(job_data)

# Save the JSON structure to a file
with open(output_file, 'w') as json_file:
    json.dump(jobs, json_file, indent=4)

print(f"JSON file saved to {output_file}")


JSON file saved to 1-100andAzeB_M1.json
