# henipavirus_RBP_alignment.ipynb
This notebook imports canonical henipavirus RBP amino acid sequences, renames them with short labels, aligns them, and then draws a similarity plot of the alignment using the biotite package.

In [None]:
# This cell is tagged as `parameters` for `papermill` parameterization.
# Both values are placeholders here and must be set before the later cells run.
# fasta_input_file: passed to modify_sequence_names() below as the input FASTA path.
fasta_input_file = None
# henipavirus_alignment: passed to plt.savefig() in the last cell as the output image path.
henipavirus_alignment = None

In [None]:
import matplotlib.pyplot as plt
from Bio import Entrez
from Bio import SeqIO
from Bio import AlignIO
from io import StringIO
import os
import re

import biotite.sequence as seq
import biotite.sequence.align as align
import biotite.sequence.io.fasta as fasta
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
import biotite.application.mafft as mafft

In [None]:
# Run from the project root so the relative "results/..." paths used in the
# later cells resolve.
project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"

# os.getcwd() does not report a trailing separator, so comparing it directly
# against a literal ending in "/" can never succeed. Resolve both sides first
# (this also strips the trailing slash and follows symlinks).
if os.path.realpath(os.getcwd()) == os.path.realpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

In [None]:
#fasta_input_file = "results/alignments/temp_sequences.fasta"
#henipavirus_alignment = "results/images/henipavirus_RBP_alignment.png"

In [None]:
def modify_sequence_names(input_file, output_file, name_mapping):
    """Rename FASTA records whose id matches a key of ``name_mapping``.

    Each key of ``name_mapping`` is handed to ``re.search`` as a pattern and
    tested against the record id, so keys follow regular-expression rules
    (for example, ``.`` matches any character). Keys are tried in the
    mapping's iteration order and only the first matching key is applied.
    A matching record gets both its ``id`` and ``description`` replaced by
    the mapped name; records without a match are written out unchanged.

    Args:
        input_file: FASTA source passed to ``SeqIO.parse``.
        output_file: FASTA destination passed to ``SeqIO.write``.
        name_mapping: Mapping of id pattern -> replacement name.
    """
    no_match = object()  # sentinel, so that any mapped value is honoured
    records = list(SeqIO.parse(input_file, "fasta"))

    for record in records:
        replacement = next(
            (
                label
                for pattern, label in name_mapping.items()
                if re.search(pattern, record.id)
            ),
            no_match,
        )
        if replacement is no_match:
            continue
        # Keep id and description in sync with the new label.
        record.id = replacement
        record.description = replacement

    SeqIO.write(records, output_file, "fasta")

# Accession pattern (searched for in each FASTA record id) -> short label
# that replaces the record's id and description.
name_mapping = {
    "NP_112027.1": "Nipah-M",
    "QDJ04463.1": "Nipah_Cambodia",
    "QKV44014.1": "Nipah_India",
    "NP_047112.2": "Hendra",
    "UCY33670.1": "Hendra_G2",
    "YP_009094086.1": "Cedar",
    "AJP33320.1": "Cedar2",
    "AFH96011.1": "Ghana",
}

# Rename the records of the parameterised input FASTA and write the result
# to an intermediate file.
input_fasta, output_fasta = fasta_input_file, "results/alignments/output.fasta"
modify_sequence_names(input_fasta, output_fasta, name_mapping)

def reorder_sequences(input_file, output_file, order_list):
    """Write the FASTA records of ``input_file`` in the order of ``order_list``.

    Records are indexed by id; if an id occurs more than once, the last
    record with that id wins. Ids in ``order_list`` that are absent from the
    file are skipped, and records whose id is not in ``order_list`` are not
    written.

    Args:
        input_file: FASTA source passed to ``SeqIO.parse``.
        output_file: FASTA destination passed to ``SeqIO.write``.
        order_list: Iterable of record ids in the desired output order.
    """
    records_by_id = {}
    for record in SeqIO.parse(input_file, "fasta"):
        records_by_id[record.id] = record

    # dict.get yields None for ids missing from the file; drop those.
    ordered_records = [
        record
        for record in map(records_by_id.get, order_list)
        if record is not None
    ]

    SeqIO.write(ordered_records, output_file, "fasta")

# Order in which the renamed records are written out (and therefore the
# order in which they are read back for the alignment).
order_list = [
    "Nipah-M",
    "Nipah_Cambodia",
    "Nipah_India",
    "Hendra",
    "Hendra_G2",
    "Cedar",
    "Cedar2",
    "Ghana",
]

# Read the renamed intermediate file and write the reordered records.
input_fasta, output_fasta = (
    "results/alignments/output.fasta",
    "results/alignments/reordered.fasta",
)
reorder_sequences(input_fasta, output_fasta, order_list)

In [None]:
# Load the renamed, reordered sequences with biotite.
fasta_file = fasta.FastaFile.read("results/alignments/reordered.fasta")

# Split the (header, sequence string) entries into labels and protein sequences,
# keeping the file order.
entries = list(fasta_file.items())
ids = [header for header, _ in entries]
sequences = [seq.ProteinSequence(residues) for _, residues in entries]

# Multiple sequence alignment scored with biotite's standard protein matrix.
# NOTE(review): the two-element gap_penalty is presumably
# (gap opening, gap extension) -- confirm against the biotite documentation.
matrix = align.SubstitutionMatrix.std_protein_matrix()
alignment, order, tree, distances = align.align_multiple(
    sequences,
    matrix,
    gap_penalty=(-10, -1),
)

In [None]:
# Single-axes figure that holds the whole alignment plot.
fig, ax = plt.subplots(figsize=(15, 10))

# Display options for the similarity-based alignment plot; rows are labelled
# with the record names read from the FASTA file.
plot_options = dict(
    matrix=matrix,
    labels=ids,
    show_numbers=True,
    show_line_position=False,
    symbols_per_line=100,
    color="blue",
    spacing=3,
)
graphics.plot_alignment_similarity_based(ax, alignment, **plot_options)

fig.tight_layout()
# Write the image to the parameterised output path, then display it.
plt.savefig(henipavirus_alignment, dpi=300)
plt.show()