In [1]:
%cd ~/SSMuLA

/disk2/fli/SSMuLA


In [2]:
%load_ext blackcellmagic

In [3]:
# !python sandbox/esmif/score_log_likelihoods.py sandbox/esmif/5YH2.pdb \
#   sandbox/esmif/5YH2_mutated_seqs.fasta --chain C \
#   --outpath sandbox/esmif/5YH2_mutated_seqs_scores.csv

In [4]:
import pandas as pd

In [5]:
import warnings

from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1
from Bio import SeqIO, pairwise2
from Bio.pairwise2 import format_alignment

from SSMuLA.landscape_global import LIB_INFO_DICT

warnings.filterwarnings("ignore")


def pdb2seq(pdb_file_path: str, chain_id: str = "A") -> str:

    """
    A function for extracting chain in string format from pdb

    Args:
    - pdb_file_path: str,
    - chain_id: str = "A"
    """

    chains = {
        record.id: record.seq for record in SeqIO.parse(pdb_file_path, "pdb-atom")
    }

    return str(chains[[chain for chain in chains.keys() if chain_id in chain][0]])


def find_missing_str(longer: str, shorter: str) -> [str, str]:
    """
    A function for finding the missing part of a string

    Args:
    - longer: str, longer string
    - shorter: str, shorter string

    Returns:
    - part_before: str, part of the longer string before the shorter
    - part_after: str, part of the longer string after the shorter
    """
    # Find the start index of the shorter in the longer string
    start_index = longer.find(shorter)

    # If the shorter is not found need to do alingment to figure out
    if start_index == -1:
        
        return "", ""

    # Find the end index of the shorter
    end_index = start_index + len(shorter)

    # Extract parts of the longer string that are not the shorter
    part_before = longer[:start_index]
    part_after = longer[end_index:]

    return part_before, part_after

def mut_csv2fasta(lib: str, ev_esm_dir: str = "ev_esm2") -> None:
    """
    A function for converting mutation csv to fasta

    Args:
    - lib: str, path to mutation csv
    - ev_esm_dir: str = "ev_esm2"
    """

    csv_path = f"{ev_esm_dir}/{lib}/{lib}.csv"

    if "TrpB" in lib:
        protein = "TrpB"
    else:
        protein = lib

    seq = SeqIO.read(f"data/{protein}/{protein}.fasta", "fasta").seq

    if lib == "DHFR":
        seq = str(seq.translate())
    else:
        seq = str(seq)

    pdb_seq = pdb2seq(f"data/{protein}/{protein}.pdb", "A")

    df = pd.read_csv(csv_path)

    for col in ["muts", "seq"]:
        if col not in df.columns:
            raise ValueError(f"{col} column not found")

    fasta = csv_path.replace(".csv", ".fasta")

    print(f"Writing to {fasta}...")

    # pdb has more than fasta should only be for dhfr
    if len(seq) < len(pdb_seq):
        print("PDB seq is longer than fasta")
        part_before, part_after = find_missing_str(longer=pdb_seq, shorter=seq)
        with open(fasta, "w") as f:
            for mut, seq in zip(df["muts"].values, df["seq"].values):
                f.write(f">{mut}\n{part_before+seq+part_after}\n")
    elif len(seq) == len(pdb_seq):
        print("PDB seq length is equal to fasta")
        with open(fasta, "w") as f:
            for mut, seq in zip(df["muts"].values, df["seq"].values):
                f.write(f">{mut}\n{seq}\n")
    else:
        print("Fasta seq is longer than PDB")
        part_before, part_after = find_missing_str(longer=seq, shorter=pdb_seq)
        with open(fasta, "w") as f:
            for mut, seq in zip(df["muts"].values, df["seq"].values):
                f.write(f">{mut}\n{seq[len(part_before):len(seq)-len(part_after)]}\n")




In [6]:
protein = "T7"
lib = "T7"

seq = SeqIO.read(f"data/{protein}/{protein}.fasta", "fasta").seq

seq = str(seq)

pdb_seq = pdb2seq(f"data/{protein}/{protein}.pdb", "A")

global_alignments = pairwise2.align.globalxx(seq, pdb_seq)

print(len(seq), len(pdb_seq))


883 878


In [7]:
df = pd.read_csv(f"ev_esm2/{lib}/{lib}.csv")

In [8]:
df

Unnamed: 0,AAs,AA1,AA2,AA3,fitness,active,muts,n_mut,seq,combo,pos,esm_score
0,HTY,H,T,Y,-1.173150,True,N748H:R756T:Q758Y,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['H', 'T', 'Y']","[748, 756, 758]",-32.022656
1,WNG,W,N,G,-0.367862,True,N748W:R756N:Q758G,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['W', 'N', 'G']","[748, 756, 758]",-35.874612
2,QAM,Q,A,M,-0.181963,True,N748Q:R756A:Q758M,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['Q', 'A', 'M']","[748, 756, 758]",-30.975377
3,WMA,W,M,A,-0.153341,True,N748W:R756M:Q758A,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['W', 'M', 'A']","[748, 756, 758]",-34.443870
4,PPV,P,P,V,-0.766341,True,N748P:R756P:Q758V,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['P', 'P', 'V']","[748, 756, 758]",-38.719343
...,...,...,...,...,...,...,...,...,...,...,...,...
6720,SVR,S,V,R,-0.699961,True,N748S:R756V:Q758R,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['S', 'V', 'R']","[748, 756, 758]",-26.470713
6721,QMI,Q,M,I,-0.819314,True,N748Q:R756M:Q758I,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['Q', 'M', 'I']","[748, 756, 758]",-30.680741
6722,LFS,L,F,S,-0.209776,True,N748L:R756F:Q758S,3,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['L', 'F', 'S']","[748, 756, 758]",-30.605366
6723,NSM,N,S,M,-1.381359,False,R756S:Q758M,2,MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEM...,"['S', 'M']","[756, 758]",-15.644491


In [9]:
seq

'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'

In [10]:
pdb_seq

'IAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFXXXXXXXXXXXXXXXXPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'

In [11]:
print(format_alignment(*pairwise2.align.globalms(seq, pdb_seq, 2, -1, -0.5, -0.1)[0]))

MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAK----------------PLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA
     ||||||||||||||||||||||||||||||||||||||||||||||||||                                |||||||||||||

In [12]:
def alignmutseq2pdbseq(mut_seq: str, pdb_seq: str) -> list[int]:
    """
    A function for aligning mutation sequence to pdb sequence and
    return the indices of the aligned sequence so that the mutation
    sequence can be trimmed to the lenght of the pdb sequence

    Args:
    - mut_seq: str, mutation sequence
    - pdb_seq: str, pdb sequence
    """

    # Define a custom scoring function so that X is aligned with anything
    def custom_match_function(x, y):
        if x == "X" or y == "X":
            return 2  # High score for aligning X with anything
        elif x == y:
            return 2  # Match score
        else:
            return -1  # Mismatch score

    _, aligned_pdb_seq, _, _, _ = pairwise2.align.globalcs(
        mut_seq, pdb_seq, custom_match_function, -0.5, -0.1
    )[0]

    return [
        aligned_pdb_seq.find(aligned_pdb_seq.replace("-", "")[:1]),
        aligned_pdb_seq.rfind(aligned_pdb_seq.replace("-", "")[-1]),
    ]

In [40]:

# Define a custom scoring function
def custom_match_function(x, y):
    if x == 'X' or y == 'X':
        return 2  # High score for aligning X with anything
    elif x == y:
        return 2  # Match score
    else:
        return -1  # Mismatch score


In [24]:
print(format_alignment(*pairwise2.align.globalcs(seq, pdb_seq, custom_match_function, -0.5, -0.1)[0]))

MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA
     ||||||||||||||||||||||||||||||||||||||||||||||||||................|||||||||||||||||||||||||||||||||||||||||||||

In [49]:
# Extract the aligned sequences
aligned_seq1, aligned_seq2, score, begin, end = pairwise2.align.globalcs(seq, pdb_seq, custom_match_function, -0.5, -0.1)[0]

# Determine the start and end positions of the aligned regions (ignoring gaps)
start_index_seq1 = aligned_seq1.find(aligned_seq1.replace('-', '')[:1])
end_index_seq1 = aligned_seq1.rfind(aligned_seq1.replace('-', '')[-1])

start_index_seq2 = aligned_seq2.find(aligned_seq2.replace('-', '')[:1])
end_index_seq2 = aligned_seq2.rfind(aligned_seq2.replace('-', '')[-1])

# Print the start and end indices
print(f"Aligned region in seq1: Start = {start_index_seq1}, End = {end_index_seq1}")
print(f"Aligned region in seq2: Start = {start_index_seq2}, End = {end_index_seq2}")


Aligned region in seq1: Start = 0, End = 882
Aligned region in seq2: Start = 5, End = 882


In [55]:
# Step 1: Find all indices of 'X' in pdb_seq
x_indices = [i for i, letter in enumerate(aligned_seq2) if letter == 'X']
x_indices

[55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]

In [56]:
# Step 2: Modify the original seq by replacing characters at the found indices with 'X'
seq_list = list(seq)  # Convert the sequence to a list to allow mutation
for idx in x_indices:
    seq_list[idx] = 'X'

# Convert the modified list back to a string
modified_seq = ''.join(seq_list)
modified_seq

'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFXXXXXXXXXXXXXXXXPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'

In [20]:
from Bio import PDB

from Bio import PDB

def chop_pdb(input_pdb, output_pdb, start_resid, chain_id):
    # Initialize the parser and structure
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure("structure", input_pdb)
    
    # Initialize the writer
    io = PDB.PDBIO()

    # Define a select class to filter the residues in the specific chain
    class ResidueSelect(PDB.Select):
        def accept_residue(self, residue):
            # Only accept residues in the specified chain with a residue ID greater than or equal to start_resid
            if residue.parent.id == chain_id and residue.id[1] >= start_resid:
                return True
            return False

    # Save the chopped structure to the output file
    io.set_structure(structure)
    io.save(output_pdb, ResidueSelect())

    print(f"Saved chopped structure starting from residue {start_resid} in chain {chain_id} to {output_pdb}")

# Example usage
input_pdb = "/disk2/fli/SSMuLA/data/T7/T7.pdb"  # Replace with your input PDB file
output_pdb = "/disk2/fli/SSMuLA/data/T7/T7_processed.pdb"  # Replace with your desired output PDB file
start_resid = 71  # Replace with the starting residue ID you want

chop_pdb(input_pdb, output_pdb, start_resid, "A")



Saved chopped structure starting from residue 71 in chain A to /disk2/fli/SSMuLA/data/T7/T7_processed.pdb


In [45]:
begin, end

(0, 883)

In [50]:
aligned_seq1[start_index_seq2:end_index_seq2+1]

'IAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'

In [44]:
aligned_seq2

'-----IAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFXXXXXXXXXXXXXXXXPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'

In [16]:
len("PLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLHLMFLGQFTLYPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAF")

811

In [26]:
from Bio.Align import substitution_matrices, PairwiseAligner

aligner = PairwiseAligner()

# Use the default substitution matrix and modify it for the wildcard 'X'
matrix = substitution_matrices.load("BLOSUM62")

# Allow 'X' to align with anything with no penalty
for residue in matrix.alphabet:
    matrix['X', residue] = 2  # High score for aligning 'X' with anything
    matrix[residue, 'X'] = 2

aligner.substitution_matrix = matrix
aligner.mode = 'global'

# Perform the alignment

# Print the best alignment
print(aligner.align(seq, pdb_seq)[0])


target            0 MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLK
                  0 ---|--|||||||||||||||||||||||||||||||||||||||||||||||||.....
query             0 ---I--AKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFXXXXX

target           60 AGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIK
                 60 ...........|||||||||||||||||||||||||||||||||||||||||||||||||
query            55 XXXXXXXXXXXPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIK

target          120 TTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKK
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           115 TTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKK

target          180 AFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQD
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           175 AFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQD

target          240 SETI

In [35]:
part_before, part_after = find_missing_str(longer=seq, shorter=pdb_seq)

In [36]:
part_before

''

In [37]:
part_after

''

In [48]:

    if "TrpB" in lib:
        protein = "TrpB"
    else:
        protein = lib

    print(f"data/{protein}/{protein}.pdb")

    seq = SeqIO.read(f"data/{protein}/{protein}.fasta", "fasta").seq
    if lib == "DHFR":
        seq = str(seq.translate())
    else:
        seq = str(seq)

    pdb_seq = pdb2seq(f"data/{protein}/{protein}.pdb", "A")

    global_alignments = pairwise2.align.globalxx(seq, pdb_seq)

    print(len(seq), len(pdb_seq))
    
    # pdb has more than fasta should only be for dhfr
    if len(seq) < len(pdb_seq):
        
        
        
        
        
        

DHFR
data/DHFR/DHFR.pdb
73 159
MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVT MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR
Global Alignment:
MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVT--------------------------------------------------------------------------------------
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||                                                                                      
MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR
  Score=73

ParD2
data/ParD2/ParD2.pdb
91 87
MANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRELREAEAERLRKAWIEGLESGPFAPFDIEDIKQKARSRLVDAIKK NVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRELREAEAERLRKAWIEGLESGPFAPFDIEDIKQKARSRLVDAI
Global Align

In [55]:
global_alignments[0].seqA

'MKGYFGPYGGQYVPEILMGALEELEAAYEGIMKDESFWKEFNDLLRDYAGRPTPLYFARRLSEKYGARVYLKREDLLHTGAHKINNAIGQVLLAKLMGKTRIIAETGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKLLGAKVVPVKSGSRTLKDAIDEALRDWITNLQTTYYVFGSVVGPHPYPIIVRNFQKVIGEETKKQIPEKEGRLPDYIVACVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKIGYLHGSKTFVLQDDWGQVQVSHSVSAGLDYSGVGPEHAYWRETGKVLYDAVTDEEALDAFIELSRLEGIIPALESSHALAYLKKINIKGKVVVVNLSGRGDKDLESVLNHPYVRERIRL'

In [56]:
global_alignments[0].seqB

'--GYFGPYGGQYVPEILMGALEELEAAYEGIMKDESFWKEFNDLLRDYAGRPTPLYFARRLSEKYGARVYLKREDLLHTGAHKINNAIGQVLLAKLMGKTRIIAETGAGQHGVATATAAALFGMECVIYMGEEDTIRQKLNVERMKLLGAKVVPVKSGSRTLKDAIDEALRDWITNLQTTYYVFGSVVGPHPYPIIVRNFQKVIGEETKKQIPEKEGRLPDYIVACVSGGSNAAGIFYPFIDSGVKLIGVEAGGEGLETGKHAASLLKGKIGYLHGSKTFVLQDDWGQVQVSHSVSAGLDYSGVGPEHAYWRETGKVLYDAVTDEEALDAFIELSRLEGIIPALESSHALAYLKKINIKGKVVVVNLSGRGDKDLESVLNHPYVRERIRL'

In [1]:
def find_missing_str(longer: str, shorter: str) -> [str, str]:
    """
    A function for finding the missing part of a string

    Args:
    - longer: str, longer string
    - shorter: str, shorter string

    Returns:
    - part_before: str, part of the longer string before the shorter
    - part_after: str, part of the longer string after the shorter
    """
    # Find the start index of the shorter in the longer string
    start_index = longer.find(shorter)

    # If the shorter is not found, return the longer string as the "missing" part
    if start_index == -1:
        return longer, ""

    # Find the end index of the shorter
    end_index = start_index + len(shorter)

    # Extract parts of the longer string that are not the shorter
    part_before = longer[:start_index]
    part_after = longer[end_index:]

    return part_before, part_after

In [16]:
ev_esm_dir = "ev_esm2"
csv_path = f"{ev_esm_dir}/{lib}/{lib}.csv"

if "TrpB" in lib:
    protein = "TrpB"
else:
    protein = lib
    
seq = SeqIO.read(f"data/{protein}/{protein}.fasta", "fasta").seq
if lib == "DHFR":
    seq = str(seq.translate())
else:
    seq = str(seq)

pdb_seq = pdb2seq(f"data/{protein}/{protein}.pdb", "A")

global_alignments = pairwise2.align.globalxx(seq, pdb_seq)

df = pd.read_csv(df_path)

for col in ["muts", "seq"]:
    if col not in df.columns:
        raise ValueError(f"{col} column not found")

fasta = csv_path.replace(".csv", ".fasta")

# pdb has more than fasta should only be for dhfr
if len(seq) < len(pdb_seq):
    part_before, part_after = find_missing_str(longer=pdb_seq, shorter=seq)
    with open(fasta, "w") as f:
        for mut, seq in zip(df["muts"].values, df["seq"].values):
            f.write(f">{mut}\n{part_before+seq+part_after}\n")
elif len(seq) == len(pdb_seq):
    with open(fasta, "w") as f:
        for mut, seq in zip(df["muts"].values, df["seq"].values):
            f.write(f">{mut}\n{seq}\n")
else:
    part_before, part_after = find_missing_str(longer=seq, shorter=pdb_seq)
    with open(fasta, "w") as f:
        for mut, seq in zip(df["muts"].values, df["seq"].values):
            f.write(f">{mut}\n{seq[len(part_before):len(seq)-len(part_after)]}\n")



NameError: name 'df_path' is not defined

In [14]:
mut, seq = df["muts"].values[0], df["seq"].values[0]

In [38]:
seq[len(part_before):len(seq)-len(part_after)]

'MNTINIAKNDFSDIELAAIPFNTLADHYGERLAREQLALEHESYEMGEARFRKMFERQLKAGEVADNAAAKPLITTLLPKMIARINDWFEEVKAKRGKRPTAFQFLQEIKPEAVAYITIKTTLACLTSADNTTVQAVASAIGRAIEDEARFGRIRDLEAKHFKKNVEEQLNKRVGHVYKKAFMQVVEADMLSKGLLGGEAWSSWHKEDSIHVGVRCIEMLIESTGMVSLHRQNAGVVGQDSETIELAPEYAEAIATRAGALAGISPMFQPCVVPPKPWTGITGGGYWANGRRPLALVRTHSKKALMRYEDVYMPEVYKAINIAQNTAWKINKKVLAVANVITKWKHCPVEDIPAIEREELPMKPEDIDMNPEALTAWKRAAAAVYRKDKARKSRRISLEFMLEQANKFANHKAIWFPYNMDWRGRVYAVSMFNPQGNDMTKGLLTLAKGKPIGKEGYYWLKIHGANCAGVDKVPFPERIKFIEENHENIMACAKSPLENTWWAEQDSPFCFLAFCFEYAGVQHHGLSYNCSLPLAFDGSCSGIQHFSAMLRDEVGGRAVNLLPSETVQDIYGIVAKKVNEILQADAINGTDNEVVTVTDENTGEISEKVKLGTKALAGQWLAYGVTRSVTKRSVMTLAYGSKEFGFRQQVLEDTIQPAIDSGKGLMFTQPNQAAGYMAKLIWESVSVTVVAAVEAMNWLKSAAKLLAAEVKDKKTGEILRKRCAVHWVTPDGFPVWQEYKKPIQTRLNLMFLGQFRLQPTINTNKDSEIDAHKQESGIAPNFVHSQDGSHLRKTVVWAHEKYGIESFALIHDSFGTIPADAANLFKAVRETMVDTYESCDVLADFYDQFADQLHESQLDKMPALPAKGNLNLRDILESDFAFA'

In [31]:
len(part_before)

0

In [33]:
len(seq), len(part_after), len(seq)-len(part_after)

(883, 883, 0)

In [None]:

def csv2fasta(csv: str) -> None:
    """
    A function for converting a csv file to a fasta file
    ie /disk2/fli/SSMuLA/ev_esm2/DHFR/DHFR.csv

    """
    df = pd.read_csv(csv)

    for col in ["muts", "seq"]:
        if col not in df.columns:
            raise ValueError(f"{col} column not found")

    fasta = csv.replace(".csv", ".fasta")
    with open(fasta, "w") as f:
        for mut, seq in zip(df["muts"].values, df["seq"].values):
            f.write(f">{mut}\n{seq}\n")
