In [5]:
%reload_ext autoreload
%autoreload 2

import os
import subprocess
import tempfile

import h5py
from tqdm import tqdm

import src.rust_modules as rust_modules

In [None]:
# Peek into the tokenized HDF5 file: list the entries under 'foldseek', show the
# 'sequence' attribute stored on entry '12asA00', then decode and print three of
# its datasets. `_sequence` is left holding the decoded rows of the last dataset
# ('413_1'), which later cells index as `_sequence[0..2]`.
with h5py.File('../tmp/data/tokenized/mdcath/tokenized_data.h5', 'r') as file:
    _tokenized_keys = list(file['foldseek'].keys())
    _entry = file['foldseek']['12asA00']
    _sequence = _entry.attrs['sequence']
    print(_tokenized_keys[:10])
    print(len(_tokenized_keys))
    print(_sequence)

    # NOTE(review): dataset names look like "<temperature>_<replica>" - confirm.
    for _dataset_name in ('320_0', '320_1', '413_1'):
        # Rows are stored as bytes; decode each one to str before printing.
        _sequence = [_row.decode('utf-8') for _row in _entry[_dataset_name]]
        print(_sequence)

In [None]:
# Scratch cell: placeholder paths plus the flag string that is later handed to
# `mmseqs result2profile`. The temporary directory is created and removed again
# without being used for anything else.
with tempfile.TemporaryDirectory() as tmp_dir:
    input_path = ""
    sub_mat_path = ""
    foldseek_path = "foldseek"
    mmseqs_path = "mmseqs"

    # One flag/value pair per entry, joined by single spaces.
    parameters = " ".join([
        "-pca 1.4",
        "--pcb 1.5",
        f"--sub-mat {sub_mat_path}",
        "--mask-profile 0",
        "--comp-bias-corr 0",
        "--e-profile 0.1",
        "-e 0.1",
        "--profile-output-mode 1",
        "--gap-open 11",
        "--gap-extend 1",
    ])

In [27]:
def generate_pssm(input_path, foldseek_path, mmseqs_path, sub_mat_path):
    """
    Generate PSSM profiles from input structures using Foldseek and MMseqs2.

    A Foldseek database is created from ``input_path``, one synthetic
    alignment row is written per entry of the database index, the rows are
    converted to an MMseqs2 database with ``tsv2db`` and finally
    ``mmseqs result2profile`` is run on the ``_ss`` database.

    Args:
        input_path (str): Path to input structure files
        foldseek_path (str): Path to foldseek executable
        mmseqs_path (str): Path to mmseqs executable
        sub_mat_path (str): Path to foldseek mat3di.out substitution matrix

    Returns:
        str: Contents of the generated profile.tsv file. All intermediate
            files live in a temporary directory that is deleted before the
            function returns, so the text is returned rather than a path.

    Raises:
        subprocess.CalledProcessError: If a foldseek/mmseqs call exits with a
            non-zero status.
        FileNotFoundError: If an executable or the database index is missing.
    """
    # Built as an argv list instead of a whitespace-split string so that a
    # substitution-matrix path containing spaces stays a single argument.
    profile_args = [
        "-pca", "1.4",
        "--pcb", "1.5",
        "--sub-mat", str(sub_mat_path),
        "--mask-profile", "0",
        "--comp-bias-corr", "0",
        "--e-profile", "0.1",
        "-e", "0.1",
        "--profile-output-mode", "1",
        "--gap-open", "11",
        "--gap-extend", "1",
    ]

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Define paths for temporary files
        inputdb = os.path.join(tmp_dir, "inputdb")
        fake_aln = os.path.join(tmp_dir, "fake_aln.tsv")
        fake_aln_db = os.path.join(tmp_dir, "fake_aln_db")
        profile_out = os.path.join(tmp_dir, "profile.tsv")

        # Create database from the caller's input. The previous version
        # ignored `input_path` and ran createdb on a hard-coded "input/"
        # directory that it created in the current working directory.
        print(f"Creating database from {input_path} ----------")
        subprocess.run([foldseek_path, "createdb", input_path, inputdb], check=True)
        print(f"Database created at {inputdb} ----------")

        # Generate fake alignment file: one row per index entry.
        index_file = f"{inputdb}.index"
        with open(index_file, "r") as f, open(fake_aln, "w") as out:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines instead of raising IndexError.
                    continue
                # Third index column minus 2.
                # NOTE(review): presumably strips the trailing newline and
                # NUL terminator from the stored entry length - confirm.
                length = int(fields[2]) - 2
                out.write(
                    f"0\t{fields[0]}\t0\t1.00\t0\t0\t{length-1}\t{length}"
                    f"\t0\t{length-1}\t{length}\t{length}M\n"
                )

        # Convert alignment to database
        subprocess.run(
            [mmseqs_path, "tsv2db", fake_aln, fake_aln_db, "--output-dbtype", "5"],
            check=True,
        )

        # Generate PSSM profile
        subprocess.run(
            [mmseqs_path, "result2profile", f"{inputdb}_ss", f"{inputdb}_ss",
             fake_aln_db, profile_out] + profile_args,
            check=True,
        )

        # Read the profile while the temporary directory still exists
        with open(profile_out, "r") as f:
            profile_content = f.read()

    return profile_content

In [None]:
import os
import tempfile

# Tool locations used for the demo run below.
foldseek_path = "foldseek"
mmseqs_path = "mmseqs"
sub_mat_path = "/home/finnlueth/repos/bachelor-thesis/mdcath-to-3di/submodules/foldseek/data/mat3di.out"

# Write the first three entries of `_sequence` (set by the HDF5 inspection
# cell) into a FASTA file that is kept on disk (delete=False).
# NOTE(review): all three lines sit under a single header, so FASTA readers
# will see one record - confirm that is intended.
with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as tmp_file:
    dummy_input = tmp_file.name
    tmp_file.write(">dummy_structure\n")
    for _row in (_sequence[0], _sequence[1], _sequence[2]):
        tmp_file.write(f"{_row}\n")
    print(dummy_input)

# Echo the file back to check what was written.
with open(dummy_input, 'r') as f:
    print(f.read())

profile = generate_pssm(dummy_input, foldseek_path, mmseqs_path, sub_mat_path)

print("Generated PSSM profile:")
print(f"{profile[:200]}...")  # Print first 200 characters of the profile

# # Clean up
# os.unlink(dummy_input)

In [None]:
# def generate_pssm_from_3di(sequences, foldseek_path, mmseqs_path, sub_mat_path):
#     """Generate PSSM profile from a list of 3di sequences.
    
#     Args:
#         sequences (list): List of 3di sequences
#         foldseek_path (str): Path to foldseek executable
#         mmseqs_path (str): Path to mmseqs executable
#         sub_mat_path (str): Path to foldseek mat3di.out substitution matrix
        
#     Returns:
#         str: Generated PSSM profile content
#     """
#     import tempfile
#     import os
    
#     # Create temporary directory for input sequences
#     with tempfile.TemporaryDirectory() as seq_dir:
#         # Write sequences to temporary file
#         input_file = os.path.join(seq_dir, "input.fas")
#         with open(input_file, "w") as f:
#             for i, seq in enumerate(sequences):
#                 f.write(f">sequence_{i}\n{seq}\n")
        
#         # Generate PSSM using the input file
#         return generate_pssm(input_file, foldseek_path, mmseqs_path, sub_mat_path)

In [None]:
import os
import subprocess
import tempfile


def generate_pssms_with_mmseqs(input_fasta, output_profile_tsv, victors_parameters,
                               mmseqs_path="mmseqs"):
    """
    Generate PSSMs with MMseqs2, emulating the structure-based flow but purely on sequences.

    Steps: ``createdb`` on the FASTA, write one synthetic alignment row per
    entry of the database index, convert the rows with ``tsv2db`` and run
    ``result2profile`` with the database as both query and target.

    :param input_fasta: Path to a FASTA file containing your input sequences.
    :param output_profile_tsv: Path to the final PSSM/profile file (TSV).
    :param victors_parameters: Extra parameters for MMseqs2 result2profile,
        either as one whitespace-separated string or as an already split
        sequence of arguments (use the latter for paths containing spaces).
    :param mmseqs_path: Name or path of the mmseqs executable
        (defaults to ``"mmseqs"`` resolved via PATH).
    :raises subprocess.CalledProcessError: If an mmseqs call exits non-zero.
    :return: None; the profile is written to ``output_profile_tsv``.
    """
    # A plain string keeps the historical whitespace-split behaviour; any other
    # iterable is taken as a ready-made argv tail.
    if isinstance(victors_parameters, str):
        extra_args = victors_parameters.split()
    else:
        extra_args = [str(arg) for arg in victors_parameters]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Paths within the temp directory
        inputdb = os.path.join(tmpdir, "inputdb")
        fake_aln_tsv = os.path.join(tmpdir, "fake_aln.tsv")
        fake_aln_db = os.path.join(tmpdir, "fake_aln_db")

        # 1) Create an MMseqs database from the input fasta
        subprocess.run([mmseqs_path, "createdb", input_fasta, inputdb], check=True)

        # 2) Build a "fake alignment" so result2profile has something to turn
        #    into a profile, one row per line of the database index.
        with open(f"{inputdb}.index", "r") as idx, open(fake_aln_tsv, "w") as out_tsv:
            for line in idx:
                cols = line.split()
                if not cols:
                    # Tolerate blank lines instead of raising IndexError.
                    continue
                seq_id = cols[0]
                # Mirrors `len = $3 - 2` from the awk one-liner this was ported from.
                fake_len = int(cols[2]) - 2
                # NOTE(review): the first column is the constant "0" for every
                # row; verify the column layout against the MMseqs2 alignment
                # format before relying on multi-sequence input.
                out_tsv.write(
                    f"0\t{seq_id}\t0\t1.00\t0\t0\t{fake_len - 1}\t{fake_len}"
                    f"\t0\t{fake_len - 1}\t{fake_len}\t{fake_len}M\n"
                )

        # 3) Convert that TSV to a DB (--output-dbtype 5)
        subprocess.run(
            [mmseqs_path, "tsv2db", fake_aln_tsv, fake_aln_db, "--output-dbtype", "5"],
            check=True,
        )

        # 4) Run result2profile on the "fake" alignment, appending the extra flags.
        cmd = [
            mmseqs_path,
            "result2profile",
            inputdb,       # query DB
            inputdb,       # target DB
            fake_aln_db,   # alignment DB
            output_profile_tsv,
        ] + extra_args

        subprocess.run(cmd, check=True)

# Example usage:
# Suppose you have an input FASTA: "my_sequences.fasta"
# and you want the final profile in "my_profile.tsv".
# Provide additional parameters if needed.

# Variant without a custom substitution matrix:
# VICTORS_PARAMETERS = (
#     "-pca 1.4 "
#     "--pcb 1.5 "
#     "--mask-profile 0 "
#     "--comp-bias-corr 0 "
#     "--e-profile 0.1 "
#     "-e 0.1 "
#     "--profile-output-mode 1 "
#     "--gap-open 11 "
#     "--gap-extend 1"
# )

# Extra flags forwarded to `mmseqs result2profile` (split on whitespace by
# generate_pssms_with_mmseqs). The pieces are joined with explicit spaces:
# adjacent string literals concatenate with no separator, which previously
# produced "-pca 1.4--pcb 1.5--sub-mat ..." and therefore broken flags.
# Defined unconditionally (no `__main__` guard) because the statements that
# follow in this cell use it regardless; in a notebook kernel the guard was
# always true anyway.
VICTORS_PARAMETERS = " ".join([
    "-pca 1.4",
    "--pcb 1.5",
    "--sub-mat /home/finnlueth/repos/bachelor-thesis/mdcath-to-3di/submodules/foldseek/data/mat3di.out",
    "--mask-profile 0",
    "--comp-bias-corr 0",
    "--e-profile 0.1",
    "-e 0.1",
    "--profile-output-mode 1",
    "--gap-open 11",
    "--gap-extend 1",
])
    
# Write the first three entries of `_sequence` (set by the HDF5 inspection
# cell) into a FASTA file that is kept on disk (delete=False).
# NOTE(review): all three lines sit under a single header, so FASTA readers
# will see one record - confirm that is intended.
with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as tmp_file:
    dummy_input = tmp_file.name
    tmp_file.write(">dummy_structure\n")
    for _row in (_sequence[0], _sequence[1], _sequence[2]):
        tmp_file.write(f"{_row}\n")
    print(dummy_input)

# Echo the file back to check what was written.
with open(dummy_input, 'r') as f:
    print(f.read())

# Profile is written to "my_profile.tsv" in the current working directory.
generate_pssms_with_mmseqs(dummy_input, "my_profile.tsv", VICTORS_PARAMETERS)

/tmp/tmprb_nbh52.fasta
DLVLQVVVLVLLVVQLVVLCCVLQVADEDEDDFKAFDQFLQADCFLVDDDFAWDCDPVDHDTTITHHDACQLVQQFCCLVVPDDASYWHKYWDWDADDPDPDDDQQDGRTFTKIWIKGFHDDVSQAQVVVQVVVLSVVVSLVVSLVVCCVPPVFDDQADNGAAEDEQLCLCVVPVPDDRQVSVFVVQVVRQWYFYDQADDQHDVGHGNHHDDLQKAFNACQHPVRHGGQWTFRWGAQVLLSGIDGFKTWGWIDALVSSQVSCVVVVVNVSCPRNSNVCRVVVSTTTMIIIMGTSLVSVCSRVVDRGPLQSDPDDDDPCVVCVDPPSD
DLVLLVVLLVLLLVQLVVLVCVQQVADEDEDDFKDWPFQLQADPDDPDDDFDWDQDPVCRPTTIGGHQARQLVQLLVCLVVDDDARHWHWYFAFAAAPPDPDDDFADDRRFGWTWIKGFHDPVCLALVVVVVVVLSVVVSLVVSLVVVCPVSVDDDDADPDADEAEPQCLCVVPVVDDLPVSLLVVCVVRFKYFYPQFDDQPPVRHTSDHDDLQWFFRADAHPVGHGGQWTWIWGQLPQQSGTDGFWTWHWIDQLVCSVVRCVVVVVPVSCPRNSNVCRNVPSGGGMTIIIGTSLVSSCRSVSNDGSVSNDDPPDDDVPDDDDPPPD
DVVLVVCLLVVVLVLVVVLCCVLQNADEDEDDFKDFPDQLLADCPDPVDPAAWDQPVVVHPTTIGGHPACVLVQQLVCLVDDAEASYWYKYFDWDAQPPDPDDDQQDDRIFTWTWMKTFHDDPCPAQVVVVVVVLSVVVSLVVVLVVVCVVSVDDDDADPDAAEDEVQCLCVVPVVDDPVVRVLVVLVPRFKYKYFQQCEAYPVGDHPHHDDQQWWWNADQHPVRHGTFKIWIWGQQVLQRTTQRFWTWGWTDAQVSVVVRCVVVVVCVCCPHNSNVCRNVVSTGGMTIIMGTSQSSVCRSVVPNGSCVNDPPDDDPVPDD