In [2]:
import numpy as np
import pandas as pd
import os

In [12]:
def reference_builder(numb_prot: int, protein_name: list[str], wt_sequence: list[str], DMS_binarization_cutoff: list[float], MSA_name: list[str], MSA_start: list[int], MSA_end: list[int], MSA_num_seqs: list[float], pdb_file: list[str],
                      reference_name: str, custom_identifier: list[str]) -> pd.DataFrame:
    """
    Build the reference file that describes every protein of interest and is used to pass
    information to the other functions in the pipeline.

    All list arguments are parallel: entry ``i`` of each list describes protein ``i``, and only
    the first ``numb_prot`` entries are read.

    Side effects:
        * For every protein whose weight file does not exist yet, a placeholder weight array of
          ones (one entry per residue of the wild-type sequence) is saved to
          ``../data/protein_information/msa/weights/<MSA_name>_weights.npy``. Existing weight
          files are left untouched.
        * The reference table is written as CSV (without the index column) to
          ``../ModellerModule/reference_files/<reference_name>``.
        * Missing output directories are created.
        All paths are relative to the current working directory.

    Args:
        numb_prot (int): The number of proteins to include
        protein_name (list[str]): The names of the proteins of interest
        wt_sequence (list[str]): The wild-type sequences of the proteins of interest
        DMS_binarization_cutoff (list[float]): The cutoffs for binarizing DMS data, often just WT experimental value
        MSA_name (list[str]): The names of the MSA files without the file extension
        MSA_start (list[int]): The starting positions of the proteins of interest in the MSAs
        MSA_end (list[int]): The ending positions of the proteins of interest in the MSAs
        MSA_num_seqs (list[float]): The numbers of sequences in the MSAs
        pdb_file (list[str]): The names of the pdb files
        reference_name (str): The file name (including extension) of the reference file
        custom_identifier (list[str]): Custom identifiers for the proteins of interest; an entry
            may be None, in which case the protein name alone is used as the DMS id

    Returns:
        reference_df (pd.DataFrame): The reference file in a pandas DataFrame format, one row per protein
    """

    # Initialize dictionary to store reference information (row index -> row dict)
    reference = {}

    # Iterate over proteins to build reference dictionary
    for i in range(numb_prot):

        # If a custom identifier is provided, append it to the protein name
        if custom_identifier[i] is not None:
            DMS_id = protein_name[i] + "_" + custom_identifier[i]
        else:
            DMS_id = protein_name[i]

        # Initialize and save placeholder weights for the MSA only if no weight file exists yet.
        # The check is done on the exact path that is written, so existing weights are never
        # overwritten. Will be changed in future pipelines.
        msa_ = MSA_name[i]
        weights_path = f"../data/protein_information/msa/weights/{msa_}_weights.npy"
        if not os.path.exists(weights_path):
            os.makedirs(os.path.dirname(weights_path), exist_ok=True)
            weights = np.ones(len(wt_sequence[i]))
            np.save(weights_path, weights)

        # Build reference dictionary
        reference[i] = {
            'DMS_id': DMS_id,
            'DMS_filename': f'{DMS_id}.csv',
            'target_seq': wt_sequence[i],
            'seq_len': len(wt_sequence[i]),
            'DMS_binarization_cutoff': DMS_binarization_cutoff[i],
            'MSA_filename': MSA_name[i]+'.a2m',
            'MSA_start': MSA_start[i],
            'MSA_end': MSA_end[i],
            # NOTE(review): computed as end - start; if MSA_start/MSA_end are inclusive
            # positions the length would be end - start + 1 -- confirm with downstream users.
            'MSA_len': MSA_end[i] - MSA_start[i],
            'MSA_num_seqs': MSA_num_seqs[i],
            'weight_file_name': f"{msa_}_weights.npy",
            'pdb_file': pdb_file[i]
        }

    reference_df = pd.DataFrame.from_dict(reference, orient='index')

    # Make sure the target directory exists before writing the CSV
    reference_path = "../ModellerModule/reference_files/" + reference_name
    os.makedirs(os.path.dirname(reference_path), exist_ok=True)
    reference_df.to_csv(reference_path, index=False)
    return reference_df

In [15]:
# Change the following variables to fit the proteins of interest.
# The lists are parallel: entry i of every list describes protein i.
prot_name = ['protein1', 'protein2'] # The names/identifiers of the proteins
seq = ['APRTEINSEQEVNCE', 'YPRTEINSEQEVNCE'] # The wild-type sequences of the proteins
bin_cutoff = [0.5, 0.5] # The cutoffs for binarizing DMS data, often just WT experimental value
msa_name = ['msa_file1', 'msa_file2'] # The names of the MSA files without the file extension
msa_start = [1, 1] # The starting positions of the proteins of interest in the MSAs
msa_end = [14, 14] # The ending positions of the proteins of interest in the MSAs
msa_num_seqs = [100, 200] # The numbers of sequences in the MSAs
pdb = ['pdb_file1.pdb', 'pdb_file2.pdb'] # The names of the pdb files
custom_id = ['custom_id1', None] # Custom identifiers for the proteins of interest (None = no identifier)
reference_name = 'custom_reference.csv' # The name of the resulting reference file

# Keyword arguments guard against misordering the many same-typed parameters, and the protein
# count is derived from the list so it cannot drift when proteins are added or removed.
reference = reference_builder(
    numb_prot=len(prot_name),
    protein_name=prot_name,
    wt_sequence=seq,
    DMS_binarization_cutoff=bin_cutoff,
    MSA_name=msa_name,
    MSA_start=msa_start,
    MSA_end=msa_end,
    MSA_num_seqs=msa_num_seqs,
    pdb_file=pdb,
    reference_name=reference_name,
    custom_identifier=custom_id,
)





In [17]:
# Display the reference DataFrame returned by reference_builder to check that it was built correctly
reference

Unnamed: 0,DMS_id,DMS_filename,target_seq,seq_len,DMS_binarization_cutoff,MSA_filename,MSA_start,MSA_end,MSA_len,MSA_num_seqs,weight_file_name,pdb_file
0,protein1_custom_id1,protein1_custom_id1.csv,APRTEINSEQEVNCE,15,0.5,msa_file1.a2m,1,14,13,100,msa_file1_weights.npy,pdb_file1.pdb
1,protein2,protein2.csv,YPRTEINSEQEVNCE,15,0.5,msa_file2.a2m,1,14,13,200,msa_file2_weights.npy,pdb_file2.pdb
