In [1]:
import numpy as np
import pandas as pd
from src.PRIZM_helper_functions import reference_builder

# Example Reference Building

In [2]:
# Change the following variables to fit the proteins of interest.
# Every list below holds one entry per row of the reference, in the same order.
prot_name = ['protein1', 'protein2']  # The names/identifiers of the proteins
seq = ['APRTEINSEQEVNCE', 'YPRTEINSEQEVNCE']  # The wild-type sequences of the proteins
bin_cutoff = [0.5, 0.5]  # The cutoffs for binarizing DMS data, often just WT experimental value
msa_name = ['msa_file1', 'msa_file2']  # The names of the MSA files without the file extension
msa_start = [1, 1]  # The starting positions of the proteins of interest in the MSAs
# NOTE(review): both example sequences are 15 residues long while msa_end is 14;
# confirm the MSAs are meant to cover only positions 1-14.
msa_end = [14, 14]  # The ending positions of the proteins of interest in the MSAs
msa_num_seqs = [100, 200]  # The numbers of sequences in the MSAs
pdb = ['pdb_file1.pdb', 'pdb_file2.pdb']  # The names of the pdb files
custom_id = ['custom_id1', None]  # Custom identifiers for the proteins of interest (None = no custom identifier)
reference_name = 'custom_reference.csv'  # The name of the resulting reference file

# Derive the entry count from the inputs instead of hard-coding it, so that adding or
# removing a protein above cannot leave the count out of sync with the lists.
# Presumably the first argument of reference_builder is the number of entries -- it
# equals the list length in every call in this notebook; verify against the helper.
n_entries = len(prot_name)
per_entry_inputs = {
    'prot_name': prot_name,
    'seq': seq,
    'bin_cutoff': bin_cutoff,
    'msa_name': msa_name,
    'msa_start': msa_start,
    'msa_end': msa_end,
    'msa_num_seqs': msa_num_seqs,
    'pdb': pdb,
    'custom_id': custom_id,
}
mismatched = {name: len(values) for name, values in per_entry_inputs.items() if len(values) != n_entries}
assert not mismatched, f"expected {n_entries} entries in every input list, got {mismatched}"

reference = reference_builder(n_entries, prot_name, seq, bin_cutoff, msa_name, msa_start, msa_end, msa_num_seqs, pdb, reference_name, custom_id)

In [3]:
# Display the reference table returned by reference_builder (as the cell's last
# expression) to check that it was built correctly
reference

Unnamed: 0,DMS_id,DMS_filename,target_seq,seq_len,DMS_binarization_cutoff,MSA_filename,MSA_start,MSA_end,MSA_len,MSA_num_seqs,weight_file_name,pdb_file
0,protein1_custom_id1,protein1_custom_id1.csv,APRTEINSEQEVNCE,15,0.5,msa_file1.a2m,1,14,14,100,msa_file1_weights.npy,pdb_file1.pdb
1,protein2,protein2.csv,YPRTEINSEQEVNCE,15,0.5,msa_file2.a2m,1,14,14,200,msa_file2_weights.npy,pdb_file2.pdb


# FlA Reference Building

In [4]:
# Change the following variables to fit the proteins of interest.
# All three reference entries describe the same protein and differ only in their
# custom identifier, so each shared value is written once and replicated per entry.
custom_id = [None, 'SMscan_insilico_library', 'DMscan_combi_insilico_library'] # Custom identifiers for the proteins of interest
n_entries = len(custom_id)  # one reference row per custom identifier

# The wild-type sequence shared by every entry
fla_wt_seq = 'MAANGSQRPIIAFMSDLGTTDDSVAQCKGLMHSICPGVTVVDVCHSMTPWDVEEGARYIVDLPRFFPEGTVFATTTYPATGTTTRSVAVRIRQAAKGGARGQWAGSGDGFERADGSYIYIAPNNGLLTTVLEEHGYIEAYEVTSTKVIPANPEPTFYSREMVAIPSAHLAAGFPLAEVGRRLDDSEIVRFHRPAVEISGEALSGVVTAIDHPFGNIWTNIHRTDLEKAGIGQGKHLKIILDDVLPFEAPLTPTFADAGAIGNIAFYLNSRGYLSLARNAASLAYPYNLKAGLKVRVEAR'
# The pdb file shared by every entry
fla_pdb = 'fluorinase_5794d_unrelaxed_rank_004_alphafold2_ptm_model_1_seed_000.pdb'

prot_name = ['FlA_HIM_RelAct'] * n_entries # The names/identifiers of the proteins
seq = [fla_wt_seq] * n_entries # The wild-type sequences of the proteins
bin_cutoff = [80] * n_entries # The cutoffs for binarizing DMS data, often just WT experimental value
msa_name = ['FlA_HIM_alignment'] * n_entries # The names of the MSA files without the file extension
msa_start = [1] * n_entries # The starting positions of the proteins of interest in the MSAs
msa_end = [299] * n_entries # The ending positions of the proteins of interest in the MSAs
msa_num_seqs = [9519] * n_entries # The numbers of sequences in the MSAs
pdb = [fla_pdb] * n_entries # The names of the pdb files
reference_name = 'FlA_reference.csv' # The name of the resulting reference file

# Guard against the lists drifting apart if someone later edits one of them by hand.
per_entry_inputs = {
    'prot_name': prot_name,
    'seq': seq,
    'bin_cutoff': bin_cutoff,
    'msa_name': msa_name,
    'msa_start': msa_start,
    'msa_end': msa_end,
    'msa_num_seqs': msa_num_seqs,
    'pdb': pdb,
    'custom_id': custom_id,
}
mismatched = {name: len(values) for name, values in per_entry_inputs.items() if len(values) != n_entries}
assert not mismatched, f"expected {n_entries} entries in every input list, got {mismatched}"

# Presumably the first argument of reference_builder is the number of entries -- it
# equals the list length in every call in this notebook; verify against the helper.
reference = reference_builder(n_entries, prot_name, seq, bin_cutoff, msa_name, msa_start, msa_end, msa_num_seqs, pdb, reference_name, custom_id)

In [5]:
# Display the reference table returned by reference_builder (as the cell's last
# expression) to check that it was built correctly
reference

Unnamed: 0,DMS_id,DMS_filename,target_seq,seq_len,DMS_binarization_cutoff,MSA_filename,MSA_start,MSA_end,MSA_len,MSA_num_seqs,weight_file_name,pdb_file
0,FlA_HIM_RelAct,FlA_HIM_RelAct.csv,MAANGSQRPIIAFMSDLGTTDDSVAQCKGLMHSICPGVTVVDVCHS...,299,80,FlA_HIM_alignment.a2m,1,299,299,9519,FlA_HIM_alignment_weights.npy,fluorinase_5794d_unrelaxed_rank_004_alphafold2...
1,FlA_HIM_RelAct_SMscan_insilico_library,FlA_HIM_RelAct_SMscan_insilico_library.csv,MAANGSQRPIIAFMSDLGTTDDSVAQCKGLMHSICPGVTVVDVCHS...,299,80,FlA_HIM_alignment.a2m,1,299,299,9519,FlA_HIM_alignment_weights.npy,fluorinase_5794d_unrelaxed_rank_004_alphafold2...
2,FlA_HIM_RelAct_DMscan_combi_insilico_library,FlA_HIM_RelAct_DMscan_combi_insilico_library.csv,MAANGSQRPIIAFMSDLGTTDDSVAQCKGLMHSICPGVTVVDVCHS...,299,80,FlA_HIM_alignment.a2m,1,299,299,9519,FlA_HIM_alignment_weights.npy,fluorinase_5794d_unrelaxed_rank_004_alphafold2...
