#### Jupyter notebook that takes in a concatenated file of output logs for colabfold and parses through to produce a dataframe connecting the pdb file name to its given plddt and ptm score
Useful for filtering models for specific plddt or ptm scores

In [None]:
import pandas as pd

In [5]:
#Function that takes in a file of all of the colabfold output (essentially the output logs concatenated together) and parses through them to pull out the lines corresponding to the plddt of each given model 
def parse_plddt(file_path):
    with open(file_path, "r") as file:
    #Iterate over all the lines in the file
        line = file.readline().strip().split(' ')
        plddt_dict = {}
        refold_list = []
        while line: 
            if line[2]=='Query':
                next = file.readline().strip().split(' ')
                #This will only take the top ranked model 
                if next[2]=='rank_001_alphafold2_ptm_model_1_seed_000': 
                    query = line[4]
                    plddt = next[3]
                    ptm = next[4]
                    plddt_dict[query] = [plddt, ptm]
                    line = file.readline().strip().split(' ')
                elif next[2] == 'Query':
                    refold_list.append(line[4])
                    line = next
    #close the file
    file.close()   
    return plddt_dict, refold_list     

['2023-05-18', '15:21:33,702', 'Query', '1/5:', '1000_D4994_C39_H1_Bin_319_scaffold_53331_11', '(length', '1000)']
['2023-05-18', '15:30:26,648', 'Query', '2/5:', '1000_D4994_C39_H1_Bin_33_scaffold_20337_7', '(length', '1000)']
['2023-05-18', '15:37:28,855', 'Query', '3/5:', '1002_D4994_C39_H1_Bin_122_scaffold_285550_9', '(length', '1002)']
['2023-05-18', '15:44:19,379', 'Query', '4/5:', '1002_D4994_C39_H1_Bin_37_scaffold_645156_1', '(length', '1002)']
['2023-05-18', '15:50:44,867', 'Query', '5/5:', '1003_D4994_C39_H1_Bin_629_scaffold_26817_8', '(length', '1003)']
['2023-05-19', '08:40:37,855', 'Query', '1/10:', '2333_D4994_C39_H1_Bin_33_scaffold_432330_1', '(length', '2333)']
['2023-05-19', '09:03:48,566', 'Query', '2/10:', '2370_D4994_C39_H1_Bin_319_scaffold_80428_3', '(length', '2370)']
['2023-05-19', '09:29:36,308', 'Query', '3/10:', '2397_D4994_C39_H1_Bin_674_scaffold_381877_17', '(length', '2397)']
['2023-05-19', '09:57:22,946', 'Query', '4/10:', '2419_D4994_C39_H1_Bin_319_scaffo

IndexError: list index out of range

In [None]:
#File path of concatenated log (output not error) files from colabfold 
plddt_file_path = '../plddt_all.txt'
output_folder = '../plddt_output/'

In [None]:
#Generate a dictionary of the plddt values matched to model names using the above function and obtain the list of proteins that didn't fold 
plddt_dict, refold_list = parse_plddt(plddt_file_path)

In [6]:

#save a pandas dictionary as a csv file with headers
df = pd.DataFrame.from_dict(plddt_dict, orient='index', columns=['plddt', 'ptm'])
#name index column
#df.index.name = 'query'
df.head()
#Output dataframe as tsv file
output_file = folder_path + 'plddt_dict_all.tsv'
df.to_csv(output_file, sep='\t', index=True)
#plddt_dict

In [7]:
#Output list of proteins to refold as .txt file
output_file = folder_path + 'refold_list_all_ls6.txt'
with open(output_file, 'w') as f:
    for item in refold_list:
        f.write("%s\n" % item)
#refold_list

In [1]:
! pwd

/stor/work/Marcotte/project/drbarth/asgard/membrane_proteome/scripts


#### Now that you've saved the plddt scores, read back in the file and do whatever filtering is desired!

In [1]:
#Read in the two pLDDT files and merge them
# Path: plddt_parse_test.ipynb
file_path = '../ls6_cf_old/plddt_dict_all_ls6.tsv'
file_path2 = '../colabfold_dbarth/plddt_dict_all_cf.tsv'
ls6 = pd.read_csv(file_path, sep='\t', index_col=0)
cf = pd.read_csv(file_path2, sep='\t', index_col=0)

In [2]:
#split the entries in the plddt and ptm columns by '=' and take the second entry
ls6['plddt'] = ls6['plddt'].str.split('=').str[1]
ls6['ptm'] = ls6['ptm'].str.split('=').str[1]   
cf['plddt'] = cf['plddt'].str.split('=').str[1]
cf['ptm'] = cf['ptm'].str.split('=').str[1]

In [11]:
#Put both dataframes into one dataframe
plddt_all = pd.concat([ls6, cf])

#Name index column
plddt_all.index.name = 'ProteinID'

#Parse the index column and remove all characters before and including the first underscore
plddt_all.index = plddt_all.index.str.replace(r'^[^_]*_', '', regex=True)
#Reset index
plddt_all.reset_index(inplace=True)
plddt_all.head()
#Export plddt_all to csv
plddt_all.to_csv('plddt_all.csv', index=False)

In [5]:
print(plddt_all.shape, cf.shape, ls6.shape)
plddt_all.head()

(8876, 2) (6041, 2) (2835, 2)


Unnamed: 0,plddt,ptm
1000_D4994_C39_H1_Bin_319_scaffold_53331_11,67.8,0.674
1000_D4994_C39_H1_Bin_33_scaffold_20337_7,68.7,0.629
1002_D4994_C39_H1_Bin_122_scaffold_285550_9,51.2,0.356
1002_D4994_C39_H1_Bin_37_scaffold_645156_1,80.4,0.758
1003_D4994_C39_H1_Bin_629_scaffold_26817_8,86.9,0.758


In [6]:
#Filter plddt_all to only include entries with a plddt greater than 60
plddt_70 = plddt_all[plddt_all['plddt'].astype(float) >= 70]
print(plddt_70.shape)
plddt_all.head()


(6306, 2)


Unnamed: 0,plddt,ptm
1000_D4994_C39_H1_Bin_319_scaffold_53331_11,67.8,0.674
1000_D4994_C39_H1_Bin_33_scaffold_20337_7,68.7,0.629
1002_D4994_C39_H1_Bin_122_scaffold_285550_9,51.2,0.356
1002_D4994_C39_H1_Bin_37_scaffold_645156_1,80.4,0.758
1003_D4994_C39_H1_Bin_629_scaffold_26817_8,86.9,0.758


In [18]:
length = plddt_all_60.shape[0]
length
what_is_left = plddt_all_60.index[6854:]
len(what_is_left)

489

In [7]:
#Now, we need to use this list to get all of the matching pdb files into a new folder
file_path = '../clustering/pdb_files/'
folder_path = '../clustering/plddt_over70/.'
for name in plddt_70.index:
    pdb_file = file_path + name + '_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb'
    print(pdb_file)
    ! cp $pdb_file $folder_path

../clustering/pdb_files/1002_D4994_C39_H1_Bin_37_scaffold_645156_1_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/1003_D4994_C39_H1_Bin_629_scaffold_26817_8_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/381_D4994_C39_H1_Bin_37_scaffold_367816_2_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/382_D4994_C39_H1_Bin_261_scaffold_703355_7_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/2930_D4994_C39_H1_Bin_33_scaffold_380869_32_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/442_D4994_C39_H1_Bin_733_scaffold_177621_3_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/442_D4994_C39_H1_Bin_733_scaffold_564289_9_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/442_D4994_C39_H1_Bin_797_scaffold_585345_6_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb
../clustering/pdb_files/

489