In [1]:
import os
from torchdrug import data

# Folder holding the per-chain PDB files that the rest of the notebook inspects.
atp_folder = '../atp'

# Names of the PDB files that raised while being parsed; later cells read this list.
failed_to_load = []
# Maps each failed filename to "ExceptionType: message" so the cause of a failure
# is available without parsing the file a second time.
load_errors = {}

from tqdm import tqdm

# Filter and sort before iterating: the progress bar then counts only PDB files,
# and the order of failed_to_load no longer depends on the (arbitrary) order in
# which os.listdir returns directory entries.
pdb_filenames = sorted(name for name in os.listdir(atp_folder) if name.endswith('.pdb'))

for filename in tqdm(pdb_filenames, desc="Loading PDB files"):
    file_path = os.path.join(atp_folder, filename)
    try:
        protein = data.Protein.from_pdb(file_path)
    except Exception as e:
        # Best-effort scan: keep going, but remember both the file and the reason.
        failed_to_load.append(filename)
        load_errors[filename] = f"{type(e).__name__}: {e}"

Loading PDB files: 100%|██████████| 435/435 [00:38<00:00, 11.24it/s]


In [6]:
def remove_chain(file_name):
    """Return the '.pdb' file name obtained by dropping the identifier's last character.

    The text before the first '.' in ``file_name`` is taken as the identifier;
    its final character (presumably the chain letter, going by the function
    name) is removed and '.pdb' is appended, e.g. '5J1SB.pdb' -> '5J1S.pdb'.
    """
    stem, _, _ = file_name.partition('.')
    return f"{stem[:-1]}.pdb"

# For every file that failed to parse, pair its size in atp_folder with the size of
# the corresponding file in ../pdb_tmp (name produced by remove_chain), then list
# the entries from smallest to largest.
print("\nPDB files that failed to load (sorted by file size):")

failed_files = sorted(
    (
        (
            name,
            os.path.getsize(os.path.join(atp_folder, name)),
            os.path.getsize(os.path.join('../pdb_tmp', remove_chain(name))),
        )
        for name in failed_to_load
    ),
    key=lambda entry: entry[1],  # order by the size of the file in atp_folder
)

for file, size, size_before in failed_files:
    print(f"{file}: File size = {size} bytes (before: {size_before} bytes)")


PDB files that failed to load (sorted by file size):


In [7]:
# Parse each previously failed file once more, this time printing the exception
# text so the reason for the failure is visible.
for filename in failed_to_load:
    try:
        # Build the path with os.path.join, as the other cells of this notebook do.
        protein = data.Protein.from_pdb(os.path.join(atp_folder, filename))
    except Exception as e:
        print(f"Failed to load: {filename}, error: {e}")

In [8]:
# Identifiers to re-validate; validate_unmatched looks each one up as '<id>.pdb'
# inside atp_folder.
unmatched_proteins = ['5J1SB', '4TU0A', '3BG5A']

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'data.script'", i.e. this project-local
# import did not resolve in the kernel that produced the notebook. Presumably a
# working-directory / sys.path issue — confirm where the `data.script` package lives.
from data.script.generate_pdb import read_file, try_loading_pdb
# Binds the name `data` to torchdrug's `data` module (the same binding the first
# cell creates), which is what the code below relies on for data.DataLoader.
from torchdrug import utils, layers, data
from torchdrug.layers import geometry

def validate_unmatched(pdb_ids):
    """Compare each PDB file's residue sequence with the one listed in train.txt.

    For every identifier, '<id>.pdb' in ``atp_folder`` is loaded, passed through
    a graph construction model that keeps only alpha-carbon nodes, and the
    resulting sequence (with '.' characters removed) is compared with the
    reference sequence for that identifier. The verdict is printed; nothing is
    returned.

    Args:
        pdb_ids: iterable of identifiers such as '5J1SB'.
    """
    # read_file returns four values; the second is used as the sequences and the
    # fourth as the identifiers aligned with them. Give the fourth a real name
    # instead of reading it back through the throwaway `_`.
    _, sequences, _, reference_ids = read_file(os.path.join(atp_folder, 'train.txt'))

    # Built once: the construction model is configured identically for every
    # protein, so there is no need to recreate it on each iteration.
    graph_construction_model = layers.GraphConstruction(
        node_layers=[geometry.AlphaCarbonNode()],
        edge_layers=[
            geometry.SpatialEdge(radius=10.0, min_distance=5),
            geometry.KNNEdge(k=10, min_distance=5),
            geometry.SequentialEdge(max_distance=2),
        ],
        edge_feature="gearnet",
    )

    for pdb_id in pdb_ids:
        print(f'Validating {pdb_id}...')
        file_path = os.path.join(atp_folder, f'{pdb_id}.pdb')
        protein = try_loading_pdb(file_path)
        if not protein:
            print(f'{pdb_id}: error: protein not loaded')
            continue

        # First reference sequence whose identifier matches. The default keeps an
        # unknown identifier from raising StopIteration and aborting the whole run.
        sequence = next(
            (seq for seq, ref_id in zip(sequences, reference_ids) if ref_id == pdb_id),
            None,
        )
        if sequence is None:
            print(f'{pdb_id}: error: no sequence listed in train.txt')
            continue

        # Get sequence from protein after graph construction model leaving only alpha carbon nodes
        dataloader = data.DataLoader([protein], batch_size=1)
        # NOTE(review): utils.cuda presumably moves the batch to a GPU — confirm
        # this check is meant to require one.
        batch = utils.cuda(next(iter(dataloader)))
        batch = graph_construction_model(batch)

        protein_sequence = ''.join(
            residue for residue in batch.to_sequence()[0] if residue != '.'
        )
        if protein_sequence != sequence:
            print(f'Validation failed for {pdb_id}: sequence mismatch.')
            print(f'Length of alpha carbons: {len(protein_sequence)}, length of given sequence: {len(sequence)}')
            print('Sequence from protein:')
            print(protein_sequence)
            print('Sequence from txt:')
            print(sequence)
        elif protein.num_residue != len(sequence):
            print(f'Validation failed for {pdb_id}: length mismatch.')
            print(f'Number of residues: {protein.num_residue}, length of sequence: {len(sequence)}')
        else:
            print(f'Validation passed for {pdb_id}')

# Run the sequence check for the identifiers listed in unmatched_proteins.
validate_unmatched(unmatched_proteins)


ModuleNotFoundError: No module named 'data.script'

In [9]:
# Spot check: can 5DN3A be parsed, and if so how many residues / which sequence?
# NOTE(review): try_loading_pdb must already be defined when this cell runs; the
# recorded output of this cell is a NameError for that name.
print("Attempting to load 5DN3A...")
file_path = os.path.join(atp_folder, '5DN3A.pdb')
protein = try_loading_pdb(file_path)

if not protein:
    print("Failed to load 5DN3A")
else:
    print("Successfully loaded 5DN3A")
    print(f"Number of residues: {protein.num_residue}")
    print(f"Sequence: {protein.to_sequence()}")



Attempting to load 5DN3A...


NameError: name 'try_loading_pdb' is not defined

In [10]:
def try_loading_pdb(file_path):
    """Load a PDB file via ``data.Protein.from_pdb``, returning ``None`` on failure.

    Args:
        file_path: path handed unchanged to ``data.Protein.from_pdb``.

    Returns:
        Whatever ``data.Protein.from_pdb`` returns, or ``None`` when that call
        raises any ``Exception``. In the failure case a one-line message naming
        the file and the exception is printed.
    """
    try:
        return data.Protein.from_pdb(file_path)
    except Exception as e:
        # Best-effort loader: callers only test the result for truthiness, so
        # report the cause here instead of discarding it, then return None.
        print(f"Error loading {file_path}: {type(e).__name__}: {e}")
        return None

# Spot check: can 3EPSA be parsed, and if so how many residues / which sequence?
print("Attempting to load 3EPSA...")
file_path = os.path.join(atp_folder, '3EPSA.pdb')
protein = try_loading_pdb(file_path)

if not protein:
    print("Failed to load 3EPSA")
else:
    print("Successfully loaded 3EPSA")
    print(f"Number of residues: {protein.num_residue}")
    print(f"Sequence: {protein.to_sequence()}")

Attempting to load 3EPSA...
Successfully loaded 3EPSA
Number of residues: 566
Sequence: PRGLELLIAQTILQGFDAQYGRFLEVTSGAQQRFEQADWHAVQQAMKNRIHLYDHHVGLVVEQLRCITNGQSTDAEFLLRVKEHYTRLLPDYPRFEIAESFFNSVYCRLFDHRSLTPERLFIFSSQPERRFRTIPRPLAKDFHPDHGWESLLMRVISDLPLRLHWQNKSRDIHYIIRHLTETLGPENLSKSHLQVANELFYRNKAAWLVGKLITPSGTLPFLLPIHQTDDGELFIDTCLTTTAEASIVFGFARSYFMVYAPLPAALVEWLREILPGKTTAELYMAIGCQKHAKTESYREYLVYLQGCNEQFIEAPGIRGMVMLVFTLPGFDRVFKVIKDKFAPQKEMSAAHVRACYQLVKEHDRVGRMADTQEFENFVLEKRHISPALMELLLQEAAEKITDLGEQIVIRHLYIERRMVPLNIWLEQVEGQQLRDAIEEYGNAIRQLAAANIFPGDMLFKNFGVTRHGRVVFYDYDEICYMTEVNFRDIPPPRYP.PWYSVSPGDVFPEEFRHWLCADPRIGPLFEEMHADLFRADYWRALQNRIREGHVEDVYAYRRRQRFSVRYG
