In [2]:
import requests
import os
import pandas as pd
from Bio.PDB import MMCIFParser
from Bio.PDB import DSSP
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from multiprocessing import Value
import warnings

warnings.filterwarnings("ignore")

counter = Value('i', 0)

def save_pdb_file(pdb_id, save_path):
    file_path = os.path.join(save_path, f"{pdb_id.upper()}.cif")
    
    if os.path.exists(file_path):
        print(f"File {file_path} already exists. Skipping download.")
        return True
    
    try:
        url = f"https://files.rcsb.org/download/{pdb_id.upper()}.cif"
        response = requests.get(url)
        response.raise_for_status()
        pdb_content = response.text
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(pdb_content)
        return True
    except Exception as e:
        print(f"Failed to fetch and save PDB file for PDB ID {pdb_id}: {e}")
        return False

def save_multiple_pdb_files(df, save_path, column='PDB'):
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame.")
        return
    
    unique_pdb_ids = set()
    for pdb_ids_str in df[column]:
        pdb_ids = pdb_ids_str.split(';')[:-1]
        unique_pdb_ids.update(pdb_ids)
    
    with ThreadPoolExecutor(max_workers=24) as executor:
        results = list(executor.map(lambda pdb_id: save_pdb_file(pdb_id, save_path), unique_pdb_ids))
    
    print(f"Downloaded and saved {sum(results)} out of {len(unique_pdb_ids)} PDB files.")

def extract_secondary_structures(structure, file):
    model = structure[0]

    dssp = DSSP(model, file)
    
    data = []
    current_ss_type = None
    current_sequence = []
    current_start_pos = None
    current_end_pos = None
    
    for key in dssp.keys():
        chain_id, res_id = key
        index, aa, ss, rel_acc, phi, psi, *_ = dssp[key]
        pos = res_id[1]
    
        if current_ss_type is None:
            current_ss_type = ss
            current_start_pos = pos
    
        if ss == current_ss_type:
            current_sequence.append(aa)
            current_end_pos = pos
        else:
            if current_ss_type != '-':
                data.append([current_ss_type, ''.join(current_sequence), (current_start_pos, current_end_pos)])
            current_ss_type = ss
            current_sequence = [aa]
            current_start_pos = pos
            current_end_pos = pos
    
    if current_sequence and current_ss_type != '-':
        data.append([current_ss_type, ''.join(current_sequence), (current_start_pos, current_end_pos)])
    
    return data

def worker(files_subset, save_cif_path):
    global counter
    rows = []
    errors = []
    for filename in files_subset:
        if filename.endswith('.cif'):
            filepath = os.path.join(save_cif_path, filename)
            pdb_id = filename.split('.')[0]
            
            try:
                mmcif_dict = MMCIF2Dict(filepath)
                method = mmcif_dict.get("_exptl.method", ["Not available"])
                method = ', '.join(method) if isinstance(method, list) else method
                
                resolution = mmcif_dict.get("_refine.ls_d_res_high", ["Not available"])
                try:
                    resolution = float(resolution[0]) if isinstance(resolution, list) else resolution
                except ValueError:
                    resolution = resolution[0] if isinstance(resolution, list) else resolution
                
                parser = MMCIFParser(QUIET=True)
                structure = parser.get_structure(pdb_id, filepath)
                
                secondary_structures = extract_secondary_structures(structure, filepath)
                
                for ss_type, sequence, positions in secondary_structures:
                    row = [pdb_id, method, resolution, sequence, ss_type, positions]
                    rows.append(row)
                    
            except Exception as e:
                errors.append(f"An error occurred while processing {filename}: {e}")
            
            with counter.get_lock():
                counter.value += 1
                print(f"Processed {counter.value} files", end='\r')
    
    return rows, errors

def parse_cif_files(df, save_cif_path, num_cores=8, num_files=None):
    global counter
    counter.value = 0

    files = os.listdir(save_cif_path)[:num_files]
    total_files = len(files)
    
    print(f"Total files to process: {total_files}")

    avg_len = total_files // num_cores
    subsets = [files[i:i + avg_len] for i in range(0, total_files, avg_len)]

    all_rows = []
    all_errors = []

    with ProcessPoolExecutor(max_workers=num_cores) as executor:
        results = executor.map(worker, subsets, [save_cif_path]*len(subsets))

    for rows, errors in results:
        all_rows.extend(rows)
        all_errors.extend(errors)

    columns = ["PDB", "Method", "Resolution", "Sequence", "Secondary Structure Type", "Positions"]
    df_to_append = pd.DataFrame(all_rows, columns=columns)
    df = pd.concat([df, df_to_append], ignore_index=True)
    
    for error in all_errors:
        print(error)

    return df

In [3]:
save_cif_path = './cif_files'
os.makedirs(save_cif_path, exist_ok=True)

dataset = pd.read_csv('./uniprot_files/uniprotkb_reviewed.tsv', sep='\t')
dataset.dropna(subset=['PDB'], inplace=True)
dataset

Unnamed: 0,Entry,PDB
0,A0A009IHW8,7UWG;7UXU;
1,A0A023I7E1,4K35;4K3A;5XBZ;5XC2;
2,A0A024B7W1,5GOZ;5GP1;5H30;5H32;5H37;5IRE;5IZ7;5JMT;5KQR;5...
3,A0A024SC78,4PSC;4PSD;4PSE;
7,A0A059TC02,4R1S;4R1T;
...,...,...
569347,Q9X1Q6,3DCM;
569440,Q9YC08,2CXH;
569495,Q9Z7A3,3Q9D;
569541,Q9ZB78,4XNG;


In [12]:
save_multiple_pdb_files(dataset, save_cif_path)

KeyboardInterrupt: 

In [4]:
columns = ["PDB", "Method", "Resolution", "Sequence", "Secondary Structure Type", "Positions"]
df = pd.DataFrame(columns=columns)

df = parse_cif_files(df, save_cif_path, num_cores=12, num_files=1000)
df.to_csv('secondary_structures.csv', index=False)
df

Total files to process: 1000
An error occurred while processing 5LER.cif: '1'
An error occurred while processing 5LFB.cif: '1'
An error occurred while processing 7UXX.cif: 'A'
An error occurred while processing 7UXZ.cif: 'A'
An error occurred while processing 7NKT.cif: 'A'
An error occurred while processing 6Z1Q.cif: 'A'
An error occurred while processing 6Z1T.cif: 'A'
An error occurred while processing 6Z2E.cif: 'A'


Unnamed: 0,PDB,Method,Resolution,Sequence,Secondary Structure Type,Positions
0,101M,X-RAY DIFFRACTION,2.07,EGEWQLVLHVWAKV,H,"(4, 17)"
1,101M,X-RAY DIFFRACTION,2.07,EAD,G,"(18, 20)"
2,101M,X-RAY DIFFRACTION,2.07,VAGHGQDILIRLFKS,H,"(21, 35)"
3,101M,X-RAY DIFFRACTION,2.07,PETLEK,G,"(37, 42)"
4,101M,X-RAY DIFFRACTION,2.07,DRVKH,T,"(44, 48)"
...,...,...,...,...,...,...
145015,3L2C,X-RAY DIFFRACTION,1.868,SK,T,"(158, 159)"
145016,3L2C,X-RAY DIFFRACTION,1.868,FIKV,E,"(160, 163)"
145017,3L2C,X-RAY DIFFRACTION,1.868,A,S,"(167, 167)"
145018,3L2C,X-RAY DIFFRACTION,1.868,TG,T,"(168, 169)"


# Decypher of DSSP codes
| Code | Structure                   |
|------|-----------------------------|
| H    | Alpha helix (4-12)          |
| B    | Isolated beta-bridge residue|
| E    | Strand                      |
| G    | 3-10 helix                  |
| I    | Pi helix                    |
| T    | Turn                        |
| S    | Bend                        |
| -    | None                        |