In [2]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os
import re
import requests
from Bio import SeqIO

def save_fasta_file(pdb_id, save_path):
    file_path = os.path.join(save_path, f"{pdb_id.upper()}.fasta")
    
    if os.path.exists(file_path):
        print(f"File {file_path} already exists. Skipping download.")
        return True
    
    try:
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"
        response = requests.get(url)
        response.raise_for_status()
        pdb_content = response.text
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(pdb_content)
        return True
    except Exception as e:
        print(f"Failed to fetch and save PDB file for PDB ID {pdb_id}: {e}")
        return False

def save_multiple_fasta_files(df, save_path, column='PDB'):
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame.")
        return
    
    unique_pdb_ids = set()
    for ids in df[column]:
        unique_pdb_ids.update(ids.split(';')[:-1])
    
    with ThreadPoolExecutor(max_workers=24) as executor:
        results = list(executor.map(lambda pdb_id: save_fasta_file(pdb_id, save_path), unique_pdb_ids))
    
    print(f"Downloaded and saved {sum(results)} out of {len(unique_pdb_ids)} PDB files.")

def parse_fasta_files(df, directory):
    pattern = re.compile(r'\bChain[s]?\s|\[auth[^\]]*\]', re.IGNORECASE)
    pdb_ids = df['PDB'].str.split(';').explode().dropna().unique()
    
    data = []
    for file_name in os.listdir(directory):
        if file_name.endswith('.fasta'):
            file_path = os.path.join(directory, file_name)
            with open(file_path, 'r') as f:
                for record in SeqIO.FastaIO.SimpleFastaParser(f):
                    pdb = record[0].split('_')[0]
                    chain = re.sub(pattern, '', record[0].split('|')[1])
                    sequence = record[1]
                    if pdb in pdb_ids:
                        data.append({'PDB': pdb, 'Chain': chain, 'Sequence': sequence})
    result_df = pd.DataFrame(data)
    return result_df

In [3]:
uniprot_files = './uniprot_files'
fasta_files = './fasta_files'
os.makedirs(uniprot_files, exist_ok=True)
os.makedirs(fasta_files, exist_ok=True)

pdb_df = pd.read_csv('./uniprot_files/uniprotkb_reviewed.tsv', sep='\t')
pdb_df.dropna(subset=['PDB'], inplace=True)
pdb_df

Unnamed: 0,Entry,PDB
0,A0A009IHW8,7UWG;7UXU;
1,A0A023I7E1,4K35;4K3A;5XBZ;5XC2;
2,A0A024B7W1,5GOZ;5GP1;5H30;5H32;5H37;5IRE;5IZ7;5JMT;5KQR;5...
3,A0A024SC78,4PSC;4PSD;4PSE;
7,A0A059TC02,4R1S;4R1T;
...,...,...
569347,Q9X1Q6,3DCM;
569440,Q9YC08,2CXH;
569495,Q9Z7A3,3Q9D;
569541,Q9ZB78,4XNG;


In [18]:
save_multiple_fasta_files(pdb_df, fasta_files)

File ./fasta_files/1E9W.fasta already exists. Skipping download.
File ./fasta_files/3THP.fasta already exists. Skipping download.
File ./fasta_files/4KVL.fasta already exists. Skipping download.
File ./fasta_files/6TR6.fasta already exists. Skipping download.
File ./fasta_files/6KWU.fasta already exists. Skipping download.
File ./fasta_files/2PAD.fasta already exists. Skipping download.
File ./fasta_files/3HSF.fasta already exists. Skipping download.
File ./fasta_files/4VHB.fasta already exists. Skipping download.
File ./fasta_files/2JAJ.fasta already exists. Skipping download.
File ./fasta_files/5P0W.fasta already exists. Skipping download.
File ./fasta_files/4C14.fasta already exists. Skipping download.
File ./fasta_files/4M12.fasta already exists. Skipping download.
File ./fasta_files/7C21.fasta already exists. Skipping download.
File ./fasta_files/4X2P.fasta already exists. Skipping download.
File ./fasta_files/6KHY.fasta already exists. Skipping download.
File ./fasta_files/7DOH.f

KeyboardInterrupt: 

In [4]:
fasta_df = parse_fasta_files(pdb_df, fasta_files)
fasta_df

Unnamed: 0,PDB,Chain,Sequence
0,13GS,"A, B",MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKA...
1,1A1A,"A, C",MDSIQAEEWYFGKITRRESERLLLNAENPRGTFLVRESETTKGAYS...
2,1A1A,"B, D",XYEX
3,1A6V,"A, C, E",QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLF...
4,1A6V,"B, D, F",QVQLQQPGAELVKPGASVKLSCKASGYTFTSYWMHWVKQRPGRGLE...
...,...,...,...
1649,1FIW,B,DNTTCDGPCGVRFRQNRQGGVR
1650,1FKL,A,GVQVETISPGDGRTFPKRGQTCVVHYTGMLEDGKKFDSSRDRNKPF...
1651,1FOE,"A, C, E, G",AMGRQLSDADKLRKVICELLETERTYVKDLNCLMERYLKPLQKETF...
1652,1FOE,"B, D, F, H",MQAIKCVVVGDGAVGKTCLLISYTTNAFPGEYIPTVFDNYSANVMV...


In [5]:
fasta_df['PDB'].unique()

array(['13GS', '1A1A', '1A6V', '1A81', '1AKN', '1ASS', '1AVA', '1B09',
       '1B8O', '1BC7', '1C09', '1C1X', '1C2B', '1C5V', '1CGT', '1CIE',
       '1CIZ', '1CJ7', '1CM0', '1CMK', '4AU9', '4AVU', '4AVV', '4B3B',
       '4BLJ', '4BWI', '4C14', '4C1G', '4CR8', '4CTM', '4D4Y', '4DA7',
       '4DDP', '4DMD', '4DOV', '4DRU', '4E76', '4EAX', '4ESQ', '4EWO',
       '4F4M', '4F5A', '4F5Z', '4FA1', '4FAI', '4FM5', '4FTN', '4FX7',
       '4G6H', '4GAD', '4GAF', '4GFV', '5SYK', '5SZJ', '5T4I', '5TOZ',
       '5U05', '5UIR', '5UQ8', '5UU9', '5VB6', '5VO1', '5VO5', '5VS4',
       '5VWI', '5VZ4', '5W0F', '5WBY', '5WFC', '5WOL', '5WSV', '5WTJ',
       '5X5D', '5X9Z', '5XQA', '5XTQ', '5XXF', '5XZH', '5YBF', '5YUH',
       '2I5T', '2ID8', '2IEO', '2IJH', '2IOO', '2IYV', '2J10', '2JAJ',
       '2K1N', '2K1S', '2K6M', '2KFD', '2L5F', '2LIF', '2LMJ', '2LSO',
       '2M16', '2MEI', '2MP9', '2N1F', '7BRJ', '7BTF', '7C21', '7CT1',
       '7D02', '7D45', '7D6W', '7DJT', '7DLQ', '7DOH', '7E5V', '7EDP',
      