In [61]:
import requests
import pandas as pd

def fetch_pdb_header(pdb_id, timeout=30):
    """Download the PDBe ``.header`` file for a single PDB entry.

    Args:
        pdb_id: PDB identifier; it is lower-cased before being placed in the URL.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (a failure message is printed in that case).
    """
    url = f"https://www.ebi.ac.uk/pdbe/static/entry/download/{pdb_id.lower()}.header"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A network-level failure is reported the same way as a non-200 answer
        # so that one bad download does not abort a whole batch of fetches.
        print(f"Failed to fetch header for PDB ID {pdb_id}: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch header for PDB ID {pdb_id}")
    return None

def fetch_multiple_pdb_headers(pdb_ids):
    """Fetch the headers of several PDB entries concurrently.

    Args:
        pdb_ids: Iterable of PDB identifiers.

    Returns:
        dict mapping each PDB identifier to whatever ``fetch_pdb_header``
        returned for it.
    """
    from concurrent.futures import ThreadPoolExecutor

    pending = {}
    with ThreadPoolExecutor() as pool:
        for pdb_id in pdb_ids:
            pending[pdb_id] = pool.submit(fetch_pdb_header, pdb_id)
    # Leaving the ``with`` block waits for every submitted job to finish,
    # so the futures below are already resolved.

    headers = {}
    for pdb_id, job in pending.items():
        headers[pdb_id] = job.result()
    return headers

def format_dbref(dbref_list):
    """Group DBREF records by residue range and render them as strings.

    Args:
        dbref_list: Iterable of dicts with the keys ``'Start_Pos'``,
            ``'End_Pos'`` and ``'Chain'``.

    Returns:
        A list with one ``"<chains>=<start>-<end>"`` string per distinct
        range, in order of first appearance; chains that share a range are
        joined with ``/``.
    """
    chains_by_range = {}
    for record in dbref_list:
        range_key = "{}-{}".format(record['Start_Pos'], record['End_Pos'])
        chains_by_range.setdefault(range_key, []).append(record['Chain'])

    return [
        "{}={}".format("/".join(chain_ids), range_key)
        for range_key, chain_ids in chains_by_range.items()
    ]

def parse_pdb_header(header_content, protein_entry):
    """Extract resolution, method and matching DBREF ranges from a PDB header.

    Args:
        header_content: Header text, one record per line.
        protein_entry: Accession compared against the seventh
            whitespace-separated token of ``DBREF`` lines whose sixth token
            is ``UNP``.

    Returns:
        dict that may contain:
            ``'Resolution'`` - text taken from the first line yielding a
                non-empty value (after ``RESOLUTION.`` or after the colon of
                a ``RESOLUTION RANGE HIGH`` line);
            ``'Method'`` - text following the first non-empty ``EXPDTA`` record;
            ``'DBREF'`` - output of ``format_dbref`` for the matching DBREF
                lines (key is absent when nothing matched).
    """
    parsed_info = {}
    dbref_list = []

    for line in header_content.strip().split('\n'):
        # Resolution: only the first non-empty value is kept.
        if 'Resolution' not in parsed_info:
            resolution_value = ""
            if "RESOLUTION." in line:
                resolution_value = line.split("RESOLUTION.")[-1].split("ANGSTROMS")[0].strip()
            elif "RESOLUTION RANGE HIGH" in line:
                resolution_value = (
                    line.split(":")[-1].strip()
                    .replace("(A)", "")
                    .replace("ANGSTROMS", "")
                    .strip()
                )
            if resolution_value:
                parsed_info['Resolution'] = resolution_value

        # Experimental method: only the first non-empty EXPDTA record is kept.
        if 'Method' not in parsed_info and line.startswith("EXPDTA"):
            method_str = line.split("EXPDTA")[-1].strip()
            if method_str:
                parsed_info['Method'] = method_str

        # DBREF records that point at the requested accession.
        if line.startswith("DBREF"):
            tokens = line.split()
            # tokens[6] is read below, so at least seven tokens are required;
            # a shorter (truncated) line is skipped instead of raising IndexError.
            if len(tokens) >= 7 and tokens[5] == "UNP" and tokens[6] == protein_entry:
                dbref_list.append({
                    'Protein_ID': tokens[1],
                    'Chain': tokens[2],
                    'Start_Pos': tokens[3],
                    'End_Pos': tokens[4],
                })

    if dbref_list:
        parsed_info['DBREF'] = format_dbref(dbref_list)

    return parsed_info

def _split_pdb_ids(pdb_field):
    """Split a ';'-separated PDB field into a list of non-empty identifiers."""
    if not isinstance(pdb_field, str):
        # Missing values (e.g. NaN) carry no identifiers.
        return []
    return [pdb_id.strip() for pdb_id in pdb_field.split(';') if pdb_id.strip()]


def extract_pdb_info(dataset):
    """Fetch and parse the PDB header of every PDB ID listed in ``dataset``.

    Args:
        dataset: DataFrame with an ``'Entry'`` column and a ``'PDB'`` column
            holding ';'-separated PDB identifiers. Rows whose ``'PDB'`` value
            is missing contribute nothing.

    Returns:
        DataFrame with one row per (entry, PDB ID) pair whose header could be
        fetched. Each row holds the keys produced by ``parse_pdb_header``
        plus ``'Entry'``.
    """
    # Split every PDB field once and reuse the result for both passes.
    # Filtering empty pieces handles fields with or without a trailing ';'.
    ids_per_row = [_split_pdb_ids(pdb_field) for pdb_field in dataset['PDB']]

    all_pdb_ids = set()
    for pdb_ids in ids_per_row:
        all_pdb_ids.update(pdb_ids)

    fetched_headers = fetch_multiple_pdb_headers(list(all_pdb_ids))

    parsed_data_list = []
    for entry, pdb_ids in zip(dataset['Entry'], ids_per_row):
        for pdb_id in pdb_ids:
            header_content_fetched = fetched_headers.get(pdb_id, None)
            if not header_content_fetched:
                # Download failed or returned an empty body; skip this ID.
                continue

            parsed_info = parse_pdb_header(header_content_fetched, entry)
            parsed_info['Entry'] = entry
            parsed_data_list.append(parsed_info)

    return pd.DataFrame(parsed_data_list)

In [65]:
# Load the table and drop rows that have none of the four
# structure-related columns filled in.
dataset = (
    pd.read_csv('./uniprotkb_reviewed.tsv', sep='\t')
    .dropna(subset=['Helix', 'Turn', 'Beta strand', 'PDB'], how='all')
)
dataset

Unnamed: 0,Entry,Entry Name,Sequence,Helix,Turn,Beta strand,PDB
0,A0A009IHW8,ABTIR_ACIB9,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,"HELIX 143..145; /evidence=""ECO:0007829|PDB:7UW...","TURN 146..149; /evidence=""ECO:0007829|PDB:7UWG...","STRAND 135..142; /evidence=""ECO:0007829|PDB:7U...",7UWG;7UXU;
1,A0A023I7E1,ENG1_RHIMI,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,"HELIX 42..44; /evidence=""ECO:0007829|PDB:4K35""...","TURN 287..289; /evidence=""ECO:0007829|PDB:4K35...","STRAND 56..58; /evidence=""ECO:0007829|PDB:4K35...",4K35;4K3A;5XBZ;5XC2;
2,A0A024B7W1,POLG_ZIKVF,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,"HELIX 222..225; /evidence=""ECO:0007829|PDB:6CO...","TURN 237..241; /evidence=""ECO:0007829|PDB:6CO8...","STRAND 234..236; /evidence=""ECO:0007829|PDB:6C...",5GOZ;5GP1;5H30;5H32;5H37;5IRE;5IZ7;5JMT;5KQR;5...
3,A0A024SC78,CUTI1_HYPJR,MRSLAILTTLLAGHAFAYPKPAPQSVNRRDWPSINEFLSELAKVMP...,"HELIX 51..69; /evidence=""ECO:0007829|PDB:4PSC""...","TURN 94..100; /evidence=""ECO:0007829|PDB:4PSC""...","STRAND 48..50; /evidence=""ECO:0007829|PDB:4PSC...",4PSC;4PSD;4PSE;
7,A0A059TC02,CCR1_PETHY,MRSVSGQVVCVTGAGGFIASWLVKILLEKGYTVRGTVRNPDDPKNG...,"HELIX 17..28; /evidence=""ECO:0007829|PDB:4R1S""...","TURN 3..6; /evidence=""ECO:0007829|PDB:4R1T""; T...","STRAND 8..12; /evidence=""ECO:0007829|PDB:4R1S""...",4R1S;4R1T;
...,...,...,...,...,...,...,...
569347,Q9X1Q6,Y1570_THEMA,MLEKVYVALIHYPIKGKDGSIISTAVTNLDVHDIARTARTYNLKGY...,"HELIX 28..40; /evidence=""ECO:0007829|PDB:3DCM""...","TURN 161..164; /evidence=""ECO:0007829|PDB:3DCM...","STRAND 4..10; /evidence=""ECO:0007829|PDB:3DCM""...",3DCM;
569440,Q9YC08,BRIX_AERPE,MTTSRRPSPRIRSFVKDLSATIPGAFRFTRGHYSMEELAREAIIRG...,"HELIX 9..19; /evidence=""ECO:0007829|PDB:2CXH"";...",,"STRAND 1..6; /evidence=""ECO:0007829|PDB:2CXH"";...",2CXH;
569495,Q9Z7A3,Y803_CHLPN,MAAKTKTLELEDNVFLLLEGNLKRIFATPIGYTTFREFQNVVFNCA...,"HELIX 12..25; /evidence=""ECO:0007829|PDB:3Q9D""...",,"STRAND 100..111; /evidence=""ECO:0007829|PDB:3Q...",3Q9D;
569541,Q9ZB78,Y218A_MYCGE,MVNNEYQQLNTLVESDDEADLVIANLVKQLNELKQILVSLDNQEAS...,"HELIX 70..86; /evidence=""ECO:0007829|PDB:4XNG""...",,,4XNG;


In [66]:
# Download and parse the PDB headers for every PDB ID listed in `dataset`.
# Each distinct ID is fetched over the network, so this cell can take a while;
# IDs whose header cannot be fetched are reported on stdout and skipped.
parsed_df = extract_pdb_info(dataset)
parsed_df

Failed to fetch header for PDB ID 4V6C
Failed to fetch header for PDB ID 7A5A
Failed to fetch header for PDB ID 6FTG
Failed to fetch header for PDB ID 6NDK
Failed to fetch header for PDB ID 6UU6
Failed to fetch header for PDB ID 6OM6
Failed to fetch header for PDB ID 7PNT
Failed to fetch header for PDB ID 6OG7
Failed to fetch header for PDB ID 7VAO
Failed to fetch header for PDB ID 5APO
Failed to fetch header for PDB ID 4BTS
Failed to fetch header for PDB ID 4U55
Failed to fetch header for PDB ID 7Q18
Failed to fetch header for PDB ID 5LZD
Failed to fetch header for PDB ID 6RXV
Failed to fetch header for PDB ID 5UYN
Failed to fetch header for PDB ID 4V68
Failed to fetch header for PDB ID 8FOM
Failed to fetch header for PDB ID 4V7Q
Failed to fetch header for PDB ID 7PIC
Failed to fetch header for PDB ID 4V7O
Failed to fetch header for PDB ID 7OA4
Failed to fetch header for PDB ID 5VFU
Failed to fetch header for PDB ID 7KZV
Failed to fetch header for PDB ID 4V7C
Failed to fetch header fo

Unnamed: 0,Method,Resolution,Entry,DBREF
0,X-RAY DIFFRACTION,2.16,A0A009IHW8,
1,ELECTRON MICROSCOPY,2.74,A0A009IHW8,
2,X-RAY DIFFRACTION,2.00,A0A023I7E1,
3,X-RAY DIFFRACTION,2.30,A0A023I7E1,
4,X-RAY DIFFRACTION,2.70,A0A023I7E1,
...,...,...,...,...
212298,X-RAY DIFFRACTION,2.00,Q9X1Q6,[X=1-192]
212299,X-RAY DIFFRACTION,1.80,Q9YC08,[A=1-197]
212300,X-RAY DIFFRACTION,2.00,Q9Z7A3,[A/B=1-184]
212301,X-RAY DIFFRACTION,3.00,Q9ZB78,[A/B/C/D=63-204]


In [67]:
# Persist the parsed table as CSV in the working directory.
parsed_df.to_csv('./uniprotkb_reviewed (PDB PARSED).csv')