In [None]:
# Mount Google Drive at /content/drive so later cells can read inputs from,
# and write outputs to, paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.1 MB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m2.5/3.1 MB[0m [31m36.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


### Parsing PDB data for protein structure information

Instructions for below code :

Input -
    1. Enter the path to the text file containing PDB IDs. The text file should list the PDB IDs that the user wants to parse, one per line.
    2. Enter the path to the folder where the output will be saved.
Output - 
    1. PDB and mmcif files of entered PDB IDs will be downloaded from RCSB PDB website directly.
    2. Interatomic Euclidean distance matrices of uniprots present in protein complexes (generated by parsing PDB data). The distance matrices will be saved in the output path (from Input - 2.).
    3. A CSV file named "master_distances.csv" will be generated by parsing the PDB data. It contains the following columns, which are used to determine the positive and negative PPI datasets:
        a. PDB ID
        b. Uniprot Pair IDs
        c. Chain Pairs
        d. Chain Lengths of Chain Pairs
        e. Gold Data Set Chain Pair
        f. Gold Dataset Distance Value (Inter-atomic Euclidean Distance < 5 Å)
        g. Silver Data Set Chain Pair
        h. Silver Dataset Distance Value (5 Å ≤ Inter-atomic Euclidean Distance ≤ 10 Å)
        i. Non-Interacting Protein Chain Pairs
        j. Non-Interacting Dataset Distance Value (Inter-atomic Euclidean Distance > 10 Å)
    4. Statistics on the distance values, printed to the cell output, showing the following:
        a. PDB ID
        b. Uniprot IDs
        c. Chain Combinations
        d. Chain Lengths
        e. Min Distance
        f. Max Distance
        g. Avg Distance
        h. Std Distance

In [None]:
import os
import csv
import json
import urllib.request
import numpy as np
from Bio.PDB import MMCIFParser, PDBParser, PPBuilder
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from itertools import combinations
import warnings


warnings.filterwarnings("ignore", category=PDBConstructionWarning)

def download_pdb(pdb_id, output_folder):
    """Return the local path of ``<pdb_id>.pdb``, downloading it from RCSB if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.
        output_folder: Directory in which the file is cached.

    Returns:
        ``os.path.join(output_folder, f"{pdb_id}.pdb")``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises is propagated.
    """
    pdb_filename = os.path.join(output_folder, f"{pdb_id}.pdb")
    if not os.path.exists(pdb_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a valid cached copy.
        partial_filename = pdb_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, pdb_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
    return pdb_filename

def download_mmCIF(pdb_id, output_folder):
    """Return the local path of ``<pdb_id>.cif``, downloading it from RCSB if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.
        output_folder: Directory in which the file is cached.

    Returns:
        ``os.path.join(output_folder, f"{pdb_id}.cif")``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises is propagated.
    """
    mmcif_filename = os.path.join(output_folder, f"{pdb_id}.cif")
    if not os.path.exists(mmcif_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        # Temp-name-then-rename: a failed transfer must not leave a truncated
        # file behind, because existing files are reused without re-checking.
        partial_filename = mmcif_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, mmcif_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
    return mmcif_filename

def calculate_distance(coord1, coord2):
    """Return the Euclidean norm of the difference ``coord1 - coord2``."""
    displacement = coord1 - coord2
    return np.linalg.norm(displacement)

def generate_distance_matrix(chain1, chain2):
    """Return the matrix of pairwise Euclidean distances between two coordinate sets.

    Args:
        chain1: Sequence of ``n`` coordinate vectors (array of shape ``(n, d)``
            or a list of length-``d`` arrays).
        chain2: Sequence of ``m`` coordinate vectors with the same ``d``.

    Returns:
        A float array of shape ``(n, m)`` whose ``[i, j]`` entry is the distance
        between ``chain1[i]`` and ``chain2[j]``. If either input is empty the
        array is empty (shape ``(n, m)`` with ``n == 0`` or ``m == 0``).
    """
    first = np.asarray(chain1)
    second = np.asarray(chain2)
    distance_matrix = np.zeros((len(first), len(second)))
    if first.size == 0 or second.size == 0:
        return distance_matrix
    # One vectorised norm per row instead of one Python call per atom pair;
    # working row by row avoids materialising an (n, m, d) temporary.
    for i, coord1 in enumerate(first):
        distance_matrix[i] = np.linalg.norm(second - coord1, axis=1)
    return distance_matrix

def save_distance_matrix_to_csv(filename, matrix):
    """Write ``matrix`` to ``filename`` as comma-separated text using ``np.savetxt``."""
    np.savetxt(fname=filename, X=matrix, delimiter=",")

def get_uniprot(pdb_id, output_folder):
    """Read database accessions and chain (strand) IDs from an entry's mmCIF file.

    The mmCIF file is obtained through ``download_mmCIF`` and parsed with
    ``MMCIF2Dict``.

    Args:
        pdb_id: PDB identifier of the entry.
        output_folder: Folder in which the mmCIF file is cached.

    Returns:
        A tuple ``(uniprot_ids, strand_id)`` holding the values stored under
        ``_struct_ref.pdbx_db_accession`` and ``_entity_poly.pdbx_strand_id``;
        each defaults to ``[]`` when the key is missing.

    NOTE(review): callers pair these two lists positionally, but they come from
    different mmCIF categories -- confirm they line up for entries that carry
    several database references per entity.
    """
    # The previous version also built an MMCIFParser that was never used.
    mmcif_dict = MMCIF2Dict(download_mmCIF(pdb_id, output_folder))
    uniprot_ids = mmcif_dict.get("_struct_ref.pdbx_db_accession", [])
    strand_id = mmcif_dict.get("_entity_poly.pdbx_strand_id", [])
    return uniprot_ids, strand_id

def get_chain_count_and_names(pdb_id, master_csv_filename, output_folder, processed_ids_filename):
    """Classify every chain pair of a PDB entry and append the rows to the master CSV.

    For each pair of chain IDs listed in the entry's mmCIF metadata, the
    C-alpha coordinates of both chains (model 0 of the PDB file) are compared
    and the minimum inter-chain distance decides the category:

    * ``< 5``           -> "Gold Data Set"
    * ``5 <= d <= 10``  -> "Silver Data Set"
    * otherwise         -> "Non-interacting pairs"

    A pair is skipped when either chain has fewer than 16 atoms, when no
    C-alpha distance can be computed, or when fewer than two accessions are
    associated with the pair.

    Args:
        pdb_id: PDB identifier of the entry.
        master_csv_filename: CSV file to append to; a header is written when
            the file does not exist yet or is empty.
        output_folder: Folder used to cache the downloaded PDB/mmCIF files.
        processed_ids_filename: File passed to ``record_processed_id`` once the
            CSV has been written.

    Raises:
        Errors from downloading/parsing are propagated, as is the lookup error
        raised when a chain ID from the mmCIF metadata is missing from model 0
        of the PDB file.
    """
    # Fixed column list: deriving it from the last row dict failed with an
    # UnboundLocalError whenever an entry produced no rows at all.
    fieldnames = [
        "PDB ID",
        "Uniprot pair IDs",
        "Chain pairs",
        "Chain lengths of chain pairs",
        "Gold Data Set",
        "Gold Dataset Distance Value",
        "Silver Data Set",
        "Silver Dataset Distance Value",
        "Non-interacting pairs",
        "Non-Interacting Dataset Distance Value",
    ]

    uniprot_ids, strand_id = get_uniprot(pdb_id, output_folder)

    chains_by_uniprot = {}
    for uniprot_id, strand_ids in zip(uniprot_ids, strand_id):
        chains_by_uniprot.setdefault(uniprot_id, []).extend(strand_ids.split(","))

    # Sorted so that the pair order (and therefore which chain is "first")
    # is reproducible; plain set iteration order varies between runs.
    all_chains = sorted({chain for chains in chains_by_uniprot.values() for chain in chains})
    chain_combinations = list(combinations(all_chains, 2))

    # Download and parse the structure once instead of once per chain pair.
    structure = None
    if chain_combinations:
        pdb_filename = download_pdb(pdb_id, output_folder)
        structure = PDBParser(QUIET=True).get_structure('structure', pdb_filename)

    csv_data = []
    for chain1_id, chain2_id in chain_combinations:
        pair_uniprot_ids = [
            uniprot_id
            for uniprot_id, chains in chains_by_uniprot.items()
            if chain1_id in chains or chain2_id in chains
        ]

        chain1 = structure[0][chain1_id]
        chain2 = structure[0][chain2_id]

        if len(list(chain1.get_atoms())) < 16 or len(list(chain2.get_atoms())) < 16:
            print(f"Chains {chain1_id} or {chain2_id} in PDB ID {pdb_id} have insufficient length.")
            continue

        chain1_coords = np.array([atom.get_coord() for atom in chain1.get_atoms() if atom.name == 'CA'])
        chain2_coords = np.array([atom.get_coord() for atom in chain2.get_atoms() if atom.name == 'CA'])

        distances = generate_distance_matrix(chain1_coords, chain2_coords)
        if distances.size == 0 or len(pair_uniprot_ids) < 2:
            continue

        min_distance = np.min(distances)
        pair_label = f"{chain1_id} & {chain2_id}"
        is_gold = min_distance < 5
        is_silver = (not is_gold) and min_distance <= 10
        is_non_interacting = not (is_gold or is_silver)

        csv_data.append({
            # The PDB ID labels the first row that is actually written; tying it
            # to the first combination lost the ID whenever that pair was skipped.
            "PDB ID": pdb_id if not csv_data else "",
            "Uniprot pair IDs": f"{pair_uniprot_ids[0]}, {pair_uniprot_ids[1]}",
            "Chain pairs": f"{chain1_id}, {chain2_id}",
            "Chain lengths of chain pairs": f"{len(chain1_coords)}, {len(chain2_coords)}",
            "Gold Data Set": pair_label if is_gold else "",
            "Gold Dataset Distance Value": min_distance if is_gold else None,
            "Silver Data Set": pair_label if is_silver else "",
            "Silver Dataset Distance Value": min_distance if is_silver else None,
            "Non-interacting pairs": pair_label if is_non_interacting else "",
            "Non-Interacting Dataset Distance Value": min_distance if is_non_interacting else None,
        })

    # Also write the header into a file that exists but is still empty.
    needs_header = (
        not os.path.exists(master_csv_filename)
        or os.path.getsize(master_csv_filename) == 0
    )

    with open(master_csv_filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(csv_data)

    record_processed_id(pdb_id, processed_ids_filename)

def generate_distance_statistics(pdb_id, output_folder):
    """Print distance statistics for every chain pair and save each distance matrix.

    For each pair of chain IDs listed in the entry's mmCIF metadata, the
    C-alpha distance matrix (model 0 of the PDB file) is computed, its
    min/max/mean/std are printed as one tab-separated row, and the matrix is
    written to ``<output_folder>/<pdb_id>_<uniprot ids>_<chains>_distances.csv``.
    Pairs in which either chain has no C-alpha atom are reported and skipped.

    Args:
        pdb_id: PDB identifier of the entry.
        output_folder: Folder used for cached downloads and for the CSV output.
    """
    uniprot_ids, strand_id = get_uniprot(pdb_id, output_folder)

    chains_by_uniprot = {}
    for uniprot_id, strand_ids in zip(uniprot_ids, strand_id):
        chains_by_uniprot.setdefault(uniprot_id, []).extend(strand_ids.split(","))

    # Sorted for a reproducible pair order; set iteration order is not stable
    # between runs, which made row order and file names vary.
    all_chains = sorted({chain for chains in chains_by_uniprot.values() for chain in chains})
    chain_combinations = list(combinations(all_chains, 2))

    column_widths = {
        "PDB_ID": 12,
        "Uniprot_IDs": 40,
        "Chain_Combinations": 25,
        "Chain_Lengths": 25,
        "Min_Distance": 18,
        "Max_Distance": 18,
        "Avg_Distance": 18,
        "Std_Distance": 18,
    }
    header_titles = [
        ("PDB_ID", "PDB ID"),
        ("Uniprot_IDs", "Uniprot IDs"),
        ("Chain_Combinations", "Chain Combinations"),
        ("Chain_Lengths", "Chain Lengths"),
        ("Min_Distance", "Min Distance"),
        ("Max_Distance", "Max Distance"),
        ("Avg_Distance", "Avg Distance"),
        ("Std_Distance", "Std Distance"),
    ]
    print("\t".join(title.ljust(column_widths[key]) for key, title in header_titles))

    # Download and parse the structure once instead of once per chain pair.
    structure = None
    if chain_combinations:
        pdb_filename = download_pdb(pdb_id, output_folder)
        structure = PDBParser(QUIET=True).get_structure('structure', pdb_filename)

    pdb_id_printed = False
    for chain1_id, chain2_id in chain_combinations:
        pair_uniprot_ids = [
            uniprot_id
            for uniprot_id, chains in chains_by_uniprot.items()
            if chain1_id in chains or chain2_id in chains
        ]

        chain1_coords = np.array([atom.get_coord() for atom in structure[0][chain1_id].get_atoms() if atom.name == 'CA'])
        chain2_coords = np.array([atom.get_coord() for atom in structure[0][chain2_id].get_atoms() if atom.name == 'CA'])

        if len(chain1_coords) == 0 or len(chain2_coords) == 0:
            print(f"Chains {chain1_id} or {chain2_id} in PDB ID {pdb_id} have insufficient length.")
            continue

        # Both chains have at least one C-alpha here, so the matrix is non-empty.
        distances = generate_distance_matrix(chain1_coords, chain2_coords)

        # The PDB ID labels the first row actually printed for this entry.
        pdb_id_label = "" if pdb_id_printed else pdb_id
        pdb_id_printed = True

        chain_lengths = f"{len(chain1_coords)}, {len(chain2_coords)}"
        chain_combination_str = f"{chain1_id}, {chain2_id}"
        uniprot_ids_str = ", ".join(pair_uniprot_ids)
        print(
            f"{pdb_id_label.ljust(column_widths['PDB_ID'])}\t"
            f"{uniprot_ids_str.ljust(column_widths['Uniprot_IDs'])}\t"
            f"{chain_combination_str.ljust(column_widths['Chain_Combinations'])}\t"
            f"{chain_lengths.ljust(column_widths['Chain_Lengths'])}\t"
            f"{np.min(distances)}\t{np.max(distances)}\t{np.mean(distances)}\t{np.std(distances)}"
        )

        output_filename = os.path.join(
            output_folder,
            f"{pdb_id}_{uniprot_ids_str.replace(' ', '_')}_{chain_combination_str.replace(' ', '_')}_distances.csv",
        )
        save_distance_matrix_to_csv(output_filename, distances)

def record_processed_id(pdb_id, processed_ids_filename):
    """Append ``pdb_id`` to the processed-IDs file unless it is already listed.

    The function is called more than once for the same ID during a run, so it
    is idempotent: an ID that already occupies a line in the file is not
    written again.

    Args:
        pdb_id: Identifier to record.
        processed_ids_filename: Text file holding one processed ID per line;
            created if it does not exist.
    """
    if os.path.exists(processed_ids_filename):
        with open(processed_ids_filename, 'r') as file:
            if pdb_id in file.read().splitlines():
                return
    with open(processed_ids_filename, 'a') as file:
        file.write(f"{pdb_id}\n")

def process_pdb_ids(input_file_path, master_csv_filename, output_folder, processed_ids_filename):
    """Run the distance pipeline for every not-yet-processed PDB ID in a text file.

    IDs already listed in ``processed_ids_filename`` are skipped. Each remaining
    ID is passed to ``get_chain_count_and_names`` and
    ``generate_distance_statistics``; any exception is reported and the ID is
    skipped. The ID is recorded as processed in either case.

    Args:
        input_file_path: Text file with one PDB ID per line. Surrounding
            whitespace is ignored and blank lines are skipped.
        master_csv_filename: Master CSV passed to ``get_chain_count_and_names``.
        output_folder: Folder for downloads and generated files.
        processed_ids_filename: File that tracks processed IDs across runs.
    """
    processed_ids = set()

    if os.path.exists(processed_ids_filename):
        with open(processed_ids_filename, 'r') as file:
            processed_ids = {line.strip() for line in file if line.strip()}

    # Blank lines would otherwise be treated as an (empty) PDB ID.
    with open(input_file_path, "r") as pdb_id_file:
        pdb_ids = [line.strip() for line in pdb_id_file if line.strip()]

    for pdb_id in pdb_ids:
        if pdb_id in processed_ids:
            continue

        print(f"\nProcessing PDB ID: {pdb_id}")

        try:
            get_chain_count_and_names(pdb_id, master_csv_filename, output_folder, processed_ids_filename)
            generate_distance_statistics(pdb_id, output_folder)
        except Exception as e:
            # Deliberately broad: one bad entry must not abort the whole batch.
            print(f"An error occurred while processing PDB ID: {pdb_id}. Error: {str(e)}")
            print(f"Skipping PDB ID: {pdb_id}")

        record_processed_id(pdb_id, processed_ids_filename)
        # Remember the ID for this run too, so a duplicate line in the input
        # file does not produce duplicate rows in the master CSV.
        processed_ids.add(pdb_id)

if __name__ == "__main__":
    # Prompt for the two paths; stray whitespace from copy-paste is removed.
    input_file_path = input("Enter the path to the text file containing PDB IDs: ").strip()
    output_folder = input("Enter the path to the folder where the output will be saved: ").strip()

    # Create the output folder up front: downloads, the master CSV and the
    # processed-IDs file are all written into it. An empty answer means the
    # current working directory, which needs no creation.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    master_csv_filename = os.path.join(output_folder, "master_distances.csv")
    processed_ids_filename = os.path.join(output_folder, "processed_ids.txt")

    process_pdb_ids(input_file_path, master_csv_filename, output_folder, processed_ids_filename)


Enter the path to the text file containing PDB IDs: /content/drive/My Drive/MTech_Thesis/18.4.2024/Input/set1.txt
Enter the path to the folder where the output will be saved: /content/drive/My Drive/MTech_Thesis/18.4.2024/Output/

Processing PDB ID: 6RWH
PDB ID      	Uniprot IDs                             	Chain Combinations       	Chain Lengths            	Min Distance      	Max Distance      	Avg Distance      	Std Distance      
6RWH        	P31947, P04637                          	A, P                     	228, 12                  	4.08633279800415	43.520816802978516	21.37800337801203	6.798706920049141

Processing PDB ID: 6S53
PDB ID      	Uniprot IDs                             	Chain Combinations       	Chain Lengths            	Min Distance      	Max Distance      	Avg Distance      	Std Distance      
6S53        	P61088, P19474                          	H, E                     	76, 148                  	37.87306594848633	99.932861328125	69.06601490506407	10.750264543720437
 

### Parsing PDB data for protein sequence information

Instructions for below code :

Input-
Replace the text file path in the code with the path to your own file containing the PDB IDs whose distance information has already been generated by the code above. The line to edit is marked with a comment.

Output-
The following will be printed to the cell output:
    1. PDB ID
    2. Chain_name
    3. UniProt ID
    4. Sequence 

In [None]:
import os
import urllib.request
from Bio.PDB import PDBParser, PPBuilder
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.PDBExceptions import PDBConstructionWarning
import warnings


warnings.filterwarnings("ignore", category=PDBConstructionWarning)

def download_pdb(pdb_id):
    """Return the path of ``<pdb_id>.pdb`` in the working directory, downloading it if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.

    Returns:
        The file name ``f"{pdb_id}.pdb"``, or ``None`` when RCSB answers with
        HTTP 404 for this ID.

    Raises:
        urllib.error.HTTPError: For HTTP errors other than 404. Other errors
            raised by ``urlretrieve`` are propagated as well.
    """
    pdb_filename = f"{pdb_id}.pdb"
    if os.path.exists(pdb_filename):
        return pdb_filename

    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    # Download under a temporary name and rename on success so a broken
    # transfer is never picked up later as a valid cached file.
    partial_filename = pdb_filename + ".part"
    try:
        urllib.request.urlretrieve(url, partial_filename)
        os.replace(partial_filename, pdb_filename)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f"PDB ID {pdb_id} not found. Skipping...")
            return None
        raise
    finally:
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
    return pdb_filename

def get_uniprot_ids(pdb_id):
    """Return the database accessions listed in an entry's mmCIF file.

    The file ``<pdb_id>.cif`` is looked up in the working directory and
    downloaded from RCSB when missing.

    Args:
        pdb_id: PDB identifier of the entry.

    Returns:
        The value stored under ``_struct_ref.pdbx_db_accession`` as parsed by
        ``MMCIF2Dict``, or ``[]`` when the key is missing.
    """
    mmcif_filename = f"{pdb_id}.cif"

    if not os.path.exists(mmcif_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        # Temp-name-then-rename keeps a failed transfer out of the cache.
        partial_filename = mmcif_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, mmcif_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)

    # Only the metadata dictionary is needed; the previous version also built
    # the full structure with MMCIFParser and then discarded it.
    mmcif_dict = MMCIF2Dict(mmcif_filename)
    return mmcif_dict.get("_struct_ref.pdbx_db_accession", [])

def get_chain_sequences(pdb_id):
    """Return the peptide sequence and database accession of each chain of an entry.

    Sequences are built with ``PPBuilder`` from the first model of the PDB
    file. Accessions are looked up per chain from the mmCIF
    ``_struct_ref_seq`` category (``pdbx_strand_id`` -> ``pdbx_db_accession``).
    Handing the entry-wide accession list out by position, as before, labelled
    chains with the wrong accession whenever chain order and reference order
    differ, and consumed the list again for every additional model.

    Args:
        pdb_id: PDB identifier of the entry.

    Returns:
        ``{chain_id: {'sequence': str, 'uniprot_id': str}}`` for every chain
        that yields at least one polypeptide; ``{}`` if the PDB file could not
        be downloaded. ``uniprot_id`` is ``""`` when no accession is known.
    """
    pdb_filename = download_pdb(pdb_id)
    if pdb_filename is None:
        return {}

    structure = PDBParser(QUIET=True).get_structure(pdb_id, pdb_filename)
    ppb = PPBuilder()

    # get_uniprot_ids() also makes sure "<pdb_id>.cif" exists locally.
    entry_uniprot_ids = list(get_uniprot_ids(pdb_id))
    mmcif_dict = MMCIF2Dict(f"{pdb_id}.cif")
    strand_ids = mmcif_dict.get("_struct_ref_seq.pdbx_strand_id", [])
    accessions = mmcif_dict.get("_struct_ref_seq.pdbx_db_accession", [])
    # Guard against parsers that return a bare string for single-row categories.
    if isinstance(strand_ids, str):
        strand_ids = [strand_ids]
    if isinstance(accessions, str):
        accessions = [accessions]

    # A chain may list several references; the first one wins.
    uniprot_by_chain = {}
    for strand, accession in zip(strand_ids, accessions):
        uniprot_by_chain.setdefault(strand, accession)

    # Fallback for files without the per-chain category: positional assignment
    # from the entry-wide list (not guaranteed to match the chain).
    positional_ids = iter(entry_uniprot_ids)

    chain_sequences = {}
    first_model = next(iter(structure), None)
    if first_model is None:
        return chain_sequences

    for chain in first_model:
        polypeptides = ppb.build_peptides(chain)
        if not polypeptides:
            continue

        protein_sequence = "".join(str(pp.get_sequence()) for pp in polypeptides)

        if uniprot_by_chain:
            uniprot_id = uniprot_by_chain.get(chain.id, "")
        else:
            uniprot_id = next(positional_ids, "")

        chain_sequences[chain.id] = {'sequence': protein_sequence, 'uniprot_id': uniprot_id}

    return chain_sequences


def read_pdb_ids_from_file(file_path):
    """Read PDB IDs from a text file containing one ID per line.

    Args:
        file_path: Path of the text file.

    Returns:
        The IDs in file order, with surrounding whitespace removed. Blank lines
        are skipped so they are not treated as an (empty) PDB ID.
    """
    with open(file_path, 'r') as file:
        return [line.strip() for line in file if line.strip()]


input_file_path = "/content/drive/My Drive/MTech_Thesis/18.4.2024/Input/set1.txt"  # Replace with your file path

# Load the IDs, then print chain ID, accession and sequence for every chain.
pdb_ids = read_pdb_ids_from_file(input_file_path)

for pdb_id in pdb_ids:
    print(f"Sequences for PDB ID: {pdb_id}")
    chain_sequences = get_chain_sequences(pdb_id)
    for chain_id in chain_sequences:
        sequence_info = chain_sequences[chain_id]
        # One print call emitting the same four lines (the last one empty).
        print(
            f"Chain {chain_id}:",
            f"UniProt ID: {sequence_info['uniprot_id']}",
            f"Sequence: {sequence_info['sequence']}",
            "",
            sep="\n",
        )


Sequences for PDB ID: 6RWH
Chain A:
UniProt ID: P31947
Sequence: GAMGSMERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSVAYKNVVGGQRAAWRVLSSIEQKSNGPEVREYREKVETELQGVCDTVLGLLDSHLIKEAGDAESRVFYLKMKGDYYRYLAEVATDKKRIIDSARSAYQEAMDISKKEMPPTNPIRLGLALNFSVFHYEIANSPEEAISLAKTTFDEAMADLHTLSEDSYKDSTLIMQLLRDNLTLWT

Chain P:
UniProt ID: P04637
Sequence: KLMFKEGPDSD

Sequences for PDB ID: 6S53
Chain E:
UniProt ID: P61088
Sequence: PRRIIKETQRLLAEPVPGIKAEPDESNARYFHVVIAGPQDSPFEGGTFKLELFLPEEYPMAAPKVRFMTKIYHPNVDKLGRIKLDILADKWSPALQIRTVLLSIQALLSAPNPDDPLANDVAEQWKTNEAQAIETARAWTRLYAMNNI

Chain F:
UniProt ID: P0CG48
Sequence: MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG

Chain C:
UniProt ID: P19474
Sequence: LPRRIIKETQRLLAEPVPGIKAEPDESNARYFHVVIAGPQDSPFEGGTFKLELFLPEEYPMAAPKVRFMTKIYHPNVDKLGRIKLDILADKWSPALQIRTVLLSIQALLSAPNPDDPLANDVAEQWKTNEAQAIETARAWTRLYAMNNI

Chain D:
UniProt ID: 
Sequence: MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG

Chain B:
UniProt ID: 
Seq

In [None]:
pip install seaborn matplotlib

### Distance matrices of protein information are normalized within value [0,1] and heatmaps of normalized distance matrices generated with following colour code: 
    Summary
        Lower Distances (closer interactions):
        Normalized Values: Close to 0
        Color: Green

        Higher Distances (farther interactions):
        Normalized Values: Close to 1
        Color: Red

        Intermediate Distances:
        Normalized Values: Between 0 and 1
        Color: Gradient from Green to Yellow to Red

Instructions for below code:

Input -
    1. Replace the path within the code with the path to the text file containing PDB IDs. The text file should list the PDB IDs that the user wants to parse, one per line.
    2. Replace the path within code to the folder where the output will be saved.

Output -
    In the user defined output path, separate folders generated for each of the PDB IDs that are given as input.
        Each folder contains :
            1. Inter-atomic Distance Matrix of specific protein chains within PDB ID
            2. Their Normalized Distance Matrices 
            3. Their Normalized Heatmaps with colour coded as explained in previous markdown cell.

In [None]:
import os
import csv
import json
import urllib.request
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from Bio.PDB import MMCIFParser, PDBParser, PPBuilder
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from itertools import combinations
import warnings


warnings.filterwarnings("ignore", category=PDBConstructionWarning)

def download_pdb(pdb_id, output_folder):
    """Return the local path of ``<pdb_id>.pdb``, downloading it from RCSB if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.
        output_folder: Directory in which the file is cached.

    Returns:
        ``os.path.join(output_folder, f"{pdb_id}.pdb")``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises is propagated.
    """
    pdb_filename = os.path.join(output_folder, f"{pdb_id}.pdb")
    if not os.path.exists(pdb_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        # Download under a temporary name and rename only on success, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a valid cached copy.
        partial_filename = pdb_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, pdb_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
    return pdb_filename

def download_mmCIF(pdb_id, output_folder):
    """Return the local path of ``<pdb_id>.cif``, downloading it from RCSB if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.
        output_folder: Directory in which the file is cached.

    Returns:
        ``os.path.join(output_folder, f"{pdb_id}.cif")``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises is propagated.
    """
    mmcif_filename = os.path.join(output_folder, f"{pdb_id}.cif")
    if not os.path.exists(mmcif_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        # Temp-name-then-rename: a failed transfer must not leave a truncated
        # file behind, because existing files are reused without re-checking.
        partial_filename = mmcif_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, mmcif_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
    return mmcif_filename

def calculate_distance(coord1, coord2):
    """Return the Euclidean norm of the difference ``coord1 - coord2``."""
    offset = coord1 - coord2
    return np.linalg.norm(offset)

def generate_distance_matrix(chain1, chain2):
    """Return the matrix of pairwise Euclidean distances between two coordinate sets.

    Args:
        chain1: Sequence of ``n`` coordinate vectors (array of shape ``(n, d)``
            or a list of length-``d`` arrays).
        chain2: Sequence of ``m`` coordinate vectors with the same ``d``.

    Returns:
        A float array of shape ``(n, m)`` whose ``[i, j]`` entry is the distance
        between ``chain1[i]`` and ``chain2[j]``. If either input is empty the
        array is empty (shape ``(n, m)`` with ``n == 0`` or ``m == 0``).
    """
    first = np.asarray(chain1)
    second = np.asarray(chain2)
    distance_matrix = np.zeros((len(first), len(second)))
    if first.size == 0 or second.size == 0:
        return distance_matrix
    # One vectorised norm per row instead of one Python call per atom pair;
    # working row by row avoids materialising an (n, m, d) temporary.
    for i, coord1 in enumerate(first):
        distance_matrix[i] = np.linalg.norm(second - coord1, axis=1)
    return distance_matrix

def min_max_normalize(matrix):
    """Rescale ``matrix`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        matrix: Array-like of numbers.

    Returns:
        ``(matrix - min) / (max - min)``. When all entries are equal the range
        is zero and an all-zero array of the same shape is returned instead of
        dividing by zero; an empty input is returned as an empty array.
    """
    matrix = np.asarray(matrix)
    if matrix.size == 0:
        return matrix.astype(float)
    min_val = np.min(matrix)
    max_val = np.max(matrix)
    value_range = max_val - min_val
    if value_range == 0:
        # Constant matrix: every entry is the minimum, so everything maps to 0.
        return np.zeros(matrix.shape, dtype=float)
    return (matrix - min_val) / value_range

def save_distance_matrix_to_csv(filename, matrix):
    """Write ``matrix`` to ``filename`` as comma-separated text using ``np.savetxt``."""
    np.savetxt(fname=filename, X=matrix, delimiter=",")

def save_normalized_distance_matrix_to_csv(filename, matrix):
    """Write the normalized ``matrix`` to ``filename`` as comma-separated text."""
    np.savetxt(fname=filename, X=matrix, delimiter=",")

def generate_heatmap(matrix, output_filename, uniprot_ids, chain1_id, chain2_id, vmin=None, vmax=None, pdb_id=None):
    """Save a heatmap of a (normalized) distance matrix as an image file.

    The reversed ``RdYlGn`` colormap is used, so low values are drawn green and
    high values red.

    Args:
        matrix: 2-D array to plot. Rows are expected to correspond to
            ``chain1_id`` and columns to ``chain2_id``, as produced by
            ``generate_distance_matrix(chain1, chain2)``.
        output_filename: Path of the image file to write.
        uniprot_ids: Sequence whose first two items label the first and second
            chain respectively.
        chain1_id: ID of the chain along the rows.
        chain2_id: ID of the chain along the columns.
        vmin: Lower colour limit forwarded to ``sns.heatmap``.
        vmax: Upper colour limit forwarded to ``sns.heatmap``.
        pdb_id: PDB ID shown in the title. When omitted, the module-level name
            ``pdb_id`` is used if it exists (the behaviour existing callers
            rely on); otherwise the title carries no PDB ID.
    """
    if pdb_id is None:
        # Legacy fallback: this function used to read the global loop variable.
        pdb_id = globals().get("pdb_id", "")

    pair_label = f"{uniprot_ids[0]}_{chain1_id}_{uniprot_ids[1]}_{chain2_id}"
    title_label = f"{pdb_id}_{pair_label}" if pdb_id else pair_label

    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, cmap="RdYlGn_r", cbar=True, vmin=vmin, vmax=vmax)
    plt.title(f"Normalized Distance Matrix Heatmap: {title_label}")
    # sns.heatmap draws matrix columns along x and rows along y, so the second
    # chain labels the x axis and the first chain the y axis.
    plt.xlabel(f"{uniprot_ids[1]}_{chain2_id}")
    plt.ylabel(f"{uniprot_ids[0]}_{chain1_id}")
    plt.savefig(output_filename)
    plt.close(fig)

def get_uniprot(pdb_id, output_folder):
    """Read database accessions and chain (strand) IDs from an entry's mmCIF file.

    The mmCIF file is obtained through ``download_mmCIF`` and parsed with
    ``MMCIF2Dict``.

    Args:
        pdb_id: PDB identifier of the entry.
        output_folder: Folder in which the mmCIF file is cached.

    Returns:
        A tuple ``(uniprot_ids, strand_id)`` holding the values stored under
        ``_struct_ref.pdbx_db_accession`` and ``_entity_poly.pdbx_strand_id``;
        each defaults to ``[]`` when the key is missing.

    NOTE(review): callers pair these two lists positionally, but they come from
    different mmCIF categories -- confirm they line up for entries that carry
    several database references per entity.
    """
    # The previous version also built an MMCIFParser that was never used.
    mmcif_dict = MMCIF2Dict(download_mmCIF(pdb_id, output_folder))
    uniprot_ids = mmcif_dict.get("_struct_ref.pdbx_db_accession", [])
    strand_id = mmcif_dict.get("_entity_poly.pdbx_strand_id", [])
    return uniprot_ids, strand_id

def generate_distance_statistics(pdb_id, output_folder):
    """Write raw and normalized distance matrices plus a heatmap for every chain pair.

    For each pair of chain IDs listed in the entry's mmCIF metadata, the
    C-alpha distance matrix (model 0 of the PDB file) is computed and three
    files are written to ``<output_folder>/<pdb_id>/``: the raw matrix, the
    min-max normalized matrix and a heatmap of the normalized matrix. Pairs in
    which either chain has no C-alpha atom are reported and skipped.

    File names and plot labels use the accession associated with each of the
    two chains. Labelling every pair with the first two accessions of the
    whole entry, as before, mislabelled pairs and raised an IndexError for
    entries with fewer than two accessions.

    Args:
        pdb_id: PDB identifier of the entry.
        output_folder: Folder used for cached downloads and as the parent of
            the per-entry output folder.
    """
    uniprot_ids, strand_id = get_uniprot(pdb_id, output_folder)

    chains_by_uniprot = {}
    for uniprot_id, strand_ids in zip(uniprot_ids, strand_id):
        chains_by_uniprot.setdefault(uniprot_id, []).extend(strand_ids.split(","))

    # Reverse lookup chain -> accession; the first accession listing a chain wins.
    uniprot_by_chain = {}
    for uniprot_id, chains in chains_by_uniprot.items():
        for chain in chains:
            uniprot_by_chain.setdefault(chain, uniprot_id)

    # Sorted for a reproducible pair order; set iteration order is not stable
    # between runs, which made file names vary.
    chain_combinations = list(combinations(sorted(uniprot_by_chain), 2))
    if not chain_combinations:
        return

    # Download and parse the structure once instead of once per chain pair.
    pdb_filename = download_pdb(pdb_id, output_folder)
    structure = PDBParser(QUIET=True).get_structure('structure', pdb_filename)

    pdb_output_folder = os.path.join(output_folder, pdb_id)

    for chain1_id, chain2_id in chain_combinations:
        chain1_coords = np.array([atom.get_coord() for atom in structure[0][chain1_id].get_atoms() if atom.name == 'CA'])
        chain2_coords = np.array([atom.get_coord() for atom in structure[0][chain2_id].get_atoms() if atom.name == 'CA'])

        if len(chain1_coords) == 0 or len(chain2_coords) == 0:
            print(f"Chains {chain1_id} or {chain2_id} in PDB ID {pdb_id} have insufficient length.")
            continue

        # Both chains have at least one C-alpha here, so the matrix is non-empty.
        distances = generate_distance_matrix(chain1_coords, chain2_coords)

        pair_uniprot_ids = [uniprot_by_chain[chain1_id], uniprot_by_chain[chain2_id]]
        file_stem = f"{pdb_id}_{pair_uniprot_ids[0]}_{chain1_id}_{pair_uniprot_ids[1]}_{chain2_id}"

        os.makedirs(pdb_output_folder, exist_ok=True)

        output_filename = os.path.join(pdb_output_folder, f"{file_stem}_distances.csv")
        save_distance_matrix_to_csv(output_filename, distances)

        normalized_distances = min_max_normalize(distances)
        normalized_output_filename = os.path.join(pdb_output_folder, f"{file_stem}_normalized_distances.csv")
        save_normalized_distance_matrix_to_csv(normalized_output_filename, normalized_distances)

        heatmap_output_filename = os.path.join(pdb_output_folder, f"{file_stem}_normalized_heatmap.png")
        generate_heatmap(normalized_distances, heatmap_output_filename, pair_uniprot_ids, chain1_id, chain2_id, vmin=0, vmax=1)


input_path = "/content/drive/My Drive/MTech_Thesis/15.3.2024/Input/PDBids.txt"  # Path to the text file listing the PDB IDs; replace with your own file path


with open(input_path, 'r') as f:
    pdb_ids = f.read().splitlines()

# Specify the output folder
output_folder = "/content/drive/My Drive/MTech_Thesis/15.3.2024/BulkOutput/"


# NOTE: generate_heatmap() refers to the module-level name `pdb_id` when
# building the plot title, so keep this loop variable's name unchanged.
for pdb_id in pdb_ids:
    generate_distance_statistics(pdb_id, output_folder)


### Code for Heatmap generation from pairwise inter-atomic Euclidean distance matrices of PDB IDs without normalization

Instructions for below code:

Input -
    The user enters the PDB IDs one by one when prompted. [Note - In the code below, replace the "1" in `range(1)` with the number of PDB IDs you want to enter. The line is marked with a comment.]

Output - 
    Generates and saves heatmaps for each pair of chains, showing the distances with a color gradient from green (short distances) to red (long distances).


In [None]:
import os
import csv
import urllib.request
import numpy as np
from Bio.PDB import PDBParser, MMCIFParser, PPBuilder
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from itertools import combinations
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


warnings.filterwarnings("ignore", category=PDBConstructionWarning)

def download_pdb(pdb_id):
    """Return the path of ``<pdb_id>.pdb`` in the working directory, downloading it if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.

    Returns:
        The file name ``f"{pdb_id}.pdb"``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises is propagated.
    """
    pdb_filename = f"{pdb_id}.pdb"
    if not os.path.exists(pdb_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
        # Download under a temporary name and rename on success so a broken
        # transfer is never picked up later as a valid cached file.
        partial_filename = pdb_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, pdb_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
    return pdb_filename

def download_mmCIF(pdb_id):
    """Return the path of ``<pdb_id>.cif`` in the working directory, downloading it if absent.

    Args:
        pdb_id: PDB identifier; used to build both the URL and the file name.

    Returns:
        The file name ``f"{pdb_id}.cif"``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises is propagated.
    """
    mmcif_filename = f"{pdb_id}.cif"
    if not os.path.exists(mmcif_filename):
        url = f"https://files.rcsb.org/download/{pdb_id}.cif"
        # Temp-name-then-rename keeps a failed transfer out of the cache.
        partial_filename = mmcif_filename + ".part"
        try:
            urllib.request.urlretrieve(url, partial_filename)
            os.replace(partial_filename, mmcif_filename)
        finally:
            if os.path.exists(partial_filename):
                os.remove(partial_filename)
    return mmcif_filename

def get_uniprot(pdb_id):
    """Return the database accessions listed in an entry's mmCIF file.

    The mmCIF file is obtained through ``download_mmCIF`` and parsed with
    ``MMCIF2Dict``.

    Args:
        pdb_id: PDB identifier of the entry.

    Returns:
        The value stored under ``_struct_ref.pdbx_db_accession``, or ``[]``
        when the key is missing.
    """
    # The previous version also created an MMCIFParser and read the strand IDs,
    # neither of which was used.
    mmcif_dict = MMCIF2Dict(download_mmCIF(pdb_id))
    return mmcif_dict.get("_struct_ref.pdbx_db_accession", [])

def get_protein_details(pdb_id):
    """Collect per-chain details for an entry and generate its pairwise heatmaps.

    For every chain of the first model that yields at least one polypeptide,
    the sequence and the C-alpha coordinates are collected. The entry's
    accessions are printed, and ``generate_distance_matrices_combinations`` is
    called with the collected coordinates.

    Args:
        pdb_id: PDB identifier of the entry.

    Returns:
        A list with one dict per peptide chain (keys ``PDB_ID``, ``Chain_ID``,
        ``Uniprot_ID``, ``Sequence``, ``Sequence_Length``). Earlier versions
        built this list but returned ``None``.
    """
    pdb_filename = download_pdb(pdb_id)
    structure = PDBParser(QUIET=True).get_structure(pdb_id, pdb_filename)

    # Read the accessions once. Doing it inside the chain loop re-parsed the
    # mmCIF file for every chain and left the name unbound (UnboundLocalError
    # below) when the entry had no peptide chain at all.
    uniprot_ids = get_uniprot(pdb_id)

    ppb = PPBuilder()
    protein_chains = []
    alpha_carbon_coordinates = {}

    # Only the first model is used: iterating every model re-did the work per
    # model and kept only the last model's coordinates for each chain ID.
    first_model = next(iter(structure), None)
    chains = first_model if first_model is not None else ()

    for chain in chains:
        polypeptides = ppb.build_peptides(chain)
        if not polypeptides:
            continue

        chain_id = chain.id
        protein_sequence = "".join(str(pp.get_sequence()) for pp in polypeptides)

        protein_chains.append({
            "PDB_ID": pdb_id,
            "Chain_ID": chain_id,
            "Uniprot_ID": uniprot_ids,
            "Sequence": protein_sequence,
            "Sequence_Length": len(protein_sequence),
        })

        alpha_carbon_coordinates[chain_id] = [
            atom.get_coord()
            for residue in chain
            for atom in residue
            if atom.get_name() == "CA"
        ]

    for uniprot_id in uniprot_ids:
        print("Uniprot - " + uniprot_id)

    generate_distance_matrices_combinations(pdb_id, alpha_carbon_coordinates, uniprot_ids)
    return protein_chains

def generate_distance_matrices_combinations(pdb_id, alpha_carbon_coordinates, uniprot_ids):
    """Save a heatmap of the C-alpha distance matrix for every pair of chains.

    Files are written to the working directory as
    ``<pdb_id>_<id0>_<chain1>_<id1>_<chain2>_heatmap.png``, where ``id0`` and
    ``id1`` are the first two items of ``uniprot_ids``.

    Args:
        pdb_id: PDB identifier, used in titles and file names.
        alpha_carbon_coordinates: Mapping of chain ID to a sequence of C-alpha
            coordinate vectors.
        uniprot_ids: Accessions of the entry; missing items are replaced by
            ``"unknown"`` instead of raising an IndexError.
    """
    def generate_distance_matrix(chain1_coords, chain2_coords):
        # Pairwise Euclidean distances; rows follow chain1, columns chain2.
        first = np.asarray(chain1_coords)
        second = np.asarray(chain2_coords)
        distance_matrix = np.zeros((len(first), len(second)))
        if first.size == 0 or second.size == 0:
            return distance_matrix
        # One vectorised norm per row instead of one call per atom pair.
        for i, coord1 in enumerate(first):
            distance_matrix[i] = np.linalg.norm(second - coord1, axis=1)
        return distance_matrix

    padded_ids = list(uniprot_ids[:2]) + ["unknown", "unknown"]
    first_uniprot, second_uniprot = padded_ids[0], padded_ids[1]

    for chain1_id, chain2_id in combinations(list(alpha_carbon_coordinates), 2):
        distance_matrix = generate_distance_matrix(
            alpha_carbon_coordinates[chain1_id],
            alpha_carbon_coordinates[chain2_id],
        )
        pair_label = f"{pdb_id}_{first_uniprot}_{chain1_id}_{second_uniprot}_{chain2_id}"

        fig = plt.figure(figsize=(10, 8))
        # Reversed palette: short distances green, long distances red, as
        # described in the notebook text.
        sns.heatmap(distance_matrix, cmap="RdYlGn_r", cbar=False)
        plt.title(f"Distance Matrix Heatmap: {pair_label}")
        # sns.heatmap draws matrix columns (chain 2) along x and rows (chain 1) along y.
        plt.xlabel(f"Chain {chain2_id}")
        plt.ylabel(f"Chain {chain1_id}")

        heatmap_filename = f"{pair_label}_heatmap.png"
        plt.savefig(heatmap_filename)
        plt.close(fig)



# Ask for the PDB IDs one at a time, then generate the heatmaps for each.
pdb_ids = [
    input(f"Enter PDB ID {i+1}: ")
    for i in range(1)  #Replace 1 with number of PDB IDs entered in text file input
]

for pdb_id in pdb_ids:
    get_protein_details(pdb_id)
