In [1]:
import re
import requests
import pandas as pd
import numpy as np
import os

# Read the list of PDB codes from a text file (one code per line).
# Codes are lower-cased, and blank lines are dropped so they do not turn into
# requests for an empty PDB code or shift the batch numbering further down.
with open('output_file.txt', 'r') as file:
    pdb_codes = [code for code in (line.strip().lower() for line in file) if code]

# Three-letter residue names whose occurrences are counted in each report
keywords = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO',
            'SER', 'THR', 'TRP', 'TYR', 'VAL']

# One-row DataFrames collected for the batch currently being processed
dfs = []

# ---------------------------------------------------------------------------
# Fetch one report per (PDB code, ligtype) from the GetLigInt endpoint, parse
# it into a one-row DataFrame, and write the collected rows to a CSV file
# every BATCH_SIZE PDB codes (plus once more for the final partial batch).
# ---------------------------------------------------------------------------

# Number of PDB codes processed between CSV checkpoints
BATCH_SIZE = 500

# Seconds to wait for the server before a request is abandoned
REQUEST_TIMEOUT = 60

# URL template; filled with the PDB code and the two-digit ligtype number
LIGINT_URL = ("http://www.ebi.ac.uk/thornton-srv/databases/cgi-bin/pdbsum/GetLigInt.pl"
              "?pdb={}&ligtype={:02d}&ligno=01&metal=TRUE")

# Patterns are compiled once here instead of on every request
_HBOND_COUNT_RE = re.compile(r'Number of hydrogen bonds:\s+(\d+)')
_NONBONDED_COUNT_RE = re.compile(r'Number of non-bonded contacts:\s+(\d+)')
_DECIMAL_RE = re.compile(r'\d+\.\d+')
# A single letter preceded by three whitespace characters and followed by
# optional whitespace and ":-"
_CHAIN_RE = re.compile(r'\s{3}([A-Za-z])\s*:-')


def _token_after(text, label):
    """Return the text between *label* and the next space, or None if *label* is absent."""
    position = text.find(label)
    if position == -1:
        return None
    start = position + len(label)
    end = text.find(" ", start)
    if end == -1:
        end = len(text)
    return text[start:end].strip()


def _count_after(pattern, text):
    """Return the integer captured by *pattern* in *text*, or None when it does not match."""
    match = pattern.search(text)
    return int(match.group(1)) if match else None


def _section(text, start_marker, end_marker):
    """Return *text* from *start_marker* up to *end_marker*.

    An absent start marker yields an empty string; an absent end marker
    extends the section to the end of *text*.
    """
    start = text.find(start_marker)
    if start == -1:
        return ""
    end = text.find(end_marker, start)
    return text[start:] if end == -1 else text[start:end]


def _distance_stats(section):
    """Return (mean, std, max, min) of the decimal numbers in *section*, or NaNs if there are none."""
    values = [float(number) for number in _DECIMAL_RE.findall(section)]
    if not values:
        return np.nan, np.nan, np.nan, np.nan
    return np.mean(values), np.std(values), np.max(values), np.min(values)


def _parse_ligand_report(text, requested_code, keyword_patterns):
    """Turn one report into a one-row DataFrame.

    Parameters:
        text: body of the HTTP response.
        requested_code: PDB code the report was requested for; used when the
            report itself does not carry a "PDB code: " label.
        keyword_patterns: mapping of keyword -> compiled pattern whose first
            group captures the number following that keyword.

    Returns:
        A one-row DataFrame, or None when the report contains neither a
        hydrogen-bond count nor a non-bonded-contact count.
    """
    num_hydrogen_bonds = _count_after(_HBOND_COUNT_RE, text)
    num_non_bonded_contacts = _count_after(_NONBONDED_COUNT_RE, text)
    if num_hydrogen_bonds is None and num_non_bonded_contacts is None:
        return None

    # Prefer the code printed in the report, but never fall back to garbage
    reported_code = _token_after(text, "PDB code: ") or requested_code

    metal = _token_after(text, "Metal  ")
    if metal is None:
        metal = np.nan

    avg_hb, std_hb, max_hb, min_hb = _distance_stats(
        _section(text, "Hydrogen bonds", "Non-bonded contacts"))
    avg_nb, std_nb, max_nb, min_nb = _distance_stats(
        _section(text, "Non-bonded contacts", "Metal close contacts"))

    # Sorted so the joined chain string is the same on every run
    chain_ids = sorted(set(_CHAIN_RE.findall(text)))
    chains = '_'.join(chain_ids) if chain_ids else np.nan

    row = {
        'PDB code': [reported_code.lower()],
        'Metal': [metal],
        'Chains': [chains],
        'Number of hydrogen bonds': [num_hydrogen_bonds],
        'Number of non-bonded contacts': [num_non_bonded_contacts],
        'Average Distance HB': [avg_hb],
        'Std Distance HB': [std_hb],
        'Max Distance HB': [max_hb],
        'Min Distance HB': [min_hb],
        'Average Distance NB': [avg_nb],
        'Std Distance NB': [std_nb],
        'Max Distance NB': [max_nb],
        'Min Distance NB': [min_nb],
    }

    # Count the distinct numbers that follow each keyword
    for keyword, pattern in keyword_patterns.items():
        row['Unique {} count'.format(keyword)] = [len(set(pattern.findall(text)))]

    return pd.DataFrame(row)


keyword_patterns = {keyword: re.compile(r'{}\s+(\d+)'.format(keyword)) for keyword in keywords}

# Column order of every CSV file
columns = [
    'PDB code', 'Metal', 'Chains', 'Number of hydrogen bonds', 'Number of non-bonded contacts',
    'Average Distance HB', 'Std Distance HB', 'Max Distance HB', 'Min Distance HB',
    'Average Distance NB', 'Std Distance NB', 'Max Distance NB', 'Min Distance NB'
] + ['Unique {} count'.format(keyword) for keyword in keywords]

# 1-based index of the first PDB code covered by the next CSV file
batch_start = 1

# Iterate through each PDB code
for i, pdb_code in enumerate(pdb_codes, start=1):
    ligtype = 1
    while True:
        # The URL is always built from the requested code, never from parsed text
        url = LIGINT_URL.format(pdb_code, ligtype)

        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One failed request should not abort the whole run; report it and
            # move on to the next PDB code.
            print(f'Warning: request failed for {pdb_code} ligtype {ligtype:02d}: {exc}')
            break

        df = _parse_ligand_report(response.text, pdb_code, keyword_patterns)
        if df is None:
            # Neither count is present: stop trying further ligtypes for this code
            break

        dfs.append(df)
        ligtype += 1

    # Check if it's time to write the data to a CSV file
    if i % BATCH_SIZE == 0 or i == len(pdb_codes):
        if dfs:
            # Concatenate the batch and round numeric values to 2 decimal places
            result_df = pd.concat(dfs, ignore_index=True).round(2)
            result_df = result_df[columns]
        else:
            # pd.concat rejects an empty list; still emit a header-only file
            result_df = pd.DataFrame(columns=columns)

        # Name the file after the range of PDB-code indices it actually covers
        filename = f'results_{batch_start}-{i}.csv'

        # Write the DataFrame to a CSV file
        result_df.to_csv(filename, index=False)
        print(f'Saved {filename}')

        # Start a fresh batch
        dfs = []
        batch_start = i + 1

Saved results_1-500.csv
Saved results_501-1000.csv
Saved results_1001-1500.csv
Saved results_1501-2000.csv
Saved results_2001-2500.csv
Saved results_2501-3000.csv
Saved results_3001-3500.csv
Saved results_3501-4000.csv
Saved results_4001-4500.csv
Saved results_4501-5000.csv
Saved results_5001-5500.csv
Saved results_5501-6000.csv
Saved results_6001-6500.csv
Saved results_6501-7000.csv
Saved results_7001-7500.csv
Saved results_7501-8000.csv
Saved results_8001-8500.csv
Saved results_8501-9000.csv
Saved results_9001-9500.csv
Saved results_9501-10000.csv
Saved results_10001-10500.csv
Saved results_10501-11000.csv
Saved results_11001-11500.csv
Saved results_11501-12000.csv
Saved results_12001-12500.csv
Saved results_12501-13000.csv
Saved results_13001-13500.csv
Saved results_13501-14000.csv
Saved results_14001-14500.csv
Saved results_14501-15000.csv
Saved results_15001-15500.csv
Saved results_15501-16000.csv
Saved results_16001-16500.csv
Saved results_16501-17000.csv
Saved results_17001-17500.csv

Saved results_135001-135500.csv
Saved results_135501-136000.csv
Saved results_136001-136500.csv
Saved results_136501-137000.csv
Saved results_137001-137500.csv
Saved results_137501-138000.csv
Saved results_138001-138500.csv
Saved results_138501-139000.csv
Saved results_139001-139500.csv
Saved results_139501-140000.csv
Saved results_140001-140500.csv
Saved results_140501-141000.csv
Saved results_141001-141500.csv
Saved results_141501-142000.csv
Saved results_142001-142500.csv
Saved results_142501-143000.csv
Saved results_143001-143500.csv
Saved results_143501-144000.csv
Saved results_144001-144500.csv
Saved results_144501-145000.csv
Saved results_145001-145500.csv
Saved results_145501-146000.csv
Saved results_146001-146500.csv
Saved results_146501-147000.csv
Saved results_147001-147500.csv
Saved results_147501-148000.csv
Saved results_148001-148500.csv
Saved results_148501-149000.csv
Saved results_149001-149500.csv
Saved results_149501-150000.csv
Saved results_150001-150500.csv
Saved re