In [8]:
import csv
import re

# Input CSV read with csv.DictReader; its 'UniProt Accession' and 'PDB ID'
# columns are used to build the accession -> PDB ID lookup.
uniprot_csv_file_path = '/Volumes/dax-hd/project-data/search-files/uniprot-data.csv'
# Input text file of '//'-separated entries whose ID, DE and DR lines are parsed.
# NOTE(review): the layout looks like the ExPASy ENZYME flat file - confirm.
enzyme_info_txt_file_path = '/Volumes/dax-hd/project-data/search-files/enzyme_dat.txt'
# Destination CSV that receives the matched rows (overwritten on each run).
output_csv_file_path = '/Volumes/dax-hd/project-data/search-files/enzyme-id-data.csv'


def load_uniprot_pdb_mapping(file_path):
    """Build a UniProt-accession -> PDB-ID lookup from a CSV file.

    The file must have a header row containing the columns
    'UniProt Accession' and 'PDB ID'; any other columns are ignored.
    When an accession appears on several rows, the PDB ID from the last
    such row is the one kept.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A dict mapping each accession string to a PDB ID string.

    Raises:
        KeyError: If a data row lacks one of the two required columns.
    """
    with open(file_path, mode='r', newline='', encoding='utf-8') as handle:
        return {
            record['UniProt Accession']: record['PDB ID']
            for record in csv.DictReader(handle)
        }

def process_text_file(file_path, uniprot_pdb_mapping):
    """Collect enzyme rows for every listed accession that has a PDB mapping.

    The file is treated as a sequence of entries, each terminated by a line
    consisting solely of '//'. Within an entry the following line types are
    read (the two-letter tag must start the line):

    * ``ID`` - the four-part numeric EC number.
    * ``DE`` - the enzyme name; consecutive DE lines are joined with a
      single space and the text up to the final period is used.
    * ``DR`` - cross-references; every word immediately followed by a comma
      is taken as a UniProt accession.

    Entries missing any of the three line types (or whose name contains no
    period) are skipped, as are accessions absent from the mapping.

    Args:
        file_path: Path of the UTF-8 encoded text file to parse.
        uniprot_pdb_mapping: Dict mapping UniProt accession -> PDB ID.

    Returns:
        A list of dicts with the keys 'PDB ID', 'UniProt Accession',
        'EC ID' and 'Enzyme Name', in file order.
    """
    # Only a line that is exactly '//' ends an entry. Splitting on every
    # '//' substring would cut an entry in two whenever a comment holds a
    # URL, and both halves would then be discarded as incomplete.
    terminator_re = re.compile(r'^//[ \t]*$', re.MULTILINE)
    # Tags are anchored to the line start so that tag-like text inside other
    # lines cannot be mistaken for an ID/DE/DR line.
    ec_id_re = re.compile(r'^ID[ \t]+(\d+\.\d+\.\d+\.\d+)', re.MULTILINE)
    de_line_re = re.compile(r'^DE[ \t]+(.+)', re.MULTILINE)
    dr_line_re = re.compile(r'^DR[ \t]+(.+)', re.MULTILINE)
    name_re = re.compile(r'(.+)\.')
    accession_re = re.compile(r'(\w+),')

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    data = []
    for entry in terminator_re.split(content):
        ec_id_search = ec_id_re.search(entry)
        de_lines = de_line_re.findall(entry)
        dr_lines_search = dr_line_re.findall(entry)
        if not (ec_id_search and de_lines and dr_lines_search):
            continue

        # A long name may be wrapped over several DE lines; rebuild it before
        # trimming the trailing period so the whole name is kept.
        # NOTE(review): wrapped fragments are joined with one space - confirm
        # this matches the wrapping convention of the source file.
        full_name = ' '.join(part.strip() for part in de_lines)
        enzyme_name_search = name_re.search(full_name)
        if not enzyme_name_search:
            continue

        ec_id = ec_id_search.group(1)
        enzyme_name = enzyme_name_search.group(1)
        dr_lines = ' '.join(dr_lines_search)

        for uniprot_id in accession_re.findall(dr_lines):
            if uniprot_id in uniprot_pdb_mapping:
                data.append({
                    'PDB ID': uniprot_pdb_mapping[uniprot_id],
                    'UniProt Accession': uniprot_id,
                    'EC ID': ec_id,
                    'Enzyme Name': enzyme_name
                })

    return data

def write_to_csv(file_path, data):
    """Write the matched rows to a CSV file, replacing any existing file.

    A header row is always written, followed by one line per item in
    ``data``.

    Args:
        file_path: Destination path; the file is written as UTF-8.
        data: Iterable of dicts keyed by 'PDB ID', 'UniProt Accession',
            'EC ID' and 'Enzyme Name'.
    """
    columns = ['PDB ID', 'UniProt Accession', 'EC ID', 'Enzyme Name']
    with open(file_path, mode='w', newline='', encoding='utf-8') as out_handle:
        csv_writer = csv.DictWriter(out_handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(data)

# Pipeline: build the accession -> PDB ID lookup, parse the enzyme text file
# against it, then write the matched rows out as CSV.
uniprot_pdb_mapping = load_uniprot_pdb_mapping(uniprot_csv_file_path)
data = process_text_file(enzyme_info_txt_file_path, uniprot_pdb_mapping)
write_to_csv(output_csv_file_path, data)

# Printed unconditionally once the write returns, even when `data` is empty.
print("The CSV file has been created with the matched entries, including PDB IDs.")

The CSV file has been created with the matched entries, including PDB IDs.
