In [1]:
import json
import os
from collections import Counter

import h5py
import numpy as np
import pandas as pd

# Output locations for the files this notebook exports.
SAVE_PATH = "../tmp/data"
SAVE_PATH_STRUCTURES = SAVE_PATH + "/mdcath_pdb_structures"

# Create both output directories up front so later cells can write into them.
for _output_dir in (SAVE_PATH, SAVE_PATH_STRUCTURES):
    os.makedirs(_output_dir, exist_ok=True)

# The 20 one-letter amino-acid codes in alphabetical order; used as the column
# header line of the exported profile files.
AA_ALPHABET = list("ACDEFGHIKLMNPQRSTVWY")

In [None]:
# Peek into the dynamics-profile file: first key, then the attribute names,
# member names and stored sequence of one example entry.
with h5py.File(
    "/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_dynamics_profiles.h5", "r"
) as dynamics_h5:
    print(list(dynamics_h5.keys())[0])
    example_entry = dynamics_h5["12asA00"]
    print(list(example_entry.attrs))
    print(list(example_entry.keys()))
    print(example_entry.attrs["sequence"])
    # Disabled: export of all amino-acid sequences as FASTA.
    # with open(f"{SAVE_PATH}/mdcath_sequence_aa.fasta", "w") as f:
    #     f.write("\n".join(([f">{dynamics_h5[item].attrs['name']}\n{dynamics_h5[item].attrs['sequence']}" for item in list(dynamics_h5.keys())])))


In [None]:
# Inspect the 3Di token file, then export one FASTA record per entry of the
# "foldseek" group: the header is the entry's `name` attribute, the sequence is
# the first element of its "base" dataset decoded as UTF-8.
with h5py.File("/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_tokens.h5", "r") as mdcath_3di_structures:
    print(list(mdcath_3di_structures.keys())[0])
    foldseek_group = mdcath_3di_structures["foldseek"]
    print(list(foldseek_group.attrs))
    print(list(foldseek_group.keys())[0])
    example_entry = foldseek_group["12asA00"]
    print(list(example_entry.attrs))
    print(list(example_entry.keys()))
    print(example_entry["base"][()][0].decode("utf-8"))
    # Write below SAVE_PATH (same location as before) instead of repeating the
    # literal directory, so the export follows the notebook-wide setting.
    with open(f"{SAVE_PATH}/mdcath_sequence_3Di.fasta", "w") as f:
        for item in list(foldseek_group.keys()):
            entry = foldseek_group[item]
            tokens = entry["base"][()][0].decode("utf-8")
            # Every record, including the last one, ends with a newline so the
            # file is a well-formed text file and can be concatenated safely.
            f.write(f">{entry.attrs['name']}\n{tokens}\n")


In [None]:
# Report the number of entries in the dynamics-profile file and write the
# `structure` attribute of selected entries to SAVE_PATH_STRUCTURES.
with h5py.File(
    "/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_dynamics_profiles.h5", "r"
) as dynamics_h5:
    entry_names = list(dynamics_h5.keys())
    print(entry_names[0])
    print(len(entry_names))
    example_entry = dynamics_h5["12asA00"]
    print(list(example_entry.attrs))
    print(list(example_entry.keys()))
    # print(dynamics_h5['12asA00'].attrs['structure'])

    # Only a single entry is exported here; loop over
    # list(dynamics_h5.keys()) instead to export every entry.
    for item in ["3uzoA01"]:
        structure = dynamics_h5[item].attrs["structure"]
        print(structure)
        with open(f"{SAVE_PATH_STRUCTURES}/{item}.pdbzip", "w") as f:
            f.write(structure)


In [None]:
# Directory that receives one `<name>.tsv` profile file per exported entry.
PSSM_SAVE_DIR = "../tmp/data/generated_pssms/mdCATH_profiles_320_0/"
os.makedirs(name=PSSM_SAVE_DIR, exist_ok=True)


def pssm_to_csv(name, pssm, save_dir=None, alphabet=None):
    """Write one profile matrix to ``<save_dir>/<name>.tsv`` as space-separated text.

    Despite the function name, the file has a ``.tsv`` extension and uses a
    single space as the field separator. The file layout is:

    1. ``Query profile of sequence <name>``
    2. the alphabet letters, space-padded
    3. one line per row of ``pssm``: float values with four decimals joined by
       a single space, each line terminated by a space and a newline.

    Args:
        name: Entry name; used in the first line and as the file stem.
        pssm: Two-dimensional data accepted by ``pd.DataFrame``.
        save_dir: Output directory. Defaults to the module-level
            ``PSSM_SAVE_DIR``.
        alphabet: Letters written on the column header line. Defaults to the
            module-level ``AA_ALPHABET``.
    """
    if save_dir is None:
        save_dir = PSSM_SAVE_DIR
    if alphabet is None:
        alphabet = AA_ALPHABET

    # Build the header outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    column_header = "      ".join(alphabet)
    # Values are rounded first and then formatted, exactly as before.
    df_pssm = pd.DataFrame(pssm).round(4)
    with open(f"{save_dir}/{name}.tsv", "w") as f:
        f.write(f"Query profile of sequence {name}\n")
        f.write(f"     {column_header}      \n")
        df_pssm.to_csv(f, index=False, sep=" ", float_format="%.4f", header=False, lineterminator=" \n")


# Export the "320_0" member of every entry in the dynamics-profile file via
# pssm_to_csv, after printing a short overview of the file.
with h5py.File(
    "/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_dynamics_profiles.h5", "r"
) as dynamics_h5:
    print(list(dynamics_h5.keys())[0])
    example_entry = dynamics_h5["12asA00"]
    print(list(example_entry.attrs))
    print(list(example_entry.keys()))
    for name in list(dynamics_h5.keys()):
        profile = dynamics_h5[name]["320_0"]
        pssm_to_csv(name, profile)


In [None]:
# Collect the entry names of the dynamics-profile file for the comparison with
# the token file in a later cell.
with h5py.File(
    "/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_dynamics_profiles.h5", "r"
) as dynamics_h5:
    mdCATH_3Di_dynamics_keys = [*dynamics_h5.keys()]

# The key list is a plain Python list, so it can be reported after the file is closed.
print(len(mdCATH_3Di_dynamics_keys))
print(mdCATH_3Di_dynamics_keys[:10])

In [None]:
# Collect the entry names stored under the "foldseek" group of the token file.
with h5py.File("/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_tokens.h5", "r") as tokens_h5:
    mdCATH_3Di_tokens_keys = [*tokens_h5["foldseek"].keys()]

# The key list is a plain Python list, so it can be reported after the file is closed.
print(len(mdCATH_3Di_tokens_keys))
print(mdCATH_3Di_tokens_keys[:10])

In [None]:
# Entries present in the dynamics-profile file but absent from the token file.
set(mdCATH_3Di_dynamics_keys).difference(mdCATH_3Di_tokens_keys)

In [None]:
# Build the expected per-entry mdCATH dataset path for every entry name in the
# dynamics-profile file and store the list as JSON.
with h5py.File(
    "/mnt/gondolin/data/datasets_created/bachelor_thesis/mdCATH/mdCATH_3Di_dynamics_profiles.h5", "r"
) as dynamics_h5:
    entry_names = list(dynamics_h5.keys())

filepaths = [f"/mnt/gondolin/data/datasets/mdCATH/data/mdcath_dataset_{name}.h5" for name in entry_names]

with open("../tmp/data/mdcath_dataset_filepaths.json", "w") as f:
    json.dump(filepaths, f)


In [None]:
# List the exported structure files.
# `os` is imported in the notebook's first cell, so it is not re-imported here.
# The directory is taken from SAVE_PATH_STRUCTURES (same value as the former
# literal) so that this cell reads from where the export cell writes.
pdb_structures_dir = SAVE_PATH_STRUCTURES
# os.listdir returns entries in arbitrary order; sorting makes the preview and
# every later display of the list reproducible between runs.
pdb_filenames = sorted(os.listdir(pdb_structures_dir))
print(f"Found {len(pdb_filenames)} PDB files")
print("First 10 files:", pdb_filenames[:10])


In [18]:
# Normalise every file name to lower case (re-running this cell is harmless).
pdb_filenames = list(map(str.lower, pdb_filenames))

In [None]:
# Display the complete list of file names (lower-cased by the previous cell).
pdb_filenames

In [None]:
# The first 7 characters of each file name are used as its id; this matches the
# length of entry names such as "12asA00".
pdb_ids = [x[:7] for x in pdb_filenames]
# Count all ids in a single pass instead of calling list.count once per unique
# id, which is quadratic in the number of files.
id_counts = Counter(pdb_ids)
# Duplicates are reported in order of first occurrence.
duplicates = [pdb_id for pdb_id, occurrences in id_counts.items() if occurrences > 1]
print(f"Found {len(duplicates)} duplicate PDB IDs: {duplicates}")