In [None]:
import torch

# Pick the compute device: the MPS backend when torch reports it as
# available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Bare expression so the notebook displays the selected device.
device

In [None]:
# One-letter codes of the 20 amino-acid residues (these are residues, not
# proteins), listed in alphabetical order of the one-letter code.
PROTEIN_RESIDUE_TABLE = [
    "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
    "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
]

# Maps each three-letter residue name (e.g. "ALA") to its one-letter symbol
# (e.g. "A"); the values are exactly the entries of PROTEIN_RESIDUE_TABLE.
RESIDUE_SYMBOL_MAPPING = {
    "ALA": "A", "CYS": "C", "ASP": "D", "GLU": "E", "PHE": "F", "GLY": "G", "HIS": "H",
    "ILE": "I", "LYS": "K", "LEU": "L", "MET": "M", "ASN": "N", "PRO": "P", "GLN": "Q",
    "ARG": "R", "SER": "S", "THR": "T", "VAL": "V", "TRP": "W", "TYR": "Y"
}

# Per-residue feature vectors, keyed by one-letter residue symbol.
# Each entry holds 7 floats described by the author as physico-chemical
# properties; the meaning of the individual columns is not documented here.
# Author's note: intended for use as a residue embedding.
PCP_DICT = {
    "A": [0.62014, -0.18875, -1.2387, -0.083627, -1.3296, -1.3817, -0.44118],
    "C": [0.29007, -0.44041, -0.76847, -1.05, -0.4893, -0.77494, -1.1148],
    "D": [-0.9002, 1.5729, -0.89497, 1.7376, -0.72498, -0.50189, -0.91814],
    "E": [-0.74017, 1.5729, -0.28998, 1.4774, -0.25361, 0.094051, -0.4471],
    "F": [1.1903, -1.1954, 1.1812, -1.1615, 1.1707, 0.8872, 0.02584],
    "G": [0.48011, 0.062916, -1.9949, 0.25088, -1.8009, -2.0318, 2.2022],
    "H": [-0.40009, -0.18875, 0.17751, 0.77123, 0.5559, 0.44728, -0.71617],
    "I": [1.3803, -0.84308, 0.57625, -1.1615, 0.10503, -0.018637, -0.21903],
    "K": [-1.5003, 1.5729, 0.75499, 1.1057, 0.44318, 0.95221, -0.27937],
    "L": [1.0602, -0.84308, 0.57625, -1.273, 0.10503, 0.24358, 0.24301],
    "M": [0.64014, -0.59141, 0.59275, -0.97565, 0.46368, 0.46679, -0.51046],
    "N": [-0.78018, 1.0696, -0.38073, 1.2172, -0.42781, -0.35453, -0.46879],
    "P": [0.12003, 0.062916, -0.84272, -0.1208, -0.45855, -0.75977, 3.1323],
    "Q": [-0.85019, 0.16358, 0.22426, 0.8084, 0.04355, 0.24575, 0.20516],
    "R": [-2.5306, 1.5729, 0.89249, 0.8084, 1.181, 1.6067, 0.11866],
    "S": [-0.18004, 0.21392, -1.1892, 0.32522, -1.1656, -1.1282, -0.48056],
    "T": [-0.050011, -0.13842, -0.58422, 0.10221, -0.69424, -0.63625, -0.50017],
    "V": [1.0802, -0.69208, -0.028737, -0.90132, -0.36633, -0.3762, 0.32502],
    "W": [0.81018, -1.6484, 2.0062, -1.0872, 2.3901, 1.8299, 0.032377],
    "Y": [0.26006, -1.0947, 1.2307, -0.78981, 1.2527, 1.1906, -0.18876],
}

In [None]:
import numpy as np

# Load the saved feature lookup. The file stores a pickled Python object, so
# `allow_pickle=True` is required and `ftrs` comes back as a NumPy array
# wrapping that object (callers unwrap it with `ftrs.item()`).
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a trusted source.
ftrs = np.load("../human_features/pdb_to_seqvec_dict.npy", allow_pickle=True)

In [None]:
from torch_geometric.data import Dataset as Dataset_TG, Data
from torch.utils.data import Dataset as _
import os
import pathlib
import biographs
import networkx as nx


class ProteinDataset(Dataset_TG):
    """Dataset of per-protein residue graphs built from ``.pdb`` files.

    Every ``.pdb`` file found in ``<root>/raw`` is turned into one ``Data``
    object and saved as ``<root>/processed/<protein name>.pt``. Node features
    are looked up by protein name in the notebook-level ``ftrs`` array, and
    edges come from the ``biographs`` residue network of the structure.

    Items are read back from the processed ``.pt`` files on access, so the
    dataset also works when ``process()`` was skipped because the processed
    files already exist. Indexing and ``len()`` go through the base-class
    ``__getitem__`` / ``__len__``, which call ``get()`` / ``len()`` and apply
    ``transform`` when one is given.

    Args:
        root: Dataset root directory containing the ``raw`` sub-directory.
        transform: Optional callable applied to each ``Data`` object on access.
        pre_transform: Optional callable applied to each ``Data`` object once,
            before it is written to disk.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super(ProteinDataset, self).__init__(
            root, transform=transform, pre_transform=pre_transform
        )
        # Cache the processed paths once so that item access does not have to
        # re-list the raw directory on every call.
        self.data = self.processed_paths

    @property
    def raw_file_names(self):
        """Sorted bare file names of every entry in the raw directory.

        Plain names (not ``os.DirEntry`` objects or full paths) are returned
        because the base class joins them onto ``raw_dir`` itself; returning
        full relative paths would duplicate the directory prefix.
        """
        return sorted(os.listdir(self.raw_dir))

    @property
    def processed_file_names(self):
        """``<name>.pt`` for every raw ``.pdb`` file, in ``raw_file_names`` order.

        Only ``.pdb`` inputs are listed because ``process()`` writes output for
        those alone; listing other raw files would make the processed set look
        permanently incomplete.
        """
        return [
            os.path.splitext(name)[0] + ".pt"
            for name in self.raw_file_names
            if pathlib.Path(name).suffix == ".pdb"
        ]

    def process(self):
        """Build one graph per raw ``.pdb`` file and save it to ``processed_dir``.

        Raises:
            KeyError: If a protein name has no entry in the ``ftrs`` lookup.
        """
        self.data = self.processed_paths

        # `ftrs` is a NumPy array wrapping a dict; unwrap it once, not per file.
        features_by_protein = ftrs.item()

        data_list = []
        for filename in self.raw_paths:
            if pathlib.Path(filename).suffix != ".pdb":
                continue

            current_protein_name = os.path.splitext(os.path.basename(filename))[0]
            node_feats = torch.tensor(features_by_protein[current_protein_name])
            edge_index = self._get_edgeindex(filename)

            data = Data(x=node_feats, edge_index=edge_index)
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)

            torch.save(
                data,
                os.path.join(self.processed_dir, current_protein_name + ".pt"),
            )

        # Kept for backward compatibility with code that reads this attribute;
        # item access itself no longer depends on it.
        self.data_prot = data_list

    def len(self):
        """Number of graphs in the dataset (one per raw ``.pdb`` file)."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored at position ``idx``.

        The graph is read from its processed ``.pt`` file rather than from an
        in-memory list, so access works even if ``process()`` did not run in
        this session.
        """
        # weights_only=False: the files hold pickled `Data` objects written by
        # this class, which the restricted (weights-only) loader rejects.
        return torch.load(self.data[idx], weights_only=False)

    def _get_edgeindex(self, filename):
        """Return the edge index tensor for the structure in ``filename``.

        The result is a ``torch.long`` tensor with two rows: the row and column
        indices of every strictly positive entry of the adjacency matrix of the
        ``biographs`` network, in row-major order.
        """
        molecule = biographs.Pmolecule(filename)
        network = molecule.network()

        # `toarray()` yields a plain ndarray, so the non-zero scan can be done
        # once and unpacked directly into row/column index arrays.
        adjacency = nx.adjacency_matrix(network).toarray()
        rows, cols = np.nonzero(adjacency > 0)

        return torch.tensor(np.array([rows, cols]), dtype=torch.long)

In [None]:
# Instantiate the protein-graph dataset. The class reads raw structure files
# from the `raw` sub-directory of `root`.
prot_graphs = ProteinDataset(root="../datasets/from_repo/human_features/processed/")

## Updating the `.pt` Files to the Latest PyTorch Geometric Version

In [None]:
import os
import torch
from torch_geometric.data import Data


def bump(g):
    """Rebuild ``g`` as a new ``Data`` object from its instance attributes.

    Args:
        g: A loaded graph object whose instance dictionary holds the
            attributes to carry over.

    Returns:
        The ``Data`` object produced by ``Data.from_dict`` from those
        attributes.
    """
    attributes = vars(g)
    return Data.from_dict(attributes)


# Directory holding the existing `.pt` files, and the directory that receives
# the rebuilt copies (created if missing).
ROOT_DIR = "../datasets/from_repo/human_features/processed/"
BUMPED_DIR = "../datasets/processed/new_pts/"

os.makedirs(BUMPED_DIR, exist_ok=True)

# Sorted so the files are processed in a deterministic order.
for filename in sorted(os.listdir(ROOT_DIR)):
    file_path = os.path.join(ROOT_DIR, filename)

    # Only saved `.pt` files are candidates; sub-directories and other files
    # cannot be loaded and are skipped.
    if not (filename.endswith(".pt") and os.path.isfile(file_path)):
        continue

    try:
        # Loading sits inside the `try` so one unreadable file does not abort
        # the whole run.
        # NOTE(security): weights_only=False unpickles arbitrary objects, which
        # is required for pickled `Data` graphs; only run this on trusted files.
        old_data = torch.load(file_path, weights_only=False)
        new_data = bump(old_data)
    except Exception as err:
        # `Exception` (not a bare `except`) keeps Ctrl-C working, and the
        # error is reported instead of silently dropped.
        print(f"Failed to bump {filename}: {err!r}")
        continue

    new_path = os.path.join(BUMPED_DIR, filename)
    torch.save(new_data, new_path)