In [2]:
import pandas as pd
import os
import graphein
import pickle
# Root directory that is scanned for per-mutation folders of .pdb files (trailing '/' is relied on by later string concatenation)
path = '../datasets/ALPHAFOLD PDBs/'
import numpy as np
from graphein.protein.visualisation import plotly_protein_structure_graph
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from Bio.PDB import PDBParser   
from utils import *



In [1]:
import torch_geometric

## Create a DataFrame with the mutation name and the path to the .pdb file

In [4]:

# Collect the rank_001 .pdb file(s) found in every sub-folder of `path`.
# os.listdir() returns entries in arbitrary order, so both listings are sorted:
# this keeps the row order of the resulting table (and anything that later
# picks "the first file") identical across machines and re-runs.
pdb_files = []
for folder in sorted(os.listdir(path)):
    folder_path = path + folder
    # plain files sitting next to the folders are ignored
    if not os.path.isdir(folder_path):
        continue
    for file in sorted(os.listdir(folder_path)):
        if 'rank_001' in file and file.endswith('.pdb'):
            pdb_files.append(folder_path + '/' + file)



In [5]:
# The mutation label is the part of the parent folder name that precedes the first '_'
mutations = [pdb_path.split('/')[-2].split('_')[0] for pdb_path in pdb_files]

# Two structures are added by hand with an explicit mutation label.
# NOTE(review): the second label is 'L353Q' while its folder is named K353Q — confirm this is intended.
manual_entries = [
    ('His371Profs', '../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs_9b3dd_unrelaxed_rank_001_alphafold2_ptm_model_3_seed_000.pdb'),
    ('L353Q', '../datasets/ALPHAFOLD PDBs/K353Q_ed31e/K353Q_ed31e_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb'),
]
for label, pdb_path in manual_entries:
    pdb_files.append(pdb_path)
    mutations.append(label)

# One row per (mutation label, .pdb path) pair
df = pd.DataFrame({'mutation': mutations, 'pdb_file': pdb_files})

out_path = "../datasets/pdb_files.csv"



In [6]:
# Folder-derived labels that were cut at the first '_' (or lost a trailing '*')
# are mapped back to the full mutation names; only exact matches are replaced.
mutation_renames = {
    'Ala218': '(p.(Ala218_Asn219insLysIle))',
    'E168': 'E168*',
    'G115Mfs': 'G115Mfs*',
    'G372': 'G372_P373delinsA',
    'R321': 'R321*',
    'W60': 'W60*',
}
df['mutation'] = df['mutation'].replace(mutation_renames)



In [7]:
# Write the mutation / .pdb-path table to out_path, without the index column
df.to_csv(out_path, index=False)

## Parse the .pdb files

In [6]:
# Parse every listed .pdb file with Biopython; the mutation label is used both
# as the dictionary key and as the structure id.
parser = PDBParser()
structures = {
    mutation: parser.get_structure(mutation, pdb_path)
    for mutation, pdb_path in zip(df['mutation'], df['pdb_file'])
}
 

## Create graphs from the .pdb files

In [7]:


# Default Graphein protein-graph configuration; the dict view shows every setting in use
config = ProteinGraphConfig()
(config.dict())

{'granularity': 'CA',
 'keep_hets': [],
 'insertions': True,
 'alt_locs': 'max_occupancy',
 'pdb_dir': None,
 'verbose': False,
 'exclude_waters': True,
 'deprotonate': False,
 'protein_df_processing_functions': None,
 'edge_construction_functions': [<function graphein.protein.edges.distance.add_peptide_bonds(G: 'nx.Graph') -> 'nx.Graph'>],
 'node_metadata_functions': [<function graphein.protein.features.nodes.amino_acid.meiler_embedding(n: str, d: Dict[str, Any], return_array: bool = False) -> Union[pandas.core.series.Series, numpy.ndarray]>],
 'edge_metadata_functions': None,
 'graph_metadata_functions': None,
 'get_contacts_config': None,
 'dssp_config': None}

In [None]:
# Build one graph per mutation with the current config, printing each label as progress
graphs = {}
for mutation, pdb_path in zip(df['mutation'], df['pdb_file']):
    print(mutation)
    graphs[mutation] = construct_graph(path=pdb_path, config=config)

In [9]:
# Graph and parsed structure for the HGD_normal reference file
original_pdb = '../datasets/ALPHAFOLD PDBs/HGD_normal.pdb'
graph_original = graphein.protein.graphs.construct_graph(path=original_pdb, config=config)
structure_original = parser.get_structure('HGD_normal', original_pdb)

Output()

In [10]:
from graphein.protein.visualisation import plotly_protein_structure_graph

# Plotly rendering of the G161R graph: edges coloured by their 'kind', nodes by degree
p = plotly_protein_structure_graph(
    graphs['G161R'],
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [None]:
import networkx as nx
# Check whether the graphs of two different mutations (G161R and G170A) are isomorphic
G1 = graphs['G161R']
G2 = graphs['G170A']

nx.is_isomorphic(G1, G2)

In [None]:
# Display the parsed Biopython structure for G161R with nglview

import nglview as nv
view = nv.show_biopython(structures['G161R'])
view

In [None]:
# Save every graph to its own pickle file, named after its mutation label.
# `graphs` is a dict keyed by mutation label: iterating it directly yields only
# the keys, which would pickle the label string instead of the graph. Iterating
# over items() pickles the graph itself and takes the file name from the same
# key, so name and content cannot drift apart.
os.makedirs('../datasets/graphs/', exist_ok=True)
for mut, graph in graphs.items():
    with open(f'../datasets/graphs/graph_{mut}.gpickle', 'wb') as f:
        pickle.dump(graph, f)

## Associate to each patient its mutations

In [None]:
import pandas as pd
# Load the patient table from the Excel workbook
df_patients =pd.read_excel('../datasets/aku_prin_v2.0.xlsx')

In [None]:
# Keep only the two protein-change columns (the first column name really ends with a space).
# .copy() makes the result an independent frame: new columns are assigned to it
# later, which on a plain column slice raises pandas' SettingWithCopyWarning.
df_patients = df_patients[['Protein change allele 1 ', 'Protein change allele 2']].copy()

In [None]:
# For each allele column, look up the graph and the parsed structure of the
# listed mutation; mutations without an entry get None.
allele_columns = {
    'allele1': 'Protein change allele 1 ',
    'allele2': 'Protein change allele 2',
}
for prefix, lookup in (('graph', graphs), ('structure', structures)):
    for allele, column in allele_columns.items():
        df_patients[f'{prefix}_{allele}'] = [lookup.get(mut) for mut in df_patients[column]]

In [None]:
# Raise the column display limit to 500 so wide frames are not truncated, then show the table
pd.set_option('display.max_columns', 500)
df_patients

In [None]:
# NOTE(review): the graph_* / structure_* columns hold Python objects; a CSV presumably keeps only their text form — confirm this export is what is wanted
df_patients.to_csv('../datasets/aku_prin_v2.0_with_graphs.csv', index=False)



In [None]:
# Read the exported patient table back from the CSV file
df_patients_2 = pd.read_csv('../datasets/aku_prin_v2.0_with_graphs.csv')


In [None]:
# First 'graph_allele1' entry: value read back from the CSV vs. the in-memory original
graph_test = df_patients_2['graph_allele1'][0]
graph_test_og = df_patients['graph_allele1'][0]

In [None]:
# Type of the in-memory entry (the one that never went through the CSV)
type(graph_test_og)

In [None]:
import networkx as nx

def string_to_graph(input_string):
    """Build an undirected NetworkX graph from adjacency-list style text.

    Every non-blank line is split on whitespace: the first token is a node and
    each remaining token is a neighbour connected to it by an edge.

    Args:
        input_string: Text with one "node neighbour neighbour ..." entry per line.

    Returns:
        nx.Graph containing the listed nodes and edges. Empty input (or input
        made only of blank lines) yields an empty graph.
    """
    G = nx.Graph()

    for line in input_string.strip().split('\n'):
        tokens = line.split()
        # A blank line (or completely empty input) has no tokens; indexing the
        # first one used to raise IndexError, so such lines are skipped.
        if not tokens:
            continue

        node, neighbors = tokens[0], tokens[1:]

        # Add the node explicitly so isolated nodes are kept too
        G.add_node(node)
        for neighbor in neighbors:
            G.add_edge(node, neighbor)

    return G

In [None]:
# Type of the value read back from the CSV
type(graph_test)
# Parse that value into a networkx graph with string_to_graph
graph = string_to_graph(graph_test)
type(graph)

In [None]:
from torch_geometric.data import Data


In [None]:
from utils import create_graph_df
# NOTE(review): create_graph_df lives in the local utils module, which is not visible here — confirm what it returns
df_p = create_graph_df()

## Convert to pytorch geometric

In [8]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import *
from graphein.protein.graphs import construct_graph
from torch_geometric import nn
# Edge constructors passed to Graphein; the names come from the star import of
# graphein.protein.edges.distance in this cell.
edge_fns = [
    add_aromatic_interactions,
    add_hydrophobic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds
    ]
# Rebinds `config`: from here on it uses the edge constructors above instead of the defaults
config = ProteinGraphConfig(edge_construction_functions=edge_fns)

# g: graph of the first .pdb file listed in df; g2: graph of the HGD_normal reference file
g = construct_graph(config=config, path= df['pdb_file'][0])
g2 = construct_graph(config=config, path='../datasets/ALPHAFOLD PDBs/HGD_normal.pdb')

Output()

Output()

In [9]:
# Plot the graph built with the full list of edge constructors. It contains
# several edge kinds besides peptide bonds (edges are coloured by kind), so the
# title must not call it a "peptide backbone graph".
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Residue interaction graph. Nodes coloured by degree.",
    node_size_multiplier=1,
)
p.show()

In [10]:
# Check whether g (first listed .pdb file) and g2 (HGD_normal) are isomorphic
import networkx as nx

nx.is_isomorphic(g, g2)

False

In [8]:
# Inspect every edge with its attribute dict (a 'kind' set and a 'distance' value)
g.edges(data=True)

EdgeDataView([('A:MET:1', 'A:PHE:193', {'kind': {'hydrophobic'}, 'distance': 12.366443708681976}), ('A:MET:1', 'A:LEU:164', {'kind': {'hydrophobic'}, 'distance': 10.518538586704903}), ('A:MET:1', 'A:ALA:2', {'kind': {'peptide_bond'}, 'distance': 3.880878379954722}), ('A:ALA:2', 'A:GLU:3', {'kind': {'peptide_bond'}, 'distance': 3.8727523804137016}), ('A:GLU:3', 'A:LEU:4', {'kind': {'peptide_bond'}, 'distance': 3.8747165057588413}), ('A:LEU:4', 'A:LEU:164', {'kind': {'hydrophobic'}, 'distance': 10.371771497675795}), ('A:LEU:4', 'A:TYR:6', {'kind': {'hydrophobic'}, 'distance': 6.6816707491465035}), ('A:LEU:4', 'A:LEU:173', {'kind': {'hydrophobic'}, 'distance': 7.551629029553822}), ('A:LEU:4', 'A:LYS:5', {'kind': {'peptide_bond'}, 'distance': 3.791274324023521}), ('A:LYS:5', 'A:TYR:6', {'kind': {'peptide_bond'}, 'distance': 3.780888387667639}), ('A:TYR:6', 'A:PRO:230', {'kind': {'hydrophobic'}, 'distance': 4.915751519350831}), ('A:TYR:6', 'A:LEU:173', {'kind': {'hydrophobic'}, 'distance': 

In [10]:
# Convert to a directed graph; the edge view below then lists an edge in both directions with the same attributes
g = g.to_directed()

# Inspect the edge attributes of the directed graph
g.edges(data=True)


OutEdgeDataView([('A:MET:1', 'A:PHE:193', {'kind': {'hydrophobic'}, 'distance': 12.366443708681976}), ('A:MET:1', 'A:LEU:164', {'kind': {'hydrophobic'}, 'distance': 10.518538586704903}), ('A:MET:1', 'A:ALA:2', {'kind': {'peptide_bond'}, 'distance': 3.880878379954722}), ('A:ALA:2', 'A:MET:1', {'kind': {'peptide_bond'}, 'distance': 3.880878379954722}), ('A:ALA:2', 'A:GLU:3', {'kind': {'peptide_bond'}, 'distance': 3.8727523804137016}), ('A:GLU:3', 'A:ALA:2', {'kind': {'peptide_bond'}, 'distance': 3.8727523804137016}), ('A:GLU:3', 'A:LEU:4', {'kind': {'peptide_bond'}, 'distance': 3.8747165057588413}), ('A:LEU:4', 'A:LEU:164', {'kind': {'hydrophobic'}, 'distance': 10.371771497675795}), ('A:LEU:4', 'A:TYR:6', {'kind': {'hydrophobic'}, 'distance': 6.6816707491465035}), ('A:LEU:4', 'A:LEU:173', {'kind': {'hydrophobic'}, 'distance': 7.551629029553822}), ('A:LEU:4', 'A:GLU:3', {'kind': {'peptide_bond'}, 'distance': 3.8747165057588413}), ('A:LEU:4', 'A:LYS:5', {'kind': {'peptide_bond'}, 'distance

In [11]:
# Map every edge to its 'kind' attribute (a set of interaction names)
edge_attrs = nx.get_edge_attributes(g, 'kind')
# Check which container type .values() returns
type(edge_attrs.values())

dict_values

In [12]:
# Materialise the per-edge 'kind' sets as a list, in edge order
edge_values = list(edge_attrs.values())

In [13]:
# Multi-hot encode the edge kinds: one row per edge, one column per distinct kind
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
edge_values = mlb.fit_transform([list(kinds) for kinds in edge_values])


In [14]:
# Class labels found by the binarizer, i.e. the meaning of each encoded column
mlb.classes_

array(['aromatic', 'hbond', 'hydrophobic', 'ionic', 'peptide_bond'],
      dtype=object)

In [15]:
# Overwrite each edge's 'kind' set with its encoded row; rows follow the order of g.edges()
for row_idx, (src, dst) in enumerate(g.edges()):
    g.edges[src, dst]['kind'] = edge_values[row_idx]

In [17]:

from dgl import from_networkx
# from_networkx is given a directed graph here (g was already converted in an earlier cell, so this call is a repeat)
g = g.to_directed()
# Convert to DGL, carrying the listed node and edge attributes over
g_dgl=from_networkx(g, node_attrs=['residue_number', 'coords'], edge_attrs=['distance', 'kind'])
# Encoded 'kind' vector of the edge at index 1
g_dgl.edata['kind'][1]

tensor([0, 0, 1, 0, 0], dtype=torch.int32)

AttributeError: module 'torch_geometric.nn' has no attribute 'Module'