In [3]:
import pandas as pd
import os
import graphein
import pickle
path = '../datasets/ALPHAFOLD PDBs/'
import numpy as np
from graphein.protein.visualisation import plotly_protein_structure_graph
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph
from Bio.PDB import PDBParser   
from utils import *

In [1]:
import torch_geometric

## create a df with mutation name and path to che .pdb file

In [32]:

pdb_files = []
for folder in os.listdir(path):
    # if the folder is a directory
    if os.path.isdir(path + folder):
        # for every file in the folder
        for file in os.listdir(path + folder):
            if 'rank_001' in file and file.endswith('.pdb'):
                pdb_files.append(path + folder + '/' + file)



In [33]:
# for each pdb file in pdb_files, get name of the mutation which is the name of the folder before '_'
mutations = []
for pdb in pdb_files:
    mutations.append(pdb.split('/')[-2].split('_')[0])

pdb_files.append('../datasets/ALPHAFOLD PDBs/H371fs_9b3dd/H371fs_9b3dd_unrelaxed_rank_001_alphafold2_ptm_model_3_seed_000.pdb')
mutations.append('His371Profs')

pdb_files.append('../datasets/ALPHAFOLD PDBs/K353Q_ed31e/K353Q_ed31e_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb')
mutations.append('L353Q')

# create a dataframe with the mutation and the pdb file

df = pd.DataFrame({'mutation': mutations, 'pdb_file': pdb_files})

out_path = "../datasets/pdb_files.csv"



In [34]:
# search the index of Ala218 in the df

df['mutation'] = df['mutation'].replace('Ala218', '(p.(Ala218_Asn219insLysIle))')
df['mutation'] = df['mutation'].replace('E168', 'E168*')
df['mutation'] = df['mutation'].replace('G115Mfs', 'G115Mfs*')
df['mutation'] = df['mutation'].replace('G372', 'G372_P373delinsA')
df['mutation'] = df['mutation'].replace('R321', 'R321*')
df['mutation'] = df['mutation'].replace('W60', 'W60*')
# add a row with the mutation and the pdb file



In [7]:
df.to_csv(out_path, index=False)

## Parse the .pdb files

In [6]:
structures = {}
parser = PDBParser()
for i, row in df.iterrows():
    structures[row['mutation']] = parser.get_structure(row['mutation'], row['pdb_file'])
 

## Create graphs from the .pdb files

In [7]:


config = ProteinGraphConfig()
(config.dict())

{'granularity': 'CA',
 'keep_hets': [],
 'insertions': True,
 'alt_locs': 'max_occupancy',
 'pdb_dir': None,
 'verbose': False,
 'exclude_waters': True,
 'deprotonate': False,
 'protein_df_processing_functions': None,
 'edge_construction_functions': [<function graphein.protein.edges.distance.add_peptide_bonds(G: 'nx.Graph') -> 'nx.Graph'>],
 'node_metadata_functions': [<function graphein.protein.features.nodes.amino_acid.meiler_embedding(n: str, d: Dict[str, Any], return_array: bool = False) -> Union[pandas.core.series.Series, numpy.ndarray]>],
 'edge_metadata_functions': None,
 'graph_metadata_functions': None,
 'get_contacts_config': None,
 'dssp_config': None}

In [None]:
graphs = {}
for i, row in df.iterrows():
    print(row['mutation'])
    graphs[row['mutation'] ] = construct_graph(path = row['pdb_file'], config= config, )

In [9]:
graph_original = graphein.protein.graphs.construct_graph(path ='../datasets/ALPHAFOLD PDBs/HGD_normal.pdb', config= config )
structure_original = parser.get_structure('HGD_normal', '../datasets/ALPHAFOLD PDBs/HGD_normal.pdb')

Output()

In [10]:
from graphein.protein.visualisation import plotly_protein_structure_graph

p = plotly_protein_structure_graph(
    graphs['G161R'],
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [None]:
import networkx as nx
# check if graphs are isomorphic
G1 = graphs['G161R']
G2 = graphs['G170A']

nx.is_isomorphic(G1, G2)

In [None]:
# plot structures['G161R'] 

import nglview as nv
view = nv.show_biopython(structures['G161R'])
view

In [None]:
# save the graphs
os.makedirs('../datasets/graphs/', exist_ok=True)
for i, graph in enumerate(graphs):
    mut = df.iloc[i]['mutation']
    with open(f'../datasets/graphs/graph_{mut}.gpickle', 'wb') as f:
        pickle.dump(graph, f)

## Associate to each patient its mutations

In [None]:
import pandas as pd
df_patients =pd.read_excel('../datasets/aku_prin_v2.0.xlsx')

In [None]:
df_patients = df_patients[['Protein change allele 1 ', 'Protein change allele 2']]

In [None]:
df_patients['graph_allele1'] = [graphs[mut] if mut in graphs else None for mut in df_patients['Protein change allele 1 '] ]
df_patients['graph_allele2'] = [graphs[mut] if mut in graphs else None for mut in df_patients['Protein change allele 2'] ]
df_patients['structure_allele1'] = [structures[mut] if mut in structures else None for mut in df_patients['Protein change allele 1 '] ] 
df_patients['structure_allele2'] = [structures[mut] if mut in structures else None for mut in df_patients['Protein change allele 2'] ]

In [None]:
#show all rows
pd.set_option('display.max_columns', 500)
df_patients

In [None]:
df_patients.to_csv('../datasets/aku_prin_v2.0_with_graphs.csv', index=False)



In [None]:
#read the csv file
df_patients_2 = pd.read_csv('../datasets/aku_prin_v2.0_with_graphs.csv')


In [None]:
graph_test = df_patients_2['graph_allele1'][0]
graph_test_og = df_patients['graph_allele1'][0]

In [None]:
type(graph_test_og)

In [None]:
import networkx as nx

def string_to_graph(input_string):
    # Initialize a NetworkX graph
    G = nx.Graph()

    # Split the input string into lines
    lines = input_string.strip().split('\n')

    # Iterate through each line
    for line in lines:
        # Split the line into nodes and edges
        nodes = line.strip().split()
        # Assuming the first element is a node and the rest are its neighbors
        node = nodes[0]
        neighbors = nodes[1:]

        # Add the node to the graph
        G.add_node(node)

        # Add edges between the node and its neighbors
        for neighbor in neighbors:
            G.add_edge(node, neighbor)

    return G

In [None]:
type(graph_test)
# convert to a scipy sparse matrix
graph = string_to_graph(graph_test)
type(graph)

In [None]:
from torch_geometric.data import Data


In [None]:
from utils import create_graph_df
df_p = create_graph_df()

## Convert to pytorch geometric

In [37]:
mutations_to_path = {}
for i, row in df.iterrows():
    mutations_to_path[row['mutation']] = row['pdb_file']

In [38]:
mutations_to_path

{'A122V': '../datasets/ALPHAFOLD PDBs/A122V_41d52/A122V_41d52_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 'A218fs': '../datasets/ALPHAFOLD PDBs/A218fs_8b909/A218fs_8b909_unrelaxed_rank_001_alphafold2_ptm_model_4_seed_000.pdb',
 'A267V': '../datasets/ALPHAFOLD PDBs/A267V_1c2c3/A267V_1c2c3_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 '(p.(Ala218_Asn219insLysIle))': '../datasets/ALPHAFOLD PDBs/Ala218_Asn219insLysIle_a4e58/Ala218_Asn219insLysIle_a4e58_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 'C120F': '../datasets/ALPHAFOLD PDBs/C120F_3f78a/C120F_3f78a_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 'D153fs': '../datasets/ALPHAFOLD PDBs/D153fs_315fd/D153fs_315fd_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 'D153G': '../datasets/ALPHAFOLD PDBs/D153G_f3577_0/D153G_f3577_0_unrelaxed_rank_001_alphafold2_ptm_model_5_seed_000.pdb',
 'D18N': '../datasets/ALPHAFOLD PDBs/D18N_1fe41/D18N_1fe41_unrelaxed_rank_001_alphafold2_ptm_model_5

In [40]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import *
from graphein.protein.graphs import construct_graph

from torch_geometric import nn
edge_fns = [
    add_aromatic_interactions,
    add_hydrophobic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds
    ]
config = ProteinGraphConfig(edge_construction_functions=edge_fns)

g_original = construct_graph(config=config, path='../datasets/ALPHAFOLD PDBs/HGD_normal.pdb')
g_g161r = construct_graph(config=config, path= mutations_to_path['G161R'])
g_r330s = construct_graph(config=config, path= mutations_to_path['R330S'])


Output()

Output()

Output()

In [42]:
# active sites : His292, His335, His365, His371, and Glu341.
active_sites =['A:HIS:292', 'A:HIS:335', 'A:HIS:365', 'A:HIS:371', 'A:GLU:341']
# get neighbours of active sites
def get_neighbours(active_sites, g):
    active_sites_neighbours = []
    for site in active_sites:
        active_sites_neighbours.append(list(g.neighbors(site)))

    active_sites_neighbours = [item for sublist in active_sites_neighbours for item in sublist]
    active_sites_neighbours = list(set(active_sites_neighbours) | set(active_sites))
    return active_sites_neighbours


In [43]:
# convert the list of lists to a single list
active_sites_neighbours = get_neighbours(active_sites, g_original)
active_sites_neighbours_g161r = get_neighbours(active_sites, g_g161r)
active_sites_neighbours_r330s = get_neighbours(active_sites, g_r330s)

In [44]:
active_sites_neighbours

['A:ARG:336',
 'A:GLU:351',
 'A:ASP:291',
 'A:TYR:423',
 'A:GLU:341',
 'A:TRP:427',
 'A:SER:340',
 'A:PRO:370',
 'A:HIS:335',
 'A:HIS:371',
 'A:SER:366',
 'A:TYR:333',
 'A:GLY:372',
 'A:LEU:364',
 'A:PHE:342',
 'A:HIS:292',
 'A:ALA:293',
 'A:HIS:365',
 'A:TYR:334']

In [47]:
from utils import get_subgraph

# get the subgraph of the active sites and neighbours
g_active = get_subgraph(g_original, active_sites_neighbours)
g_active_g161r = get_subgraph(g_g161r, active_sites_neighbours_g161r)
g_active_r330s = get_subgraph(g_r330s, active_sites_neighbours_r330s)

In [52]:
p = plotly_protein_structure_graph(
    g_active,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Subgraph of active sites and their neighbors (original HGD)",
    node_size_multiplier=1
    )

p.show()
p = plotly_protein_structure_graph(
    g_active_g161r,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Subgraph of active sites and their neighbors (G161R)",
    node_size_multiplier=1
    )

p.show()

p = plotly_protein_structure_graph(
    g_active_r330s,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Subgraph of active sites and their neighbors (R330S)",
    node_size_multiplier=1
    )

p.show()

In [55]:
# check if the subgraphs are isomorphic
print(nx.is_isomorphic(g_active, g_active_g161r))
print(nx.is_isomorphic(g_active, g_active_r330s))

# edit distance between the subgraphs

from networkx.algorithms.similarity import graph_edit_distance

distance = graph_edit_distance(g_active, g_active_g161r)
distance




False
False


KeyboardInterrupt: 

In [66]:
# create two networkx graphs
G1 = nx.Graph()
G2 = nx.Graph()

# add nodes
G1.add_nodes_from(g_active.nodes)
G2.add_nodes_from(g_active_g161r.nodes)

# add edges withou attributes
G1.add_edges_from(g_active.edges)
G2.add_edges_from(g_active_g161r.edges)
for node in G1.nodes:
    print(node)

print('-----------------')

for node in G2.nodes:
    print(node)

# add attributes to the nodes
for node in G1.nodes:
    G1.nodes[node]['kind'] = g_active.nodes[node]['kind']



A:ARG:336
A:GLU:351
A:ASP:291
A:TYR:423
A:GLU:341
A:TRP:427
A:SER:340
A:PRO:370
A:HIS:335
A:HIS:371
A:SER:366
A:TYR:333
A:GLY:372
A:LEU:364
A:PHE:342
A:HIS:292
A:ALA:293
A:HIS:365
A:TYR:334
-----------------
A:ARG:336
A:ASP:291
A:TYR:423
A:GLU:341
A:TRP:427
A:SER:340
A:PRO:370
A:HIS:335
A:SER:366
A:HIS:371
A:TYR:333
A:GLY:372
A:LEU:364
A:PHE:342
A:ASP:294
A:HIS:292
A:ALA:293
A:HIS:365
A:TYR:334


In [None]:
# add a node to G1
G1.add_node('A:GLU:341')

# add an edge to G1
G1.add_edge('A:GLU:341', 'A:HIS:292')

In [78]:
# edit distance between the subgraphs

from networkx.algorithms.similarity import optimize_graph_edit_distance

def node_match(n1, n2):
    return n1['residue_number'] == n2['residue_number']



for d in optimize_graph_edit_distance(g_active, g_active_g161r, node_match=node_match):
    print(d)
d_g161r = d

for d in optimize_graph_edit_distance(g_active, g_active_r330s, node_match=node_match):
    print(d)
d_r330s = d

5.0
2.0


In [79]:
d_g161r, d_r330s

(5.0, 2.0)

In [10]:
g = g.to_directed()

# see g edges attributes
g.edges(data=True)


OutEdgeDataView([('A:MET:1', 'A:PHE:193', {'kind': {'hydrophobic'}, 'distance': 12.366443708681976}), ('A:MET:1', 'A:LEU:164', {'kind': {'hydrophobic'}, 'distance': 10.518538586704903}), ('A:MET:1', 'A:ALA:2', {'kind': {'peptide_bond'}, 'distance': 3.880878379954722}), ('A:ALA:2', 'A:MET:1', {'kind': {'peptide_bond'}, 'distance': 3.880878379954722}), ('A:ALA:2', 'A:GLU:3', {'kind': {'peptide_bond'}, 'distance': 3.8727523804137016}), ('A:GLU:3', 'A:ALA:2', {'kind': {'peptide_bond'}, 'distance': 3.8727523804137016}), ('A:GLU:3', 'A:LEU:4', {'kind': {'peptide_bond'}, 'distance': 3.8747165057588413}), ('A:LEU:4', 'A:LEU:164', {'kind': {'hydrophobic'}, 'distance': 10.371771497675795}), ('A:LEU:4', 'A:TYR:6', {'kind': {'hydrophobic'}, 'distance': 6.6816707491465035}), ('A:LEU:4', 'A:LEU:173', {'kind': {'hydrophobic'}, 'distance': 7.551629029553822}), ('A:LEU:4', 'A:GLU:3', {'kind': {'peptide_bond'}, 'distance': 3.8747165057588413}), ('A:LEU:4', 'A:LYS:5', {'kind': {'peptide_bond'}, 'distance

In [11]:
# get the edge attributes
edge_attrs = nx.get_edge_attributes(g, 'kind')
# convert sets to frozensets
type(edge_attrs.values())

dict_values

In [12]:
edge_values =edge_attrs.values()
# get one element of edge_values
edge_values = list(edge_values)

In [13]:
# one hot encode edge values
edge_values = [list(edge) for edge in edge_values]
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
edge_values = mlb.fit_transform(edge_values)


In [14]:
# see labels associated with the one hot encoding
mlb.classes_

array(['aromatic', 'hbond', 'hydrophobic', 'ionic', 'peptide_bond'],
      dtype=object)

In [15]:
#assign the one hot encoded values to the edge attributes
for i, edge in enumerate(g.edges()):
    g.edges[edge[0], edge[1]]['kind'] = edge_values[i]

In [17]:

from dgl import from_networkx
g = g.to_directed()
g_dgl=from_networkx(g, node_attrs=['residue_number', 'coords'], edge_attrs=['distance', 'kind'])
g_dgl.edata['kind'][1]

tensor([0, 0, 1, 0, 0], dtype=torch.int32)