In [None]:
import pandas as pd
import os
import graphein
import pickle
path = '../datasets/ALPHAFOLD PDBs/'
import numpy as np


## create a df with mutation name and path to che .pdb file

In [None]:

pdb_files = []
for folder in os.listdir(path):
    # if the folder is a directory
    if os.path.isdir(path + folder):
        # for every file in the folder
        for file in os.listdir(path + folder):
            if 'rank_001' in file and file.endswith('.pdb'):
                pdb_files.append(path + folder + '/' + file)



In [None]:
pdb_files

In [None]:
# for each pdb file in pdb_files, get name of the mutation which is the name of the folder before '_'
mutations = []
for pdb in pdb_files:
    mutations.append(pdb.split('/')[-2].split('_')[0])


In [None]:
mutations

In [None]:
# create a dataframe with the mutation and the pdb file

df = pd.DataFrame({'mutation': mutations, 'pdb_file': pdb_files})

out_path = "../datasets/pdb_files.csv"
if not os.path.isfile(out_path):
    df.to_csv(out_path, index=False)


## Parse the .pdb files

In [None]:
from Bio.PDB import PDBParser   
structures = {}
parser = PDBParser()
for i, row in df.iterrows():
    structures[row['mutation']] = parser.get_structure(row['mutation'], row['pdb_file'])
 

In [None]:
structures

## Create graphs from the .pdb files

In [None]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.graphs import construct_graph

config = ProteinGraphConfig()
(config.dict())

In [None]:
graphs = {}
for i, row in df.iterrows():
    print(row['mutation'])
    graphs[row['mutation'] ] = construct_graph(path = row['pdb_file'], config= config, )

In [72]:
graph_original = graphein.protein.graphs.construct_graph(path ='../datasets/ALPHAFOLD PDBs/HGD_normal.pdb', config= config )
structure_original = parser.get_structure('HGD_normal', '../datasets/ALPHAFOLD PDBs/HGD_normal.pdb')

Output()

In [None]:
from graphein.protein.visualisation import plotly_protein_structure_graph

p = plotly_protein_structure_graph(
    graphs['G161R'],
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [None]:
from graphein.protein.config import ProteinGraphConfig
from graphein.protein.edges.distance import *
from graphein.protein.graphs import construct_graph

edge_fns = [
    add_aromatic_interactions,
    add_hydrophobic_interactions,
    add_aromatic_sulphur_interactions,
    add_cation_pi_interactions,
    add_disulfide_interactions,
    add_hydrogen_bond_interactions,
    add_ionic_interactions,
    add_peptide_bonds
    ]
config = ProteinGraphConfig(edge_construction_functions=edge_fns)

g = construct_graph(config=config, path= df['pdb_file'][0])

In [None]:
p = plotly_protein_structure_graph(
    g,
    colour_edges_by="kind",
    colour_nodes_by="degree",
    label_node_ids=False,
    plot_title="Peptide backbone graph. Nodes coloured by degree.",
    node_size_multiplier=1
    )
p.show()

In [70]:

def get_contact_map(graph):
    contact_map = np.zeros((446,446))
    # contact map
    for residue in graph.get_residues():
        for residue2 in graph.get_residues():
            if residue != residue2:
                if residue['CA'] - residue2['CA'] < 8:
                    contact_map[residue.id[1], residue2.id[1]] = 1
    return contact_map

contact_map = get_contact_map(structures['G161R'])


In [86]:
contact_map_original = get_contact_map(structure_original)

In [87]:
for i in range(446):
    for j in range(446):
        if contact_map[i,j] != contact_map_original[i,j]:
            if contact_map[i,j] == 1:
                contact_map_original[i,j] = -1
            else:
                contact_map[i,j] = -1


In [89]:
import matplotlib.pyplot as plt
# 0 -> white, 1 -> black
# invert white and black
#contact_map = 1 - contact_map
#contact_map_original = 1 - contact_map_original
fig, ax = plt.subplots(1,2, figsize=(10,5))
ax[0].imshow(contact_map)
ax[1].imshow(contact_map_original)
# plot red dots where the matrixes are -1
for i in range(446):
    for j in range(446):
        if contact_map[i,j] == -1:
            ax[0].plot(j,i, 'ro')
        if contact_map_original[i,j] == -1:
            ax[1].plot(j,i, 'ro')



KeyboardInterrupt: 

In [None]:
import networkx as nx
# check if graphs are isomorphic
G1 = graphs['G161R']
G2 = graphs['G170A']

nx.is_isomorphic(G1, G2)

In [None]:
# plot structures['G161R'] 

import nglview as nv
view = nv.show_biopython(structures['G161R'])
view


In [None]:
# save the graphs
os.makedirs('../datasets/graphs/', exist_ok=True)
for i, graph in enumerate(graphs):
    mut = df.iloc[i]['mutation']
    with open(f'../datasets/graphs/graph_{mut}.gpickle', 'wb') as f:
        pickle.dump(graph, f)

## Associate to each patient its mutations

In [None]:
import pandas as pd
df_patients =pd.read_excel('../datasets/aku_prin_v2.0.xlsx')

In [None]:
df_patients = df_patients[['Protein change allele 1 ', 'Protein change allele 2']]

In [None]:
df_patients['graph_allele1'] = [graphs[mut] if mut in graphs else None for mut in df_patients['Protein change allele 1 '] ]
df_patients['graph_allele2'] = [graphs[mut] if mut in graphs else None for mut in df_patients['Protein change allele 2'] ]
df_patients['structure_allele1'] = [structures[mut] if mut in structures else None for mut in df_patients['Protein change allele 1 '] ] 
df_patients['structure_allele2'] = [structures[mut] if mut in structures else None for mut in df_patients['Protein change allele 2'] ]

In [None]:
#show all rows
pd.set_option('display.max_columns', 500)
df_patients

In [None]:
df_patients.to_csv('../datasets/aku_prin_v2.0_with_graphs.csv', index=False)



In [None]:
#read the csv file
df_patients_2 = pd.read_csv('../datasets/aku_prin_v2.0_with_graphs.csv')


In [None]:
graph_test = df_patients_2['graph_allele1'][0]
graph_test_og = df_patients['graph_allele1'][0]

In [None]:
type(graph_test_og)

In [None]:
import networkx as nx

def string_to_graph(input_string):
    # Initialize a NetworkX graph
    G = nx.Graph()

    # Split the input string into lines
    lines = input_string.strip().split('\n')

    # Iterate through each line
    for line in lines:
        # Split the line into nodes and edges
        nodes = line.strip().split()
        # Assuming the first element is a node and the rest are its neighbors
        node = nodes[0]
        neighbors = nodes[1:]

        # Add the node to the graph
        G.add_node(node)

        # Add edges between the node and its neighbors
        for neighbor in neighbors:
            G.add_edge(node, neighbor)

    return G

In [None]:
type(graph_test)
# convert to a scipy sparse matrix
graph = string_to_graph(graph_test)
type(graph)

In [None]:
graph.edges

In [None]:
from torch_geometric.data import Data


In [None]:
from utils import create_graph_df
df_p = create_graph_df()
df_p

In [None]:
type(df_p['graph_allele1'][0])