# GNN project - node classification

## EDA

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from random import randint
import scipy.sparse as sp
from tensorflow.keras import layers
import tensorflow as tf


In [2]:
# Read the tab-separated gene table and give the verbose essentiality
# column a short name, all in one chained expression.
data = pd.read_csv('G-HumanEssential.tsv', sep='\t').rename(
    columns={'Essentiality (determined from multiple datasets)': 'Essentiality'}
)

In [3]:
# Preview the loaded table (bare expression, so the notebook renders it)
data

Unnamed: 0,Gene ID,Essentiality,Data Source
0,122809,Non-essential,"PMID:26472758,PMID:26472760,PMID:18239125"
1,64359,Non-essential,"PMID:26472758,PMID:26472760,PMID:18239125"
2,11218,Essential,"PMID:26472758,PMID:26472760,PMID:18239125,PMID..."
3,401236,Non-essential,PMID:26472758
4,27115,Non-essential,"PMID:26472758,PMID:26472760,PMID:18239125"
...,...,...,...
18523,10647,Non-essential,"PMID:26472758,PMID:26472760,PMID:18239125"
18524,84937,Non-essential,"PMID:26472758,PMID:26472760,PMID:19490893,PMID..."
18525,4246,Non-essential,"PMID:26472758,PMID:26472760,PMID:18239125"
18526,51082,Non-essential,"PMID:26472758,PMID:26472760,PMID:18239125,PMID..."


In [5]:
# Report the table dimensions as (rows, columns)
print(data.shape)

(18528, 3)


In [7]:
# Count the distinct identifiers in the 'Gene ID' column.
# dropna=False keeps the result identical to len(<column>.unique()).
gene_count = data['Gene ID'].nunique(dropna=False)
print("Number of different genes:", gene_count)

Number of different genes: 18528


In [8]:
# Collect the distinct class labels found in the 'Essentiality' column
unique_ess = data['Essentiality'].unique()

# Print a heading, then all labels on a single tab-separated line
print("Essentiality values:")
print("\t".join(str(label) for label in unique_ess))


Essentiality values:
Non-essential	Essential


## Creating a graph

In [26]:
# Build a graph that links every gene to the publications (PMIDs) listed
# as its data sources.
graph = nx.Graph()

# One node per gene, annotated with its essentiality class.
# Iterating the columns directly avoids the per-row Series that
# DataFrame.iterrows() builds.
graph.add_nodes_from(
    (gene_id, {'essentiality': essentiality})
    for gene_id, essentiality in zip(data['Gene ID'], data['Essentiality'])
)

# One edge per (gene, source) pair. add_edge creates the source node the
# first time it is seen; those source nodes carry no 'essentiality' attribute.
for gene_id, sources in zip(data['Gene ID'], data['Data Source']):
    if pd.isna(sources):
        # No recorded source: keep the gene as an isolated node instead of
        # failing on NaN.split(...)
        continue
    for source in sources.split(','):
        source = source.strip()  # tolerate "a, b" style separators
        if source:  # ignore empty tokens such as a trailing comma
            graph.add_edge(gene_id, source)

In [39]:
# Create a small NetworkX example graph.
# NOTE: this rebinds `graph`, so the gene graph assigned to the same name
# earlier in the notebook is no longer reachable afterwards.
graph = nx.Graph()

# Add nodes and edges to the graph (a triangle on nodes 1, 2 and 3)
graph.add_edges_from([(1, 2), (2, 3), (3, 1)])

# Assign the 'Essentiality' attribute to every node (example values).
# All three nodes get a value because the training cell derives a label
# from this attribute for each node of the graph.
nx.set_node_attributes(
    graph,
    {1: 'Essential', 2: 'Non-essential', 3: 'Essential'},
    name='Essentiality',
)

# Convert the graph to a dense adjacency array
# (adjacency_matrix returns a SciPy sparse object; rows follow graph.nodes order)
adjacency = nx.adjacency_matrix(graph).toarray()

print(adjacency)


[[0 1 1]
 [1 0 1]
 [1 1 0]]


In [40]:
# Define the TensorFlow model: two 64-unit ReLU hidden layers followed by a
# single sigmoid output unit. The input width is taken from the number of
# rows of `adjacency`.
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(adjacency.shape[0],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

In [41]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Prepare the labels for training (1 = 'Essential', 0 = any other class).
# .get() is used so that a node without an 'Essentiality' attribute does not
# raise KeyError; such nodes are left out of the supervised samples rather
# than being given a made-up class.
node_classes = [graph.nodes[n].get('Essentiality') for n in graph.nodes]
has_label = np.array([c is not None for c in node_classes], dtype=bool)
labels = np.array([1 if c == 'Essential' else 0 for c in node_classes], dtype=int)[has_label]
# Boolean mask over rows; assumes `adjacency` rows are in graph.nodes order
# (the default for nx.adjacency_matrix) - confirm if adjacency is built differently.
samples = adjacency[has_label]

# Split the data and labels into training and validation sets
# (first 80% for training, remainder for validation, in node order)
split_index = int(0.8 * len(samples))
adjacency_train, adjacency_val = samples[:split_index], samples[split_index:]
labels_train, labels_val = labels[:split_index], labels[split_index:]

# Train the model; the returned History is kept so the training curves can be
# inspected later and its repr is not echoed as the cell output.
history = model.fit(adjacency_train, labels_train, epochs=10, validation_data=(adjacency_val, labels_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21405560e80>