<a href="https://colab.research.google.com/github/epicskills1/Final_yr_Project/blob/main/Final_yr_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Installing Libraries**


In [None]:
!pip install pandas numpy rdkit-pypi scikit-learn tensorflow torch torch-geometric pyngrok streamlit

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manyl

In [None]:
# Colab-only step: prompts for a file upload. Upload tox21.csv here, because the
# training cell reads that file name from the working directory.
from google.colab import files
uploaded = files.upload()

Saving tox21.csv to tox21.csv


In [None]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten
from tensorflow.keras.optimizers import Adam
from spektral.layers import GCNConv
from spektral.data import Graph
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer

# Load the Tox21 table (one row per molecule: id, SMILES string, label columns)
data = pd.read_csv('tox21.csv')

# Replace every missing cell with 0.
# NOTE(review): this also turns missing assay labels into 0, so they are trained
# and scored as negatives -- confirm that is intended.
data = data.fillna(0)

# Split features and labels: SMILES strings in, every remaining column is a label
X = data['smiles'].values
y = data.drop(['mol_id', 'smiles'], axis=1).values

# Character-level tokenization of the SMILES strings.
# lower=False is required: SMILES is case-sensitive ('C' is an aliphatic carbon,
# 'c' an aromatic one), and the Tokenizer default lower=True would merge them
# into a single token before the vocabulary is built.
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
# Zero-pad on the right so every molecule has the length of the longest sequence
X_pad = tf.keras.preprocessing.sequence.pad_sequences(X_seq, padding='post')

# Split into training and test sets (fixed random_state keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

# Standardize each sequence position; the scaler is fitted on the training rows
# only and then reused for the test rows (and later for single-molecule inference)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define FCNN model
def build_fcnn(input_dim, num_labels=None):
    """Build and compile the fully connected multi-label classifier.

    Args:
        input_dim: Number of input features per sample.
        num_labels: Number of sigmoid output units. When omitted, falls back to
            the column count of the notebook-level label matrix ``y``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss, accuracy metric).
    """
    if num_labels is None:
        # Backward-compatible default: size the output from the global labels
        num_labels = y.shape[1]
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_labels, activation='sigmoid')  # one independent probability per label
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define C1DNN model
def build_c1dnn(input_shape, num_labels=None):
    """Build and compile the 1-D convolutional multi-label classifier.

    Args:
        input_shape: Per-sample input shape passed to the ``Conv1D`` layer
            (the notebook uses ``(sequence_length, 1)``).
        num_labels: Number of sigmoid output units. When omitted, falls back to
            the column count of the notebook-level label matrix ``y``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss, accuracy metric).
    """
    if num_labels is None:
        # Backward-compatible default: size the output from the global labels
        num_labels = y.shape[1]
    model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_labels, activation='sigmoid')  # one independent probability per label
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define GCNN model
def build_gcnn(input_shape, num_labels=None):
    """Build an (uncompiled) two-layer graph-convolutional multi-label classifier.

    Args:
        input_shape: Currently unused; kept so existing calls keep working.
        num_labels: Number of sigmoid output units. When omitted, falls back to
            the column count of the notebook-level label matrix ``y``.

    Returns:
        A ``tf.keras.Model`` subclass instance whose ``call`` expects a pair
        ``(node_features, adjacency)``. The model is not compiled here.
    """
    if num_labels is None:
        # Backward-compatible default: size the output from the global labels
        num_labels = y.shape[1]

    class GCNNModel(tf.keras.Model):
        def __init__(self):
            super().__init__()
            self.gconv1 = GCNConv(64, activation='relu')
            self.gconv2 = GCNConv(32, activation='relu')
            self.flatten = Flatten()
            self.dense1 = Dense(32, activation='relu')
            self.dense2 = Dense(num_labels, activation='sigmoid')  # one probability per label

        def call(self, inputs):
            # inputs is a pair: node features and the matching adjacency matrix
            x, adj = inputs
            x = self.gconv1([x, adj])
            x = self.gconv2([x, adj])
            x = self.flatten(x)
            x = self.dense1(x)
            x = self.dense2(x)
            return x

    return GCNNModel()

# Example usage
input_dim_fcnn = X_train.shape[1]
input_shape_c1dnn = (X_train.shape[1], 1)
num_nodes_gcnn = X_train.shape[1]

# Seed Python, NumPy and TensorFlow before any weights are created, so weight
# initialisation and batch shuffling repeat across Restart-and-Run-All.
tf.keras.utils.set_random_seed(42)

# Initialize models
fcnn = build_fcnn(input_dim_fcnn)
c1dnn = build_c1dnn(input_shape_c1dnn)
gcnn = build_gcnn(input_shape_c1dnn)

# Train and evaluate models.
# NOTE(review): the held-out test split doubles as validation data here, so the
# per-epoch validation numbers are not an independent estimate.
fcnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
# Conv1D needs a trailing channel axis, hence the expand_dims on both splits
c1dnn.fit(np.expand_dims(X_train, axis=-1), y_train, epochs=10, batch_size=32, validation_data=(np.expand_dims(X_test, axis=-1), y_test))
# For GCNN you need adjacency matrices (not covered in this simple example)
# gcnn.fit([X_train_gcnn, adj_train_gcnn], y_train, epochs=10, batch_size=32)

# Save models
fcnn.save('fcnn_model.h5')
c1dnn.save('c1dnn_model.h5')
# Save the GCNN model if used
# gcnn.save('gcnn_model.h5')

# Load saved models (round-trips the files that the inference cell also loads)
fcnn = tf.keras.models.load_model('fcnn_model.h5')
c1dnn = tf.keras.models.load_model('c1dnn_model.h5')
# Load the GCNN model if used
# gcnn = tf.keras.models.load_model('gcnn_model.h5')

# Meta-learning: Combine predictions using a simple average
def meta_learner(models, X_test):
    """Average the predictions of several models into one ensemble prediction.

    Each model must expose ``predict`` and ``input_shape`` (as Keras models do).
    A model whose ``input_shape`` has two entries receives ``X_test`` as is; any
    other model receives ``X_test`` with a trailing channel axis added, which is
    what the Conv1D model expects.

    Args:
        models: Non-empty sequence of fitted models.
        X_test: 2-D feature matrix of shape (n_samples, n_features).

    Returns:
        float64 array of shape (n_samples, n_labels): the element-wise mean of
        the individual model predictions.

    Raises:
        TypeError: If a model lacks ``predict`` or ``input_shape``. Previously
            such a model silently contributed an all-zero prediction, which
            dragged the ensemble average towards zero.
        ValueError: If ``models`` is empty, or the models disagree on the
            prediction shape.
    """
    features = np.asarray(X_test)
    member_predictions = []
    for model in models:
        input_shape = getattr(model, 'input_shape', None)
        if input_shape is None or not hasattr(model, 'predict'):
            raise TypeError(
                f'meta_learner cannot use {type(model).__name__}: '
                'it needs both predict() and input_shape'
            )
        if len(input_shape) == 2:  # FCNN case: (batch, features)
            batch = features
        else:  # C1DNN case: (batch, steps, channels)
            batch = np.expand_dims(features, axis=-1)
        member_predictions.append(np.asarray(model.predict(batch)))

    if not member_predictions:
        raise ValueError('meta_learner needs at least one model')

    # Stack to (n_samples, n_models, n_labels) and average over the model axis;
    # float64 keeps the result dtype of the earlier zero-buffer implementation.
    stacked = np.stack(member_predictions, axis=1).astype(np.float64)
    return stacked.mean(axis=1)

# Ensemble members; append gcnn to this list once it has been trained
models = [fcnn, c1dnn]
y_pred = meta_learner(models, X_test)

# Score the ensemble: threshold the averaged probabilities at 0.5, then compare
# every (molecule, label) cell against the ground truth
from sklearn.metrics import accuracy_score
y_pred_binary = np.greater(y_pred, 0.5).astype(int)
accuracy = accuracy_score(y_test.ravel(), y_pred_binary.ravel())
print(f'Ensemble Model Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


Ensemble Model Accuracy: 0.9402254839395873


In [None]:
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from keras.preprocessing.text import Tokenizer

# Restore the trained FCNN and C1DNN from the HDF5 files saved by the training cell
fcnn, c1dnn = (
    tf.keras.models.load_model(model_path)
    for model_path in ('fcnn_model.h5', 'c1dnn_model.h5')
)
# gcnn = tf.keras.models.load_model('gcnn_model.h5')  # If GCNN model is used

# Function to preprocess a single SMILES string
def preprocess_smiles(smiles, tokenizer, scaler):
    """Convert one SMILES string into a scaled feature row for the trained models.

    Args:
        smiles: A single SMILES string.
        tokenizer: Fitted tokenizer; its ``texts_to_sequences`` maps the string
            to token ids.
        scaler: Fitted scaler; its ``n_features_in_`` gives the sequence width
            and its ``transform`` is applied to the padded row.

    Returns:
        Array of shape (1, n_features), where n_features is the width the
        scaler was fitted on.
    """
    # Take the target width from the fitted scaler instead of the notebook-level
    # X_train, so the function depends only on its arguments and keeps working
    # if X_train is reassigned or this cell runs without the training matrix.
    n_features = scaler.n_features_in_
    seq = tokenizer.texts_to_sequences([smiles])
    # Right-pad short inputs with zeros. NOTE(review): inputs longer than
    # n_features are truncated by pad_sequences -- confirm that is acceptable.
    pad_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=n_features, padding='post')
    scaled_seq = scaler.transform(pad_seq)
    return scaled_seq

# Function to get predictions
def get_predictions(smiles):
    """Return the FCNN/C1DNN ensemble-averaged prediction for one SMILES string.

    Relies on the notebook-level ``tokenizer``, ``scaler``, ``fcnn`` and
    ``c1dnn`` objects.
    """
    features = preprocess_smiles(smiles, tokenizer, scaler)
    member_preds = [
        fcnn.predict(features),
        # The Conv1D model takes the same row with a trailing channel axis
        c1dnn.predict(np.expand_dims(features, axis=-1)),
        # gcnn.predict([features, adj]),  # If using GCNN and adjacency matrix
    ]
    # Element-wise mean across the ensemble members
    return np.mean(member_preds, axis=0)

# Example usage: score a single molecule given as a SMILES string
smiles = 'CCOc1ccc2nc(S(N)(=O)=O)sc2c1'
prediction = get_predictions(smiles)
# Prints the averaged per-label probabilities (one row, one value per label)
print(f'Toxicity Prediction: {prediction}')

Toxicity Prediction: [[0.01933786 0.01720362 0.14914522 0.07645066 0.0726798  0.03546712
  0.07644712 0.12187554 0.07791471 0.08815245 0.16553213 0.04245148]]
