<a href="https://colab.research.google.com/github/christophergaughan/GNN-Antibiotics/blob/main/GNN_CHEMBL_test_tables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors

# Select the compute device: CUDA when PyTorch detects a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU")

# Query ChEMBL for IC50 activity records against target CHEMBL240, requesting
# only the fields listed below.
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Preview the raw fields. display() is required here: a bare expression is only
# rendered when it is the last statement of a cell, so `data.head()` on its own
# line in the middle of the cell shows nothing.
display(data.head())

# Filter and clean the dataset.
# Keep the identifier, structure and potency columns and drop rows missing any of them.
filtered_data = data[['molecule_chembl_id', 'canonical_smiles', 'standard_value']].dropna()
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
# Values that could not be parsed as numbers became NaN above; drop those rows.
filtered_data = filtered_data.dropna()
# Label per the notebook's assumption: IC50 <= 1000 nM is active (1), otherwise inactive (0).
filtered_data['active'] = (filtered_data['standard_value'] <= 1000).astype(int)

# Compute six RDKit descriptors per molecule, collected in one list per descriptor.
h_bond_donors, h_bond_acceptors, molecular_weights = [], [], []
num_rings, tpsa, rotatable_bonds = [], [], []

for smiles in filtered_data['canonical_smiles']:
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # No molecule could be built from this SMILES: record NaN in every list
        # so that each list still holds exactly one entry per row.
        h_bond_donors.append(np.nan)
        h_bond_acceptors.append(np.nan)
        molecular_weights.append(np.nan)
        num_rings.append(np.nan)
        tpsa.append(np.nan)
        rotatable_bonds.append(np.nan)
        continue
    h_bond_donors.append(Descriptors.NumHDonors(mol))
    h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
    molecular_weights.append(Descriptors.MolWt(mol))
    num_rings.append(Descriptors.RingCount(mol))
    tpsa.append(Descriptors.TPSA(mol))
    rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))

# Attach the descriptor lists as new columns (values are matched to rows by position).
filtered_data = filtered_data.assign(
    h_bond_donors=h_bond_donors,
    h_bond_acceptors=h_bond_acceptors,
    molecular_weight=molecular_weights,
    num_rings=num_rings,
    tpsa=tpsa,
    rotatable_bonds=rotatable_bonds,
)

# Last expression of the cell: renders the first rows of the enriched dataset.
filtered_data.head()

In [None]:
# Inspect the first 25 rows to ensure all fields are retrieved correctly.
# display() is needed because only the last expression of a cell is rendered
# automatically; without it this preview is silently discarded.
display(data.head(25))
# Check all available columns to see if additional fields were included
data.columns


In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Select the compute device: CUDA when PyTorch detects a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU")

# Query ChEMBL for IC50 activity records against target CHEMBL240, requesting
# only the fields listed below.
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Preview the raw fields. display() is required here: a bare expression is only
# rendered when it is the last statement of a cell.
display(data.head())

# Filter and clean the dataset.
# Only the identifier, structure and potency are mandatory. assay_description and
# activity_comment are optional free text that the TF-IDF step in this cell fills
# with '' when missing, so rows lacking them must not be dropped here.
filtered_data = data[
    ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']
].dropna(subset=['molecule_chembl_id', 'canonical_smiles', 'standard_value'])
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
# Values that could not be parsed as numbers became NaN above; drop those rows.
filtered_data = filtered_data.dropna(subset=['standard_value'])
# Label per the notebook's assumption: IC50 <= 1000 nM is active (1), otherwise inactive (0).
filtered_data['active'] = (filtered_data['standard_value'] <= 1000).astype(int)

# Stepwise calculation of molecular descriptors using RDKit.
# One list per output column; each must end up with exactly one entry per row.
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Compute the descriptors and a 2048-bit Morgan fingerprint (radius 2) per molecule.
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector
    else:
        # Unparseable SMILES: append placeholders so every list keeps one entry
        # per row. Without them the lists are shorter than the DataFrame and the
        # column assignments below raise a length-mismatch ValueError.
        h_bond_donors.append(np.nan)
        h_bond_acceptors.append(np.nan)
        molecular_weights.append(np.nan)
        num_rings.append(np.nan)
        tpsa.append(np.nan)
        rotatable_bonds.append(np.nan)
        fingerprints.append(None)

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Rows whose SMILES failed to parse carry None in 'fingerprints'; remove them.
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF.
# Missing text is treated as an empty string; the vocabulary is capped at 500 terms.
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data.
# A vocabulary term that equals an existing column name (for example a term
# 'active' next to the 'active' label column) would create duplicate column
# labels, after which filtered_data['active'] returns a DataFrame instead of a
# Series. Only such colliding terms are prefixed; all other names are unchanged.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=[
        f"tfidf_{term}" if term in filtered_data.columns else term
        for term in vectorizer.get_feature_names_out()
    ],
)
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Show the first rows of the enriched dataset. display() is required because this
# is not the last statement of the cell.
display(filtered_data.head())

# Sanity check: fingerprint a trivial molecule (ethanol, 'CCO') to tell a problem
# in the fingerprint call apart from a problem with the dataset's SMILES.
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if not mol:
    print("Failed to create molecule from SMILES.")
else:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")

# Dump the fingerprints column for debugging
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Diagnostic pass: re-run parsing and every descriptor/fingerprint calculation on
# each SMILES purely to find the ones that fail. The results are deliberately
# discarded rather than appended to the shared descriptor lists: when the
# notebook is run top to bottom those lists were already filled (and stored in
# filtered_data) by the preceding cell, so appending again would double their
# length and grow them further on every re-run of this cell.
failed_smiles = []

for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            # RDKit could not build a molecule from this SMILES
            failed_smiles.append(smiles)
            continue
        Descriptors.NumHDonors(mol)
        Descriptors.NumHAcceptors(mol)
        Descriptors.MolWt(mol)
        Descriptors.RingCount(mol)
        Descriptors.TPSA(mol)
        Descriptors.NumRotatableBonds(mol)
        rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    except Exception as e:
        # Broad catch is intentional: the goal is to list every SMILES that
        # breaks any step, not to stop at the first failure.
        print(f"Error processing SMILES {smiles}: {e}")
        failed_smiles.append(smiles)

print("\nFailed SMILES:")
print(failed_smiles)


In [None]:
# Report how many rows remain in the processed dataset
print(f"Number of rows after processing: {filtered_data.shape[0]}")


In [None]:
# Tally the rows carrying each activity label (1 = active, 0 = inactive)
active_count = len(filtered_data[filtered_data['active'].eq(1)])
inactive_count = len(filtered_data[filtered_data['active'].eq(0)])

# Report both tallies
print(f"Number of active rows: {active_count}")
print(f"Number of inactive rows: {inactive_count}")


In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Select the compute device: CUDA when PyTorch detects a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU")

# Query ChEMBL for IC50 activity records against target CHEMBL240, requesting
# only the fields listed below.
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Preview the raw fields. display() is required here: a bare expression is only
# rendered when it is the last statement of a cell.
display(data.head())

# Filter and clean the dataset.
# Only the identifier, structure and potency are mandatory. assay_description and
# activity_comment are optional free text that the TF-IDF step in this cell fills
# with '' when missing, so rows lacking them must not be dropped here.
filtered_data = data[
    ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']
].dropna(subset=['molecule_chembl_id', 'canonical_smiles', 'standard_value'])
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
# Values that could not be parsed as numbers became NaN above; drop those rows.
filtered_data = filtered_data.dropna(subset=['standard_value'])
# Label per the notebook's assumption: IC50 <= 1000 nM is active (1), otherwise inactive (0).
filtered_data['active'] = (filtered_data['standard_value'] <= 1000).astype(int)

# Keep one row per molecule: drop_duplicates retains the first row seen for each
# 'molecule_chembl_id' and discards any further measurements of that molecule.
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit.
# One list per output column; each must end up with exactly one entry per row.
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Compute the descriptors and a 2048-bit Morgan fingerprint (radius 2) per molecule.
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector
    else:
        # Unparseable SMILES: append placeholders so every list keeps one entry
        # per row. Without them the lists are shorter than the DataFrame and the
        # column assignments below raise a length-mismatch ValueError.
        h_bond_donors.append(np.nan)
        h_bond_acceptors.append(np.nan)
        molecular_weights.append(np.nan)
        num_rings.append(np.nan)
        tpsa.append(np.nan)
        rotatable_bonds.append(np.nan)
        fingerprints.append(None)

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Rows whose SMILES failed to parse carry None in 'fingerprints'; remove them.
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF.
# Missing text is treated as an empty string; the vocabulary is capped at 500 terms.
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data.
# A vocabulary term that equals an existing column name (for example a term
# 'active' next to the 'active' label column) would create duplicate column
# labels, after which filtered_data['active'] returns a DataFrame instead of a
# Series. Only such colliding terms are prefixed; all other names are unchanged.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=[
        f"tfidf_{term}" if term in filtered_data.columns else term
        for term in vectorizer.get_feature_names_out()
    ],
)
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Show the first rows of the enriched dataset. display() is required because this
# is not the last statement of the cell.
display(filtered_data.head())

# Sanity check: fingerprint a trivial molecule (ethanol, 'CCO') to tell a problem
# in the fingerprint call apart from a problem with the dataset's SMILES.
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if not mol:
    print("Failed to create molecule from SMILES.")
else:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")

# Dump the fingerprints column for debugging
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Tally the rows carrying each activity label (1 = active, 0 = inactive)
active_count = len(filtered_data[filtered_data['active'].eq(1)])
inactive_count = len(filtered_data[filtered_data['active'].eq(0)])

print(f"Number of active rows: {active_count}")
print(f"Number of inactive rows: {inactive_count}")


In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Select the compute device: CUDA when PyTorch detects a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU")

# Query ChEMBL for IC50 activity records against target CHEMBL240, requesting
# only the fields listed below.
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Preview the raw fields. display() is required here: a bare expression is only
# rendered when it is the last statement of a cell.
display(data.head())

# Filter and clean the dataset.
# Only the identifier, structure and potency are mandatory. assay_description and
# activity_comment are optional free text that the TF-IDF step in this cell fills
# with '' when missing, so rows lacking them must not be dropped here.
filtered_data = data[
    ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']
].dropna(subset=['molecule_chembl_id', 'canonical_smiles', 'standard_value'])
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
# Values that could not be parsed as numbers became NaN above; drop those rows.
filtered_data = filtered_data.dropna(subset=['standard_value'])
# Label per the notebook's assumption: IC50 <= 1000 nM is active (1), otherwise inactive (0).
filtered_data['active'] = (filtered_data['standard_value'] <= 1000).astype(int)

# Keep one row per molecule: drop_duplicates retains the first row seen for each
# 'molecule_chembl_id' and discards any further measurements of that molecule.
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit.
# One list per output column; each must end up with exactly one entry per row.
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Compute the descriptors and a 2048-bit Morgan fingerprint (radius 2) per molecule.
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector
    else:
        # Unparseable SMILES: append placeholders so every list keeps one entry
        # per row. Without them the lists are shorter than the DataFrame and the
        # column assignments below raise a length-mismatch ValueError.
        h_bond_donors.append(np.nan)
        h_bond_acceptors.append(np.nan)
        molecular_weights.append(np.nan)
        num_rings.append(np.nan)
        tpsa.append(np.nan)
        rotatable_bonds.append(np.nan)
        fingerprints.append(None)

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Rows whose SMILES failed to parse carry None in 'fingerprints'; remove them.
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove duplicate rows, comparing every column except 'fingerprints'. That
# column holds Python lists, which are unhashable, so including it makes
# drop_duplicates raise "TypeError: unhashable type: 'list'".
filtered_data = filtered_data.drop_duplicates(
    subset=filtered_data.columns.difference(['fingerprints']).tolist()
).reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF.
# Missing text is treated as an empty string; the vocabulary is capped at 500 terms.
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data.
# A vocabulary term that equals an existing column name (for example a term
# 'active' next to the 'active' label column) would create duplicate column
# labels, after which filtered_data['active'] returns a DataFrame instead of a
# Series. Only such colliding terms are prefixed; all other names are unchanged.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=[
        f"tfidf_{term}" if term in filtered_data.columns else term
        for term in vectorizer.get_feature_names_out()
    ],
)
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Show the first rows of the enriched dataset. display() is required because this
# is not the last statement of the cell.
display(filtered_data.head())

# Sanity check: fingerprint a trivial molecule (ethanol, 'CCO') to tell a problem
# in the fingerprint call apart from a problem with the dataset's SMILES.
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if not mol:
    print("Failed to create molecule from SMILES.")
else:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")

# Dump the fingerprints column for debugging
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Select the compute device: CUDA when PyTorch detects a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU")

# Query ChEMBL for IC50 activity records against target CHEMBL240, requesting
# only the fields listed below.
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Preview the raw fields. display() is required here: a bare expression is only
# rendered when it is the last statement of a cell.
display(data.head())

# Filter and clean the dataset.
# Only the identifier, structure and potency are mandatory. assay_description and
# activity_comment are optional free text that the TF-IDF step in this cell fills
# with '' when missing, so rows lacking them must not be dropped here.
filtered_data = data[
    ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']
].dropna(subset=['molecule_chembl_id', 'canonical_smiles', 'standard_value'])
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
# Values that could not be parsed as numbers became NaN above; drop those rows.
filtered_data = filtered_data.dropna(subset=['standard_value'])
# Label per the notebook's assumption: IC50 <= 1000 nM is active (1), otherwise inactive (0).
filtered_data['active'] = (filtered_data['standard_value'] <= 1000).astype(int)

# Keep one row per molecule: drop_duplicates retains the first row seen for each
# 'molecule_chembl_id' and discards any further measurements of that molecule.
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit.
# One list per output column; each must end up with exactly one entry per row.
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Compute the descriptors and a 2048-bit Morgan fingerprint (radius 2) per molecule.
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector
    else:
        # Unparseable SMILES: append placeholders so every list keeps one entry
        # per row. Without them the lists are shorter than the DataFrame and the
        # column assignments below raise a length-mismatch ValueError.
        h_bond_donors.append(np.nan)
        h_bond_acceptors.append(np.nan)
        molecular_weights.append(np.nan)
        num_rings.append(np.nan)
        tpsa.append(np.nan)
        rotatable_bonds.append(np.nan)
        fingerprints.append(None)

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Rows whose SMILES failed to parse carry None in 'fingerprints'; remove them.
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove duplicate rows, comparing every column except 'fingerprints' (that
# column holds Python lists, which drop_duplicates cannot hash).
filtered_data = filtered_data.drop_duplicates(subset=filtered_data.columns.difference(['fingerprints']).tolist()).reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF.
# Missing text is treated as an empty string; the vocabulary is capped at 500 terms.
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data.
# A vocabulary term that equals an existing column name (for example a term
# 'active' next to the 'active' label column) would create duplicate column
# labels, after which filtered_data['active'] returns a DataFrame instead of a
# Series. Only such colliding terms are prefixed; all other names are unchanged.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=[
        f"tfidf_{term}" if term in filtered_data.columns else term
        for term in vectorizer.get_feature_names_out()
    ],
)
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Show the first rows of the enriched dataset. display() is required because this
# is not the last statement of the cell.
display(filtered_data.head())

# Sanity check: fingerprint a trivial molecule (ethanol, 'CCO') to tell a problem
# in the fingerprint call apart from a problem with the dataset's SMILES.
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if not mol:
    print("Failed to create molecule from SMILES.")
else:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")

# Dump the fingerprints column for debugging
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Count fingerprint entries that are unusable: anything that is not a list, or a
# list with no elements. Also count missing values per column.
empty_fingerprints_count = sum(
    1
    for entry in filtered_data['fingerprints']
    if not isinstance(entry, list) or len(entry) == 0
)
nan_count = filtered_data.isna().sum()

# Report both checks
print(f"Number of rows with empty fingerprints: {empty_fingerprints_count}")
print("\nNumber of NaN values in each column:")
print(nan_count)


In [None]:
# Tally the rows carrying each activity label (1 = active, 0 = inactive)
active_count = len(filtered_data[filtered_data['active'].eq(1)])
inactive_count = len(filtered_data[filtered_data['active'].eq(0)])

print(f"Number of active rows: {active_count}")
print(f"Number of inactive rows: {inactive_count}")


In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Select the compute device: CUDA when PyTorch detects a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU")

# Query ChEMBL for IC50 activity records against target CHEMBL240, requesting
# only the fields listed below.
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Preview the raw fields. display() is required here: a bare expression is only
# rendered when it is the last statement of a cell.
display(data.head())

# Filter and clean the dataset.
# Only the identifier, structure and potency are mandatory. assay_description and
# activity_comment are optional free text that the TF-IDF step in this cell fills
# with '' when missing, so rows lacking them must not be dropped here.
filtered_data = data[
    ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']
].dropna(subset=['molecule_chembl_id', 'canonical_smiles', 'standard_value'])
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
# Values that could not be parsed as numbers became NaN above; drop those rows.
filtered_data = filtered_data.dropna(subset=['standard_value'])
# Label per the notebook's assumption: IC50 <= 1000 nM is active (1), otherwise inactive (0).
filtered_data['active'] = (filtered_data['standard_value'] <= 1000).astype(int)

# Keep one row per molecule: drop_duplicates retains the first row seen for each
# 'molecule_chembl_id' and discards any further measurements of that molecule.
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit.
# One list per output column; each must end up with exactly one entry per row.
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Compute the descriptors and a 2048-bit Morgan fingerprint (radius 2) per molecule.
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector
    else:
        # Unparseable SMILES: append placeholders so every list keeps one entry
        # per row. Without them the lists are shorter than the DataFrame and the
        # column assignments below raise a length-mismatch ValueError.
        h_bond_donors.append(np.nan)
        h_bond_acceptors.append(np.nan)
        molecular_weights.append(np.nan)
        num_rings.append(np.nan)
        tpsa.append(np.nan)
        rotatable_bonds.append(np.nan)
        fingerprints.append(None)

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Rows whose SMILES failed to parse carry None in 'fingerprints'; remove them.
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove duplicate rows, comparing every column except 'fingerprints' (that
# column holds Python lists, which drop_duplicates cannot hash).
filtered_data = filtered_data.drop_duplicates(subset=filtered_data.columns.difference(['fingerprints']).tolist()).reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF.
# Missing text is treated as an empty string; the vocabulary is capped at 500 terms.
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data.
# A vocabulary term that equals an existing column name (for example a term
# 'active' next to the 'active' label column) would create duplicate column
# labels, after which filtered_data['active'] returns a DataFrame instead of a
# Series. Only such colliding terms are prefixed; all other names are unchanged.
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    columns=[
        f"tfidf_{term}" if term in filtered_data.columns else term
        for term in vectorizer.get_feature_names_out()
    ],
)
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Show the first rows of the enriched dataset. display() is required because this
# is not the last statement of the cell.
display(filtered_data.head())

# Sanity check: fingerprint a trivial molecule (ethanol, 'CCO') to tell a problem
# in the fingerprint call apart from a problem with the dataset's SMILES.
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if not mol:
    print("Failed to create molecule from SMILES.")
else:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")

# Dump the fingerprints column for debugging
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Re-coerce 'standard_value' to a numeric dtype; entries that cannot be parsed
# become NaN (errors='coerce').
# NOTE(review): the preceding cell already applies pd.to_numeric to this column,
# so this looks like a defensive repeat — confirm whether it is still needed.
filtered_data.loc[:, 'standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')


In [None]:
# Tally the rows carrying each activity label (1 = active, 0 = inactive)
active_count = len(filtered_data[filtered_data['active'].eq(1)])
inactive_count = len(filtered_data[filtered_data['active'].eq(0)])

# Report both tallies
print(f"Number of active rows: {active_count}")
print(f"Number of inactive rows: {inactive_count}")


In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Verify that the GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("GPU not available, using CPU")

# Query ChEMBL for data with additional fields
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Display the first few rows to inspect the data fields
data.head()

# Filter and clean the dataset
# We want to keep only the rows with a defined 'standard_value' and add an 'active' column
# Assuming IC50 <= 1000 nM indicates activity
filtered_data = data[['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']].dropna()
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
filtered_data = filtered_data.dropna()
filtered_data['active'] = filtered_data['standard_value'].apply(lambda x: 1 if x <= 1000 else 0)

# Remove duplicate rows based on 'molecule_chembl_id'
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Use RDKit's rdMolDescriptors for calculating Morgan fingerprints
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        # Calculate Morgan fingerprint using rdMolDescriptors.GetMorganFingerprintAsBitVect
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Remove rows with problematic (NaN) fingerprints
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove duplicate rows based on 'molecule_chembl_id' to ensure no duplicate labels
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Display the first few rows of the enriched dataset
filtered_data.head()

# Minimal test to isolate the Morgan fingerprint issue
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if mol:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")
else:
    print("Failed to create molecule from SMILES.")

# Print out the fingerprints column to debug issues
print("\nFingerprints column:")
print(filtered_data['fingerprints'])


In [None]:
# Tally how many rows carry each activity label (1 = active, 0 = inactive)
is_active = filtered_data['active'] == 1
is_inactive = filtered_data['active'] == 0
active_count = len(filtered_data[is_active])
inactive_count = len(filtered_data[is_inactive])

# Report both tallies
print(f"Number of active rows: {active_count}")
print(f"Number of inactive rows: {inactive_count}")


In [None]:
# Remove exact duplicate rows based on all columns.
# Cells of the 'fingerprints' column are Python lists, which are unhashable, so a
# plain drop_duplicates() raises "TypeError: unhashable type: 'list'". Duplicates are
# therefore detected on a comparison copy in which list cells are turned into tuples;
# the rows that are kept still come from the original frame and keep their lists.
def _as_hashable(value):
    """Return list values as tuples so pandas can hash them; leave anything else as is."""
    return tuple(value) if isinstance(value, list) else value

# Only object-dtype columns can hold lists; numeric columns are passed through unchanged
comparison_frame = filtered_data.apply(
    lambda column: column.map(_as_hashable) if column.dtype == object else column
)
is_repeat = comparison_frame.duplicated().to_numpy()  # True for every repeat after the first
filtered_data = filtered_data.loc[~is_repeat].reset_index(drop=True)


In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Verify that the GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("GPU not available, using CPU")

# Query ChEMBL for data with additional fields
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Display the first few rows to inspect the data fields
data.head()

# Filter and clean the dataset
# We want to keep only the rows with a defined 'standard_value' and add an 'active' column
# Assuming IC50 <= 1000 nM indicates activity
filtered_data = data[['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']].dropna()
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
filtered_data = filtered_data.dropna()
filtered_data['active'] = filtered_data['standard_value'].apply(lambda x: 1 if x <= 1000 else 0)

# Remove duplicate rows based on 'molecule_chembl_id'
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Use RDKit's rdMolDescriptors for calculating Morgan fingerprints
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        # Calculate Morgan fingerprint using rdMolDescriptors.GetMorganFingerprintAsBitVect
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Remove rows with problematic (NaN) fingerprints
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove exact duplicate rows
filtered_data = filtered_data.drop_duplicates().reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Display the first few rows of the enriched dataset
filtered_data.head()

# Minimal test to isolate the Morgan fingerprint issue
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if mol:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")
else:
    print("Failed to create molecule from SMILES.")

# Print out the fingerprints column to debug issues
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Verify that the GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("GPU not available, using CPU")

# Query ChEMBL for data with additional fields
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Display the first few rows to inspect the data fields
data.head()

# Filter and clean the dataset
# We want to keep only the rows with a defined 'standard_value' and add an 'active' column
# Assuming IC50 <= 1000 nM indicates activity
filtered_data = data[['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']].dropna()
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
filtered_data = filtered_data.dropna()
filtered_data['active'] = filtered_data['standard_value'].apply(lambda x: 1 if x <= 1000 else 0)

# Remove duplicate rows based on 'molecule_chembl_id'
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Use RDKit's rdMolDescriptors for calculating Morgan fingerprints
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        # Calculate Morgan fingerprint using rdMolDescriptors.GetMorganFingerprintAsBitVect
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Remove rows with problematic (NaN) fingerprints
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove rows with duplicate fingerprints
filtered_data = filtered_data[~filtered_data['fingerprints'].duplicated(keep=False)].reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Display the first few rows of the enriched dataset
filtered_data.head()

# Minimal test to isolate the Morgan fingerprint issue
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if mol:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")
else:
    print("Failed to create molecule from SMILES.")

# Print out the fingerprints column to debug issues
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

In [None]:
# Step 1: Set up the Colab environment

# Enable GPU (A100 should already be selected if you've chosen "GPU" in the Runtime)
# You can double-check by running the following command:
!nvidia-smi

# Install necessary Python libraries
!pip install pandas numpy scikit-learn matplotlib seaborn torch torch-geometric chembl-webresource-client rdkit
!pip install scikit-learn

# Import the required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, rdMolDescriptors

# Verify that the GPU is available
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("GPU not available, using CPU")

# Query ChEMBL for data with additional fields
chembl = new_client.activity
activities = chembl.filter(target_chembl_id='CHEMBL240', standard_type='IC50').only(
    'molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_chembl_id', 'assay_description', 'activity_comment',
    'pchembl_value', 'standard_relation', 'activity_type', 'molecule_properties', 'compound_name'
)

# Convert the retrieved data into a Pandas DataFrame
data = pd.DataFrame(activities)

# Display the first few rows to inspect the data fields
data.head()

# Filter and clean the dataset
# We want to keep only the rows with a defined 'standard_value' and add an 'active' column
# Assuming IC50 <= 1000 nM indicates activity
filtered_data = data[['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'assay_description', 'activity_comment']].dropna()
filtered_data['standard_value'] = pd.to_numeric(filtered_data['standard_value'], errors='coerce')
filtered_data = filtered_data.dropna()
filtered_data['active'] = filtered_data['standard_value'].apply(lambda x: 1 if x <= 1000 else 0)

# Remove duplicate rows based on 'molecule_chembl_id'
filtered_data = filtered_data.drop_duplicates(subset='molecule_chembl_id').reset_index(drop=True)

# Stepwise calculation of molecular descriptors using RDKit
h_bond_donors = []
h_bond_acceptors = []
molecular_weights = []
num_rings = []
tpsa = []
rotatable_bonds = []
fingerprints = []

# Use RDKit's rdMolDescriptors for calculating Morgan fingerprints
for i, smiles in enumerate(filtered_data['canonical_smiles']):
    if i % 10 == 0:
        print(f"Processing molecule {i}/{len(filtered_data['canonical_smiles'])}")
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        h_bond_donors.append(Descriptors.NumHDonors(mol))
        h_bond_acceptors.append(Descriptors.NumHAcceptors(mol))
        molecular_weights.append(Descriptors.MolWt(mol))
        num_rings.append(Descriptors.RingCount(mol))
        tpsa.append(Descriptors.TPSA(mol))
        rotatable_bonds.append(Descriptors.NumRotatableBonds(mol))
        # Calculate Morgan fingerprint using rdMolDescriptors.GetMorganFingerprintAsBitVect
        fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        fingerprints.append(list(fp))  # Store the entire bit vector

# Add the calculated descriptors to the filtered DataFrame
filtered_data['h_bond_donors'] = h_bond_donors
filtered_data['h_bond_acceptors'] = h_bond_acceptors
filtered_data['molecular_weight'] = molecular_weights
filtered_data['num_rings'] = num_rings
filtered_data['tpsa'] = tpsa
filtered_data['rotatable_bonds'] = rotatable_bonds
filtered_data['fingerprints'] = fingerprints

# Remove rows with problematic (NaN) fingerprints
filtered_data = filtered_data.dropna(subset=['fingerprints']).reset_index(drop=True)

# Remove rows with duplicate fingerprints
filtered_data = filtered_data[~filtered_data['fingerprints'].duplicated(keep=False)].reset_index(drop=True)

# Convert assay_description and activity_comment to numerical features using TF-IDF
text_data = filtered_data['assay_description'].fillna('') + ' ' + filtered_data['activity_comment'].fillna('')
vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = vectorizer.fit_transform(text_data)

# Convert TF-IDF features to a DataFrame and add to filtered_data
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=vectorizer.get_feature_names_out())
filtered_data = pd.concat([filtered_data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# Display the first few rows of the enriched dataset
filtered_data.head()

# Minimal test to isolate the Morgan fingerprint issue
print("\nTesting Morgan fingerprint calculation with a simple molecule:")
smiles = 'CCO'
mol = Chem.MolFromSmiles(smiles)
if mol:
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    print("Morgan fingerprint calculated successfully for test molecule.")
else:
    print("Failed to create molecule from SMILES.")

# Print out the fingerprints column to debug issues
print("\nFingerprints column:")
print(filtered_data['fingerprints'])

# Count the number of active and inactive rows
active_count = filtered_data[filtered_data['active'] == 1].shape[0]
inactive_count = filtered_data[filtered_data['active'] == 0].shape[0]

print(f"Number of active entries: {active_count}")
print(f"Number of inactive entries: {inactive_count}")