In [None]:
pip install chembl_webresource_client mordred numpy pandas scikit-learn matplotlib rdkit

In [None]:
import numpy as np
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import sys

In [None]:
# Mount Google Drive at /content/drive so the cells below can read from and
# write to it (this step only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

**Step 1: Read the solubility dataset and check columns in a table**

In [None]:
# Path to the Excel file with the solubility data on the mounted Google Drive
file_path = '/content/drive/MyDrive/solubility/solubility dataset.xls'
# Read the Excel file, skipping the first 9 rows of the sheet.
# NOTE(review): presumably those rows are a preamble above the header row — confirm against the file.
df_sol = pd.read_excel(file_path, skiprows=9)

In [None]:
# A bare last expression gets the notebook's rich table rendering;
# print() would fall back to plain text.
df_sol.head()  # Display the first few rows of the DataFrame

In [None]:
# Number of missing values per column, shown as the cell output
df_sol.isnull().sum()

In [None]:
# (number of rows, number of columns) of the raw solubility table
df_sol.shape


**Step 2: Calculating descriptors using Mordred**

In [None]:
# RDKit parses the SMILES strings; Mordred computes the molecular descriptors
from rdkit import Chem
from mordred import Calculator, descriptors

# SMILES strings come from the 'SMILES' column of `df_sol`
smiles = df_sol['SMILES']

# Convert SMILES strings to RDKit Molecule objects.
# Chem.MolFromSmiles returns None (rather than raising) when a string cannot be
# parsed, and non-string cells (e.g. NaN) are not valid input at all, so both
# cases are collected here and reported together.
mols = []
invalid_smiles = []
for row_label, smi in smiles.items():
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        invalid_smiles.append((row_label, smi))
    mols.append(mol)

# Fail fast with a readable message. The descriptor table is later joined
# column-wise with `df_sol`, so its rows must stay one-to-one with the input;
# silently dropping molecules here would misalign that join.
if invalid_smiles:
    raise ValueError(
        f"{len(invalid_smiles)} SMILES could not be parsed "
        f"(row label, value; first 10 shown): {invalid_smiles[:10]}"
    )

# Initialize the Mordred descriptor calculator, ignoring 3D descriptors
calc = Calculator(descriptors, ignore_3D=True)

# Calculate the descriptors and store them in a DataFrame
df_descriptor = calc.pandas(mols)

# Cast every descriptor column to float, then drop the columns that are NaN
# for every molecule
df_descriptor = df_descriptor.astype(float).dropna(axis=1, how='all')

# Show the size and the first rows of the result instead of the whole table
print(df_descriptor.shape)
df_descriptor.head()


Resetting the index ensures that each DataFrame's index is a simple range starting from 0, so that the two tables line up row by row when they are combined in the next step.

In [None]:
# Give both tables a fresh 0..n-1 index, discarding the old index values
df_sol, df_descriptor = (
    frame.reset_index(drop=True) for frame in (df_sol, df_descriptor)
)

**Step 3: Combine descriptor dataframe with solubility dataset**

In [None]:
# A column-wise concat aligns rows on the index, so a row-count mismatch would
# silently pad the shorter table with NaN instead of failing. Check it explicitly.
if len(df_sol) != len(df_descriptor):
    raise ValueError(
        f"Row count mismatch: df_sol has {len(df_sol)} rows, "
        f"df_descriptor has {len(df_descriptor)} rows"
    )

# Place the descriptor columns to the right of the solubility columns
df_sol_des = pd.concat([df_sol, df_descriptor], axis=1)

Check how many rows and columns the combined dataset has.

In [None]:
# Row count followed by column count of the combined table
print(*df_sol_des.shape)

Take a look at the combined dataset.

In [None]:
# Bare last expression: the notebook renders the combined table as the cell output
df_sol_des

In [None]:
# Drop missing LogS values up front: NaN leaves the histogram's automatic
# range undefined, and would distort the box-plot statistics.
logs_values = df_sol_des['LogS'].dropna()

# Left panel: histogram of LogS; right panel: box plot of the same values
f, axs = plt.subplots(1, 2, figsize=(10, 3))

axs[0].hist(logs_values)
axs[0].set_xlabel('Aqueous solubility (logS)')
axs[0].set_ylabel('count')

axs[1].boxplot(logs_values)
axs[1].set_ylabel('Aqueous solubility (LogS)')

# Keep the right panel's y-label from overlapping the left panel, and end the
# cell with plt.show() so no Text(...) repr is echoed as output
f.tight_layout()
plt.show()

In [None]:
# Destination archive on Google Drive (the target folder must already exist)
file_path = '/content/drive/MyDrive/solubility/sol_des_clean.zip'

# Write a zip archive whose single member is named sol_des_clean.csv
compression_opts = {'method': 'zip', 'archive_name': 'sol_des_clean.csv'}

# Export the combined table as CSV, without the index column, into the archive
df_sol_des.to_csv(
    file_path,
    index=False,
    compression=compression_opts,
)