## Data curation

### 1. Import libraries

In [None]:
# Import necessary libraries
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
import numpy as np
from itertools import combinations
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import seaborn as sns

### 2. Load and review the dataset

In [None]:
# Read the raw bioactivity table from data.csv in the working directory.
# NOTE(review): the default comma delimiter is assumed; if the export is
# ';'-delimited, pass sep=';' — confirm against the file.
df = pd.read_csv('data.csv')
df.shape  # (rows, columns) of the raw table

### 3. Remove duplicates based on SMILES

In [None]:
# Remove duplicate structures, keeping one row per SMILES string.
# With a plain keep='first', a duplicate that has no pChEMBL Value could be
# retained while a later duplicate that does have one is discarded; the
# missing-value filter that follows would then remove the compound entirely.
# A stable sort on "activity is missing" moves measured rows to the front of
# each duplicate group without otherwise changing their relative order.
df = (
    df.sort_values(by='pChEMBL Value', key=lambda col: col.isna(), kind='stable')
      .drop_duplicates(subset=['Smiles'], keep='first')
      .sort_index()  # back to the original row order (index comes from read_csv)
)
df.shape

### 4. Handling of missing data

In [None]:
# Keep only the records that report a pChEMBL Value.
has_activity = df['pChEMBL Value'].notna()
df = df[has_activity]
df.shape

### 5. Outlier analysis

In [None]:
# Quartiles and interquartile range of the activity column
activity = df['pChEMBL Value']
Q1 = activity.quantile(0.25)
Q3 = activity.quantile(0.75)
IQR = Q3 - Q1

# Fences placed 1.5 * IQR beyond the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows whose activity lies within the fences (both ends inclusive)
df = df[activity.between(lower_bound, upper_bound)]
df.shape

### 6. Retain specific columns and rename them

In [None]:
# Source column -> short name, in the order the columns should appear
column_names = {
    'Molecule ChEMBL ID': 'Molecule',
    'Smiles': 'SMILES',
    'pChEMBL Value': 'pEC50',
}

# Keep just those three columns and apply the short names
df = df[list(column_names)].rename(columns=column_names)

# Write the curated table to disk without the index column
cleaned_file_path = '/rdkit/cleaned_data.csv'  # specify your desired path
df.to_csv(cleaned_file_path, index=False)

In [None]:
# Preview the first rows of the curated table (Molecule, SMILES, pEC50).
df.head()

### 7. Distribution of pEC50 values

In [None]:
# White-grid seaborn theme for the figure
sns.set(style='whitegrid')

# Kernel density estimate of the pEC50 values on a 10 x 6 inch canvas
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(df['pEC50'], fill=True, color='blue', alpha=0.2, ax=ax)

# Title, axis labels and grid
ax.set_title('Probability Distribution of pEC50 Values', fontsize=16)
ax.set_xlabel('pEC50', fontsize=14)
ax.set_ylabel('Density', fontsize=14)
ax.grid(True)

# Render the figure
plt.show()

### 8. Extracting the SMILES

In [None]:
# Take the SMILES column on its own and write it to molecule.smi,
# one string per line with no header and no index.
df2 =  df['SMILES']
df2.to_csv('molecule.smi', index=False, header=False)
df2.head()

### 9. Calculating RDKit descriptors

In [None]:
# SMILES strings as an array, used as input to the descriptor calculation.
# NOTE(review): this rebinds `sm`, which the import cell uses as the alias for
# statsmodels.api. The stepwise-regression cell re-imports that alias, so
# re-running this cell afterwards (or the reverse) changes what `sm` refers to.
sm = df['SMILES'].values
sm.shape

In [None]:
def calculate_descriptors(smiles_list):
    """Compute the RDKit descriptor set for each SMILES string.

    Parameters
    ----------
    smiles_list : iterable
        SMILES strings. Entries that are not strings (for example NaN from an
        empty cell) or that RDKit cannot parse produce a row of NaN, so the
        output stays aligned row-for-row with the input.

    Returns
    -------
    pandas.DataFrame
        One row per input entry, in input order, with one column per
        descriptor name listed in ``Descriptors._descList``.
    """
    descriptor_names = [desc[0] for desc in Descriptors._descList]

    # CalcMolDescriptors returns a {name: value} dict. Failed molecules are
    # represented as a dict too, because a list that mixes dicts and plain
    # lists is not a valid input for the DataFrame constructor.
    missing_row = dict.fromkeys(descriptor_names, np.nan)

    rows = []
    for smiles in smiles_list:
        # Only hand real strings to the parser; anything else counts as unparsable.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            rows.append(missing_row)
        else:
            rows.append(Descriptors.CalcMolDescriptors(mol))

    return pd.DataFrame(rows, columns=descriptor_names)

# Compute the descriptor table for every SMILES string held in `sm`
descriptors_df = calculate_descriptors(sm)

# Persist the table; the row index is not written to the file
descriptors_df.to_csv("/rdkit/descriptors.csv", index=False)

### 10. Concatenate descriptor data with molecules and pEC50

In [None]:
# Combine identifiers, descriptors and activity into one modelling table.
# `df` must still hold the 'Molecule', 'SMILES' and 'pEC50' columns.

# Reload the descriptor table from the path it was written to. The file was
# saved with index=False, so its first column is a real descriptor and no
# column is skipped here.
descriptors_df = pd.read_csv('/rdkit/descriptors.csv')

# The concatenation below is purely positional, so both tables need the same
# number of rows; otherwise pandas would silently pad the shorter one with NaN.
if len(descriptors_df) != len(df):
    raise ValueError(
        f"Row count mismatch: {len(df)} molecules but "
        f"{len(descriptors_df)} descriptor rows."
    )

# Reset the index of `df` so that rows line up by position with the freshly
# loaded descriptor table: identifiers first, descriptors next, activity last.
curated = df.reset_index(drop=True)
result_df = pd.concat(
    [curated[['Molecule', 'SMILES']], descriptors_df, curated[['pEC50']]],
    axis=1,
)

# Save the final DataFrame to a new CSV file
result_df.to_csv('/rdkit/final_descriptors.csv', index=False)
print("The new file 'final_descriptors.csv' has been created with the required columns.")

In [None]:
# (rows, columns) of the combined table
result_df.shape

In [None]:
# Preview the first rows of the combined table
result_df.head()

### 11. Descriptor reduction

In [None]:
# Reload the combined table that was saved to /rdkit/final_descriptors.csv
data = pd.read_csv("/rdkit/final_descriptors.csv")
data.shape  # (rows, columns) before descriptor reduction

In [None]:
# Layout of `data`: two identifier columns (Molecule, SMILES), then the
# descriptors, then the activity as the last column. Descriptor columns are
# therefore always addressed as [2:-1].

# Remove rows whose descriptors are all missing (a molecule that could not be
# parsed yields such a row). Left in place, a single row like that would make
# the column-wise dropna below delete every descriptor.
descriptor_cols = data.columns[2:-1]
if len(descriptor_cols) > 0:
    data = data.dropna(axis=0, how='all', subset=descriptor_cols)

# Drop columns with any missing values
data = data.dropna(axis=1)

# Step 1: Remove virtually constant descriptors, i.e. those whose most
# frequent value accounts for more than 95% of the rows. value_counts sorts
# by frequency, so the first entry is the share of the most common value.
constant_columns = [
    col for col in data.columns[2:-1]
    if data[col].value_counts(normalize=True).iloc[0] > 0.95
]
data = data.drop(columns=constant_columns)

# Step 2: Remove highly correlated columns (|r| > 0.90)
# Absolute correlation matrix of the remaining descriptors
corr_matrix = data.iloc[:, 2:-1].corr().abs()

# Keep only the strict upper triangle so each pair is examined once and the
# diagonal (self-correlation) is ignored
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A descriptor is dropped when it correlates above 0.90 with any descriptor
# that precedes it in column order
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.90).any()]
data = data.drop(columns=to_drop)

# Display the results
print(data.shape)

In [None]:
# Preview the table after descriptor reduction
data.head()

### 12. Feature Selection via RFE

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Predictors are the columns between the two identifier columns and the last
# column; the last column is the target.
X = data.iloc[:, 2:-1]
y = data.iloc[:, -1]

# Recursive feature elimination around a linear regression, keeping 8
# descriptors (change n_features_to_select for a different subset size).
# NOTE(review): the descriptors are passed in unscaled — confirm that ranking
# on raw regression coefficients is intended.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=8)
rfe.fit(X, y)

# Names of the descriptors that RFE retained
selected_features = X.columns[rfe.get_support()]
print("Selected Features:", selected_features)

### 13. Feature selection via stepwise regression

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

# Build the inputs from the reduced table `data` already in memory

X = data.iloc[:, 2:-1]  # Descriptors: everything between the first two columns and the last
y = data.iloc[:, -1]  # Target: last column of `data`

def stepwise_selection(X, y, threshold_in=0.05, threshold_out=0.05, max_features=8):
    """Select predictors by forward/backward stepwise OLS on p-values.

    Each pass first tries to add the excluded column with the smallest
    p-value (if below ``threshold_in``), then tries to remove the included
    column with the largest p-value (if above ``threshold_out``). The loop
    ends when ``max_features`` columns are included or a pass changes nothing.
    The summary of an OLS fit on the final subset is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns.
    y : array-like
        Response values, one per row of ``X``.
    threshold_in : float
        A candidate is added only if its p-value is below this value.
    threshold_out : float
        An included column is removed if its p-value is above this value.
    max_features : int
        Upper limit on the number of selected columns.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    """
    # Imported locally so the function does not depend on the notebook-level
    # name `sm`, which another cell rebinds to an array of SMILES strings.
    import statsmodels.api as sm

    included = []
    while len(included) < max_features:
        changed = False

        # Forward step: p-value of each excluded column when added to the model.
        # Candidates keep X's column order so ties are broken reproducibly.
        excluded = [col for col in X.columns if col not in included]
        # Explicit float dtype: without it the Series has object dtype on
        # recent pandas, which the min/idxmin reductions below do not expect.
        new_pval = pd.Series(index=excluded, dtype=float)

        for col in excluded:
            model = sm.OLS(y, add_constant(X[included + [col]])).fit()
            new_pval[col] = model.pvalues[col]

        # Add the best candidate if it clears threshold_in. With nothing left
        # to try, min() is NaN and the comparison is False.
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True

        # Stop as soon as the limit is reached; no backward step follows.
        if len(included) >= max_features:
            break

        # Backward step: refit on the current subset and drop the weakest
        # column if its p-value exceeds threshold_out.
        model = sm.OLS(y, add_constant(X[included])).fit()
        pvalues = model.pvalues.iloc[1:]  # Skip intercept
        worst_pval = pvalues.max()

        if worst_pval > threshold_out:
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            changed = True

        # Break if no changes occurred (i.e., stable subset reached)
        if not changed:
            break

    # Final model to print the selected features' details
    final_model = sm.OLS(y, add_constant(X[included])).fit()
    print(final_model.summary())
    return included

# Run stepwise selection with at most 8 features.
# This overwrites `selected_features` from the RFE cell.
selected_features = stepwise_selection(X, y, max_features=8)
print("Selected Features:", selected_features)

### 14. Building the multiple linear regression model

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error


# Descriptors used as predictors, and the activity column as the target
feature_names = [
    'qed', 'FpDensityMorgan1', 'FpDensityMorgan2', 'BCUT2D_MWLOW',
    'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BalabanJ',
    'NumAliphaticHeterocycles',
]
X = data[feature_names]
y = data["pEC50"]

# Quantile-bin the target (up to 5 bins) so the split can be stratified on it
y_binned = pd.qcut(y, q=5, duplicates='drop')

# 80% train / 20% test, stratified on the binned target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y_binned,
)

# Fit the multiple linear regression on the training portion
model = LinearRegression()
model.fit(X_train, y_train)

# Score the held-out portion
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Report the metrics
print("Model Performance:")
print(f"R^2 Score: {r2}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the regression model on the training split.
# No `scoring` argument is given, so the estimator's default score is reported.
scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean performance:", scores.mean())

### 15. Save the dataset for further analysis

In [None]:
# The selected feature columns
# NOTE(review): this list differs from the descriptors used in the regression
# cell — confirm which set is meant to be exported.
selected_features = ['PEOE_VSA12', 'fr_morpholine']

# Identifier first, activity last. The activity column is named 'pEC50' in
# `data` (it was renamed during curation); selecting 'pIC50' raises KeyError.
selected_columns = ['Molecule'] + selected_features + ['pEC50']

# Create a new DataFrame with the selected columns
selected_data = data[selected_columns]

# Save the new DataFrame to a CSV file
selected_data.to_csv("/rdkit/selected_features.csv", index=False)

print("Selected data saved to 'selected_features.csv'.")