# This notebook plots the results obtained from a QSAR model

## First, we draw the Williams plot to assess the model's applicability domain

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# Read the training and test tables; both already carry the model predictions
train_data = pd.read_csv("/padel/train_data_pred.csv")
test_data = pd.read_csv("/padel/test_data_pred.csv")

# Raw residuals (experimental - predicted) for each set
for frame in (train_data, test_data):
    frame['residuals'] = frame['pEC50'].sub(frame['pEC50_pred'])

# The standard deviation of the training residuals is the common scale:
# the test residuals are divided by the same training sigma
sigma_train = train_data['residuals'].std()
for frame in (train_data, test_data):
    frame['standardized_residuals'] = frame['residuals'].div(sigma_train)

# Descriptors that form the model matrix. Listed once so the training and
# test matrices are always built from exactly the same columns, in the same order.
descriptor_columns = ['MDEC-33', 'VE1_Dzp', 'ATSC6e', 'minaaN', 'SpMax4_Bhm', 'nAtomLAC', 'VE3_Dzs']

# Model matrices for the training and test sets (used to calculate leverage)
X_train = train_data[descriptor_columns].values
X_test = test_data[descriptor_columns].values

# (X'X)^-1 of the training matrix is shared by both leverage calculations,
# so it is inverted a single time.
xtx_inv = np.linalg.inv(X_train.T @ X_train)

# Leverage h_i = x_i (X'X)^-1 x_i'. This is the diagonal of X (X'X)^-1 X',
# computed row by row so the full (rows x rows) matrix is never materialised.
hat_values_train = np.sum((X_train @ xtx_inv) * X_train, axis=1)
hat_values_test = np.sum((X_test @ xtx_inv) * X_test, axis=1)

# Leverage threshold (h*) = 3 * p / n
n = len(train_data)  # Number of training observations
# NOTE(review): p counts an intercept, but the matrices above contain no
# constant column -- confirm this matches how the model was fitted.
p = X_train.shape[1] + 1  # Number of parameters (descriptors + intercept)
leverage_threshold = 3 * (p / n)

# Stack the training and test rows into a single table for the Williams plot.
# Training rows always come first, so every column is concatenated in that order.
molecule_ids = np.concatenate([train_data['Molecule'], test_data['Molecule']])
set_labels = ['Training'] * len(train_data) + ['Test'] * len(test_data)
leverages = np.concatenate([hat_values_train, hat_values_test])
std_residuals = np.concatenate([train_data['standardized_residuals'],
                                test_data['standardized_residuals']])

combined_data = pd.DataFrame({
    'Molecule': molecule_ids,                 # values of the 'Molecule' column
    'Set': set_labels,                        # 'Training' or 'Test'
    'Leverage': leverages,                    # hat values
    'Standardized_Residuals': std_residuals,  # residuals scaled by the training sigma
})

# Square figure with the seaborn "whitegrid" theme
plt.figure(figsize=(6, 6))
sns.set(style="whitegrid")

# Williams plot: leverage on x, standardized residual on y.
# Each set gets its own colour and marker shape.
set_colors = {'Training': 'blue', 'Test': 'green'}
set_markers = {'Training': 'o', 'Test': 's'}
sns.scatterplot(data=combined_data, x='Leverage', y='Standardized_Residuals',
                hue='Set', style='Set', palette=set_colors, markers=set_markers,
                s=100, linewidth=2.5)

# Annotate every point with its 'Molecule' value, anchored above-left of the marker
for x_pos, y_pos, molecule in zip(combined_data['Leverage'],
                                  combined_data['Standardized_Residuals'],
                                  combined_data['Molecule']):
    plt.text(x_pos, y_pos, str(molecule), fontsize=10, fontweight='bold',
             verticalalignment='bottom', horizontalalignment='right')

# Applicability-domain boundaries: +/-3 standardized residuals and the leverage threshold h*
boundary_style = dict(color='red', linestyle='--', linewidth=2)
for residual_limit in (3, -3):
    plt.axhline(y=residual_limit, **boundary_style)
plt.axvline(x=leverage_threshold, **boundary_style)

# Bold axis labels with extra padding from the axes (the title is left disabled)
axis_label_style = dict(fontsize=14, fontweight='bold', labelpad=15)
plt.xlabel('Leverages', **axis_label_style)
plt.ylabel('Standardized Residuals', **axis_label_style)
#plt.title('Williams Plot: Training and Test Sets', fontsize=16, fontweight='bold', pad=20)

# Bold tick labels on both axes
tick_label_style = dict(fontsize=12, fontweight='bold')
plt.xticks(**tick_label_style)
plt.yticks(**tick_label_style)

# Fit the layout before the legend is placed
plt.tight_layout()

# Legend in the upper-right corner
plt.legend(title='Set', title_fontsize=12, fontsize=10, loc='upper right')

# Write the figure as a 600 dpi JPG, then display it
plt.savefig("/padel/Williams_plot.jpg", format='jpg', dpi=600)
plt.show()

# Scatter plot of experimental and predicted pEC50 values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Open an 8x6 figure and switch seaborn to the "ticks" style
plt.figure(figsize=(8, 6))
sns.set_style("ticks")

# Experimental vs predicted pEC50: training points first, then test points
for frame, point_color, legend_label in (
        (train_data, 'blue', 'Training Set'),
        (test_data, 'green', 'Test Set')):
    plt.scatter(frame['pEC50'], frame['pEC50_pred'], color=point_color,
                label=legend_label, edgecolor='w', s=100)

# Identity reference (predicted == experimental), drawn once per set over
# that set's experimental values
for frame, line_color in ((train_data, 'red'), (test_data, 'orange')):
    plt.plot(frame['pEC50'], frame['pEC50'], color=line_color, linewidth=2)

# Bold axis labels (the title is left disabled)
axis_label_style = dict(fontsize=18, fontweight='bold')
plt.xlabel('Experimental pEC50', **axis_label_style)
plt.ylabel('Predicted pEC50', **axis_label_style)
#plt.title('pIC50 vs Predicted pIC50', fontsize=16, fontweight='bold')

# Legend built from the scatter labels
plt.legend(title_fontsize=12, fontsize=12)

# Bold tick labels on both axes
tick_label_style = dict(fontsize=14, fontweight='bold')
plt.xticks(**tick_label_style)
plt.yticks(**tick_label_style)

# First pass over tick geometry and label sizes
plt.tick_params(axis='both', which='major', length=8, width=2, labelsize=14)
plt.tick_params(axis='both', which='minor', length=4, width=2, labelsize=12)

ax = plt.gca()  # Axes that received the artists above

# Major ticks every 1 unit on both axes
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
# AutoMinorLocator(1) asks for a single subdivision per major interval on x.
# NOTE(review): ax.minorticks_on() further down appears to install its own
# minor locator, which would supersede this one -- confirm the intended spacing.
ax.xaxis.set_minor_locator(ticker.AutoMinorLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1.0))

# Second pass: same lengths and widths, ticks now point inward, and the
# major tick label size is raised to 16
plt.tick_params(axis='both', which='major', direction='in', length=8, width=2, labelsize=16)
plt.tick_params(axis='both', which='minor', direction='in', length=4, width=2)

# Thicker frame around the plot
for border in ax.spines.values():
    border.set_linewidth(3)

# Enable minor ticks on the axes
ax.minorticks_on()

# Fit the layout, write the figure at 600 dpi, then display it
plt.tight_layout()
plt.savefig('/padel/pEC50_vs_predicted_pEC50_with_line.jpg', dpi=600)
plt.show()