In [None]:
%load_ext autoreload 
%autoreload 2

# 1.Load 
## 1.1 Load libraries

In [27]:
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from src.scripts.load_and_save import load_data
from src.utils.exploration_and_clean import explore_column, quick_check_column, clean_metrics

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


## 1.2 Load data
The dataset has already been cleaned and filtered to the family of interest.

In [None]:
# Load the project dataset using the default arguments of load_data.
df = load_data()
# Preview the first rows as a quick sanity check of the loaded columns.
df.head()

# 2. Development - Figures

## 2.1 Family characterisation or "Kinase looking for love"

In this part we will justify the choice of the family, possibly using the family file to show why kinases are very well represented, and introduce them.

In [29]:
from src.utils.embeddings_plots import reduce_family

# Let's reduce the number of targets.
# The original names are backed up once in 'Target Name Detailed' and the reduced
# names are always derived from that backup, so re-running this cell cannot
# overwrite the detailed names with already-reduced ones.
if 'Target Name Detailed' not in df.columns:
    df['Target Name Detailed'] = df['Target Name']
df['Target Name'] = df['Target Name Detailed'].apply(reduce_family)

## 2.2 FEATURES - What is a good match?

### 2.2.1 Metrics 
In this part we will analyse the binding metrics that are available for our selected subset of the data.

In [None]:
import plotly.express as px

# Useful metrics: fraction of missing (NaN) values for each candidate column.
# The missing fraction is what the 'Fraction of Nan' axis label and the
# 'nan_fraction.html' output name describe (the share of available values would
# be 1 minus this number).
candidate_columns = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)','kon (M-1-s-1)', 'koff (s-1)', 'EC50 (nM)', 'pH', 'Temp (C)']
metric_percent = {col: df[col].isna().mean() for col in candidate_columns}

x = list(metric_percent.keys())
y = list(metric_percent.values())

fig = px.bar(x=x, y=y)
fig.update_layout(xaxis_title='', yaxis_title='Fraction of Nan')
fig.show()

fig.write_html('src/data/figures/nan_fraction.html')

kon, koff and EC50 are removed because they are not objective measures and have too little data. \
Ki and Kd are more meaningful because they are not tied to a specific assay, and we have a lot of Ki values. \
pH and Temp are kept for later analysis. \
Kd is removed because it has no values in common with IC50 and too few values to be kept on its own. \


In [31]:
# Binding-metric columns (p-scale names) used by the plots and models below.
metrics = ['pKi', 'pIC50'] 
# Experimental-condition columns kept aside for later analysis.
features = ['pH', 'Temp (C)']

In [32]:
# Convert the raw affinities (in nM) to the p-scale: pX = -log10(X expressed in mol/L).
# Each name in `metrics` ('pKi', 'pIC50') is computed from its raw column
# ('Ki (nM)', 'IC50 (nM)'). Prefixing the p-scale names with another 'p' would
# instead read the p-scale column itself and create 'ppKi' / 'ppIC50'.
for metric in metrics:
    if not metric.startswith('p'):
        raise ValueError(f"Expected a p-scale metric name such as 'pKi', got {metric!r}")
    raw_column = f'{metric[1:]} (nM)'  # 'pKi' -> 'Ki (nM)'
    raw_values = pd.to_numeric(df[raw_column], errors='coerce')
    # log10 is only defined for strictly positive values: mask everything else
    # first so zero, negative and non-numeric entries become NaN without
    # triggering NumPy runtime warnings.
    positive_values = raw_values.where(raw_values > 0)
    df[metric] = -np.log10(positive_values * 1e-9)  # nM -> M, then -log10

In [None]:
# Disabled draft: the whole cell is a single string literal, so none of the code
# inside it is executed. The following cell contains the active version of this
# pKi / pIC50 regression plot.
'''# Even if IC50 can be not maining full due to its dependancy to the essay it is highly correlated to Ki
# can be explain by a uniformity between experiment 
# we can extract meaningfull info about the inhibitory potential from IC50 

# TODO : ASK THE REST OF THE GROUP
df_Ki_IC50 = df[['pKi','pIC50', 'Target Name']].dropna()


print('Pearson Correlation coef:' , df_Ki_IC50[['pKi','pIC50']].corr(method='pearson'))


from  sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(df_Ki_IC50['pIC50'].values.reshape(-1, 1), df_Ki_IC50['pKi'].values.reshape(-1, 1))
df_Ki_IC50['pred_pki'] = reg.predict(df_Ki_IC50['pIC50'].values.reshape(-1, 1))


fig1 = px.line(df_Ki_IC50, x='pIC50', y='pred_pki')
fig1.update_traces(line_color='red')
fig2 =px.scatter(df_Ki_IC50, x='pIC50', y =  'pKi',color='Target Name')
fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.update_layout(title=f'Linear fit, coef :{reg.coef_[0][0].round(4)}')

fig3.show()


'''

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression

# Keep only the rows where both affinities are available. The explicit copy makes
# this an independent frame, so adding 'pred_pki' below does not raise a
# SettingWithCopyWarning.
df_Ki_IC50 = df.dropna(subset=['pKi', 'pIC50']).copy()

# Pearson Correlation
pearson_corr = df_Ki_IC50[['pKi', 'pIC50']].corr(method='pearson').iloc[0, 1]

# Linear Regression of pKi on pIC50 (scikit-learn expects 2-D arrays)
pic50_2d = df_Ki_IC50[['pIC50']].to_numpy()
pki_2d = df_Ki_IC50[['pKi']].to_numpy()
reg = LinearRegression()
reg.fit(pic50_2d, pki_2d)
df_Ki_IC50['pred_pki'] = reg.predict(pic50_2d).ravel()
linear_coef = reg.coef_[0][0].round(4)

# Fitted line (red) and measured points (coloured by target), merged in one figure
fig1 = px.line(df_Ki_IC50, x='pIC50', y='pred_pki')
fig1.update_traces(line_color='red')
fig2 = px.scatter(df_Ki_IC50, x='pIC50', y='pKi', color='Target Name')
fig3 = go.Figure(data=fig1.data + fig2.data)

# Annotation placed near the bottom-right corner of the data range
fig3.add_annotation(
    x=df_Ki_IC50['pIC50'].max()-0.5,
    y=df_Ki_IC50['pKi'].min()+0.5,
    text=f"Pearson Corr: {pearson_corr:.2f}<br>Linear Coef: {linear_coef:.2f}",
    showarrow=False,
    align="right",
    font=dict(size=14, color="black")
)

# Layout updates
fig3.update_layout(
    xaxis_title="pIC50",
    yaxis_title="pKi"
)

# Show the plot and export it as a standalone HTML file
fig3.show()
fig3.write_html('src/data/figures/pKi_pIC50.html')

### 2.2.2 Multivariate analysis with the chemical characterization
We want plots that explain the distribution of each feature.

In [35]:
# Chemical-property columns compared against the binding metrics.
properties = ['Ligand MW', 'logP']

# Sizes of the two lists above.
n_metrics, n_properties = len(metrics), len(properties)

In [None]:
# Sub-frames restricted to the rows whose raw IC50 (resp. Ki) value is present.
df_ic50 = df[df['IC50 (nM)'].notna()]
df_ki = df[df['Ki (nM)'].notna()]

def get_metrics_df(df, metric):
    """Return the rows of ``df`` that have a value for the requested metric(s).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    metric : label or list-like of labels
        Column name, or collection of column names, that must be non-missing.

    Returns
    -------
    pandas.DataFrame
        New frame without the rows where any of the requested columns is NaN.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Always hand a list to `dropna`, so a single column name also works on
    # pandas versions that only accept list-like `subset` values.
    columns = list(metric) if pd.api.types.is_list_like(metric) else [metric]
    return df.dropna(subset=columns)

# Number of rows that have a raw IC50 value.
len(df_ic50)

In [None]:
from src.utils.exploration_and_clean import plot_chemical_property_distributions

# Two colours, presumably one per entry of `properties` (verify against the helper).
properties_colors = ['#EB89B5', '#330C73']
plot_chemical_property_distributions(
    df,
    metrics,
    chemical_properties=properties,
    properties_colors=properties_colors,
    filepath='src/data/figures/chemchar1.html',
    plot_metrics=True,
)


In [None]:
# Functional-group columns of `df` that are summarised in the next cells.
# (The former leading `df.head()` was dropped: only the last expression of a
# cell is displayed, so it had no visible effect.)
functionnal_groups = ["Aliphatic OH", "Aromatic NH", "Ester", "Ether", "Amide", "Ketone", "Benzene Ring"]
df_functionnal_groups = df[functionnal_groups]
df_functionnal_groups.head()

In [39]:
# Column-wise sum over the functional-group columns (one total per group).
counts = df_functionnal_groups.sum()

In [40]:
# One bar per functional group, with the summed value on the y axis; the figure
# is only exported to HTML here, it is not displayed in the notebook.
fig = px.histogram(x=functionnal_groups, y=counts).update_layout(
    xaxis_title='',
    yaxis_title='Count',
)
fig.write_html('src/data/figures/functional_groups.html')

In [None]:
# First metric (metrics[0]) against the first chemical property (properties[0]),
# with marginal histograms on both axes.
metric = metrics[0]
chem_property = properties[0]  # not called `property`, which would shadow the Python builtin
fig = px.scatter(
    get_metrics_df(df, metric),
    x=chem_property,
    y=metric,
    marginal_x="histogram",
    marginal_y="histogram",
    color='Target Name',
)
fig.update_layout(height=600, width=800, showlegend=False)
fig.show()

#fig.write_image('src/data/figures/pKi_MW.png', height=400, scale=1)

In [None]:
# Second metric (metrics[1]) against the first chemical property (properties[0]),
# with marginal histograms on both axes; the legend is kept, hence the wider figure.
metric = metrics[1]
chem_property = properties[0]  # not called `property`, which would shadow the Python builtin
fig = px.scatter(
    get_metrics_df(df, metric),
    x=chem_property,
    y=metric,
    marginal_x="histogram",
    marginal_y="histogram",
    color='Target Name',
)
fig.update_layout(height=600, width=1700)
#fig.add_hline(y=1e3, line_dash="dash",row=1, col=1)
fig.show()

In [None]:
# Total number of rows in the dataset.
len(df)

In [None]:
# First metric (metrics[0]) against the second chemical property (properties[1]),
# with marginal histograms on both axes.
metric = metrics[0]
chem_property = properties[1]  # not called `property`, which would shadow the Python builtin
fig = px.scatter(
    get_metrics_df(df, metric),
    x=chem_property,
    y=metric,
    marginal_x="histogram",
    marginal_y="histogram",
    color='Target Name',
)

fig.update_layout(height=600, width=800, showlegend=False)
fig.show()

#fig.write_image(f'src/data/figures/pKi_{chem_property}.png', height=400, scale=2)

In [None]:
# Second metric (metrics[1]) against the second chemical property (properties[1]),
# with marginal histograms on both axes; the legend is kept, hence the wider figure.
metric = metrics[1]
chem_property = properties[1]  # not called `property`, which would shadow the Python builtin
fig = px.scatter(
    get_metrics_df(df, metric),
    x=chem_property,
    y=metric,
    marginal_x="histogram",
    marginal_y="histogram",
    color='Target Name',
)
fig.update_layout(height=600, width=1700)
fig.show()

In [None]:
# Predict pKi from the chemical properties with Statsmodels

import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler


metric = metrics[0]
features_of_interest = ["NumValenceElectrons", "ExactMolWt", "Chi0", "HeavyAtomCount"]

df_embeddings = load_data('src/data/embeddings_RDKIT_descriptors.csv.zip')
# The left merge keeps every measurement, so ligands without descriptors get NaN
# features. OLS rejects NaN inputs, hence rows with a missing target or a missing
# descriptor are both dropped.
df_predict = get_metrics_df(
    df.merge(df_embeddings, on='Ligand SMILES', how='left'), metric
).dropna(subset=features_of_interest)
X = df_predict[features_of_interest]
# Standardise the features and keep them in a DataFrame, so the model summary
# reports the descriptor names instead of x1, x2, ...
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features_of_interest, index=X.index)
X_scaled = sm.add_constant(X_scaled)  # intercept column 'const' is placed first
y = df_predict[metric]

model = sm.OLS(y, X_scaled).fit()
predictions = model.predict(X_scaled)

model.summary()
    

In [None]:
# Display the target column (the currently selected metric) of the modelling frame.
df_predict[metric]

In [None]:
# Now verify the Mutual information between the chemical properties and the metrics
from sklearn.feature_selection import mutual_info_regression

# The estimator adds random noise internally: fixing the seed makes the table
# reproducible. The first score belongs to the constant column of X_scaled and
# is discarded.
mutual_information = mutual_info_regression(X_scaled, y, random_state=0)[1:]
df_MI = pd.DataFrame(np.round(mutual_information, 3), index=features_of_interest, columns=['Mutual Information'])
df_MI

In [None]:
# Scatter of pIC50 (metrics[1]) against ExactMolWt, coloured by target.
# NOTE(review): df_predict was filtered on the metric chosen in the OLS cell, so
# only those rows can appear here - confirm this is intended.
fig = px.scatter(df_predict, x='ExactMolWt', y=metrics[1], color='Target Name').update_layout(
    height=600,
    width=800,
    showlegend=False,
)
fig.show()

In [None]:
import pandas as pd
import plotly.graph_objects as go
import statsmodels.api as sm
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler

# Target and descriptors used for the OLS / mutual-information summary table
metric = "pKi"
features_of_interest = ["NumValenceElectrons", "ExactMolWt", "Chi0", "HeavyAtomCount"]

# Prepare Data: the left merge can leave NaN descriptors, which OLS rejects,
# so rows with a missing target or a missing descriptor are dropped.
df_embeddings = load_data('src/data/embeddings_RDKIT_descriptors.csv.zip')
df_predict = get_metrics_df(
    df.merge(df_embeddings, on='Ligand SMILES', how='left'), metric
).dropna(subset=features_of_interest)
X = df_predict[features_of_interest]
y = df_predict[metric]

# Normalize the features (kept as a named DataFrame) and fit the OLS model
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features_of_interest, index=X.index)
X_scaled = sm.add_constant(X_scaled)  # intercept column 'const' is placed first
model = sm.OLS(y, X_scaled).fit()

# Extract model summary information; feature labels come from the fitted model
summary_df = pd.DataFrame({
    "Feature": model.params.index,
    "Coefficient": model.params.values,
    "P-Value": model.pvalues.values
})

# Keep the descriptors only (no intercept) and attach their mutual information.
# `.assign` builds a new frame instead of writing into a filtered slice, and the
# fixed seed makes the randomised estimator reproducible. The first score
# belongs to the constant column and is discarded.
mutual_information = mutual_info_regression(X_scaled, y, random_state=0)[1:]
summary_df_filtered = (
    summary_df[summary_df["Feature"] != "const"]
    .assign(**{'Mutual Information': mutual_information})
)

# Plot the extracted information as a Plotly table
fig = go.Figure(data=[go.Table(
    header=dict(values=['Feature', 'OLS Coefficient', 'OLS P-Value', 'Mutual Information']),
    cells=dict(values=[summary_df_filtered['Feature'], 
                       summary_df_filtered['Coefficient'].round(4), 
                       summary_df_filtered['P-Value'].round(4),
                       summary_df_filtered['Mutual Information'].round(4)])
)])
fig.update_layout( width=800, height=400)
fig.show()

# Save the table as an image
fig.write_image('src/data/figures/ols_coefficients.png', scale=2, engine='kaleido')


In [None]:
# Save a table with the mutual information as plotly table
fig = go.Figure(data=[go.Table(header=dict(values=['Feature', 'Mutual Information']), cells=dict(values=[df_MI.index, df_MI['Mutual Information']]))])
fig.update_layout(width=500, height=400)    
fig.show()
fig.write_image('src/data/figures/mutual_information.png',  scale=2, engine='kaleido')

In [None]:
# Consistency check for ligands measured several times: histogram of the pIC50
# values of the ligand (SMILES) with the largest number of rows.
df_grouped_ligands = df.groupby('Ligand SMILES')
most_frequent_smiles = df_grouped_ligands.size().idxmax()
ligand = df_grouped_ligands.get_group(most_frequent_smiles)
ligand['pIC50'].plot(kind='hist')

In [None]:
# If the ligands are present several times in the dataset, are their metrics consistent?
df_grouped_ligands = df.groupby('Ligand SMILES')
# Mean pIC50 per ligand, computed once and reused below
mean_pic50_per_ligand = df_grouped_ligands['pIC50'].mean()
# Rows of the ligand with the highest mean pIC50
ligand = df_grouped_ligands.get_group(mean_pic50_per_ligand.idxmax())
# Displayed output of the cell: the per-ligand means
mean_pic50_per_ligand

In [None]:
# Select the ligands that have more than 50 entries
ligand = df_grouped_ligands.filter(lambda x: len(x) > 50)
# Select the ligands with the 50 highest mean pIC50
ligand = df_grouped_ligands.filter(lambda x: len(x) > 50).groupby('Ligand SMILES')['pIC50'].apply('mean').nlargest(50, 'pIC50')

In [None]:
# Mean pIC50 of every ligand over the whole dataset (no minimum number of
# entries is applied here), keeping the 50 highest means.
top_ligands = df.groupby('Ligand SMILES')['pIC50'].mean().nlargest(50)

# Print the resulting series
print(top_ligands)


In [None]:
# Per-ligand mean and standard deviation of pIC50
ligand_stats = df.groupby('Ligand SMILES')['pIC50'].agg(['mean', 'std'])

# The 50 ligands with the highest mean pIC50
top_ligands = ligand_stats.nlargest(n=50, columns='mean')

# Print the resulting frame
print(top_ligands)


In [None]:
# A ligand is kept when it has strictly more than MIN_ENTRIES_PER_LIGAND rows;
# at most N_TOP_LIGANDS ligands are retained after ranking by mean pIC50.
MIN_ENTRIES_PER_LIGAND = 25
N_TOP_LIGANDS = 100

# Step 1: keep only the rows of ligands with more than MIN_ENTRIES_PER_LIGAND entries
filtered_ligands = (
    df
    .groupby('Ligand SMILES')
    .filter(lambda x: len(x) > MIN_ENTRIES_PER_LIGAND)
)

# Step 2: mean and standard deviation of pIC50 for each remaining ligand
ligand_stats = (
    filtered_ligands
    .groupby('Ligand SMILES')['pIC50']
    .agg(['mean', 'std'])
)

# Number of ligands that passed the filter
print(len(ligand_stats))
# Step 3: keep the N_TOP_LIGANDS ligands with the highest mean pIC50
# (all of them when fewer ligands passed the filter)
top_ligands = ligand_stats.nlargest(N_TOP_LIGANDS, 'mean')

# Print the resulting frame
print(top_ligands)


In [None]:
import matplotlib.pyplot as plt

# Standard deviation of each selected ligand's pIC50 against its mean pIC50.
plt.figure(figsize=(8, 6))
ax = plt.gca()
ax.scatter(top_ligands['mean'], top_ligands['std'], alpha=0.7)
ax.set_title('Mean pIC50 vs Standard Deviation')
ax.set_xlabel('Mean pIC50')
ax.set_ylabel('Standard Deviation')
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Ligands ordered from the highest to the lowest mean pIC50
top_ligands_sorted = top_ligands.sort_values(by='mean', ascending=False)

# One bar per ligand (labelled by its SMILES), with the standard deviation as error bar
plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.bar(
    top_ligands_sorted.index,
    top_ligands_sorted['mean'],
    yerr=top_ligands_sorted['std'],
    capsize=4,
    color='skyblue',
    alpha=0.7,
)

# Titles and axis labels
ax.set_title('Top 50 Ligands: Mean pIC50 with Standard Deviation')
ax.set_ylabel('Mean pIC50')
ax.set_xlabel('Ligand SMILES')
plt.xticks(rotation=90)  # vertical tick labels, the SMILES strings are long
plt.tight_layout()

plt.show()


In [None]:
# Number of distinct targets each selected ligand was measured against.
# Computed in a single grouped pass over `df` (NaN counts as a value, exactly as
# `len(series.unique())` would) and aligned on the ligand SMILES index.
targets_per_ligand = df.groupby('Ligand SMILES')['Target Name'].nunique(dropna=False)
# `.assign` returns a new frame, so no column is written into a derived slice.
top_ligands = top_ligands.assign(
    **{'Number of Targets': targets_per_ligand.reindex(top_ligands.index)}
)
top_ligands.head()

In [None]:
import plotly.graph_objects as go

# Sort the ligands by mean pIC50 values
top_ligands_sorted = top_ligands.sort_values(by='mean', ascending=False)

# Replace long Ligand SMILES with simple ligand names or indices
ligand_names = [f'Ligand {i+1}' for i in range(len(top_ligands_sorted))]

# Create the bar plot with error bars
fig = go.Figure(data=[
    go.Bar(
        x=ligand_names,
        y=top_ligands_sorted['mean'],
        error_y=dict(type='data', array=top_ligands_sorted['std'], visible=True),
        # go.Bar has no top-level `color` property (passing one raises ValueError):
        # per-bar colours are set through `marker.color`.
        marker=dict(
            color=top_ligands_sorted['Number of Targets'],
            cmin=0,
            colorscale='PuRd',
            colorbar=dict(title='Unique Matches'),
        ),
    )
])

# Update layout for better readability
fig.update_layout(
    xaxis_title='Ligands with more than 25 matches',
    yaxis_title='Mean pIC50',
    xaxis=dict(tickangle=45),
    showlegend=False,
    template='plotly_white'
)

# Show the plot
fig.show()


In [None]:
import plotly.graph_objects as go

# Ligands ordered from the highest to the lowest mean pIC50
top_ligands_sorted = top_ligands.sort_values(by='mean', ascending=False)

# Short labels ('Ligand 1', 'Ligand 2', ...) that can stand in for the long SMILES
# strings; they are currently not passed to the bar trace (x is left to its default).
ligand_names = [f'Ligand {rank}' for rank in range(1, len(top_ligands_sorted) + 1)]

# Bars: mean pIC50 with the standard deviation as error bar, coloured by the
# number of distinct targets of each ligand
fig = go.Figure()
fig.add_trace(
    go.Bar(
        y=top_ligands_sorted['mean'],
        error_y=dict(type='data', array=top_ligands_sorted['std'], visible=True),
        marker=dict(
            color=top_ligands_sorted['Number of Targets'],
            cmin=0,
            colorscale='PuRd',
            colorbar=dict(title='Unique Matches'),
        ),
    )
)

# Axis titles, tick angle and theme
fig.update_layout(
    xaxis=dict(title=dict(text='Ligands with more than 25 dates'), tickangle=45),
    yaxis=dict(title=dict(text='Mean pIC50')),
    showlegend=False,
    template='plotly_white',
)

# Display the figure and export it as a standalone HTML file
fig.show()
fig.write_html('src/data/figures/top_matches.html')

#fig.write_image('src/data/figures/top_matches.png')

# Popular matches
Some of our ligands were popular and were tested against many targets. 45 of them were involved in more than 25 dates, and we can observe that the standard deviation of the success of their matches varies considerably across them.

In [None]:
# Load the descriptor table and display the dataset left-joined with it on the
# ligand SMILES (the merged frame is only shown, it is not stored).
df_embeddings = load_data('src/data/embeddings_RDKIT_descriptors.csv.zip')
df.merge(right=df_embeddings, on='Ligand SMILES', how='left')

## 2.3 Embedding space or A match made in heaven

Introduce the embedding space, the model that provides it, and its visualization. Ideally, we would apply conditions on the targets, metrics and chemical characterization to color selected parts of the graph while the unselected points remain grey. If it is not possible to make this interactive, we will simply produce several static plots.

In [None]:
from src.run_reduction import run_analysis

# Paths of the zipped CSV embedding files passed to run_analysis in the cells below.
path_RD_df  = 'src/data/embeddings_RDKIT_descriptors.csv.zip'
path_Mol2vec_df = 'src/data/embeddings_Mol2Vec.csv.zip'
path_Morgan_df = 'src/data/embeddings_Morgan_Fingerprint.csv.zip'
path_full_df = 'src/data/embeddings_full.csv.zip'


In [None]:
# Run the analysis from src.run_reduction on the RDKit-descriptor embeddings file.
# do_umap=True presumably enables a UMAP projection - verify against run_analysis.
run_analysis(df, path_RD_df, do_umap=True)

In [None]:
# Let's do more chemical characterization!
import random

# Extended list of candidate descriptors, kept under its own name for reference
# (it used to be assigned to `features_of_interest` and immediately overwritten).
extended_features_of_interest = ["NumValenceElectrons", "ExactMolWt", "MolWt", "Chi0", "LabuteASA", "Kappa1", "HeavyAtomMolWt", "HeavyAtomCount", "Chi0v", "Chi1", "SPS", "FractionCSP3", "NumSaturatedRings", "NumAliphaticRings", "BCUT2D_CHGLO", "BCUT2D_CHGHI", "BCUT2D_LOGPLOW", "SlogP_VSA6", "SMR_VSA7", "NumAromaticCarbocycles"]
# Descriptors actually plotted below
features_of_interest = ["NumValenceElectrons", "ExactMolWt", "Chi0", "HeavyAtomCount"]

# One random colour per descriptor; the dedicated seeded generator makes the
# colours identical on every run.
color_rng = random.Random(0)
colors = ["#{:06x}".format(color_rng.randint(0, 0xFFFFFF)) for _ in features_of_interest]

df_embeddings = load_data('src/data/embeddings_RDKIT_descriptors.csv.zip')
# NOTE(review): this figure is written to 'src/data/' while the other figures of
# the notebook go to 'src/data/figures/' - confirm the intended location.
plot_chemical_property_distributions(
    df,
    metrics,
    chemical_properties=features_of_interest[:5],
    properties_colors=colors[:5],
    filepath='src/data/chemchar2.html',
    plot_metrics=False,
    df_embeddings=df_embeddings,
)

In [None]:
# Same analysis on the Mol2Vec embeddings file.
run_analysis(df, path_Mol2vec_df, do_umap=True)

In [None]:
# Same analysis on the Morgan-fingerprint embeddings file.
run_analysis(df, path_Morgan_df, do_umap=True)


In [None]:
# Same analysis on the 'embeddings_full' file.
run_analysis(df, path_full_df, do_umap=True)

## 2.4 Closer look to one target and prediction of the best match 

Select the target and plot it with a nice representation: the rotating one would be great.
Plot the position of its ligands in the embedding space (highlighting the main features it relies on).
Then introduce the model that is used to predict a good target.
Plot some of the predicted targets (ideally with the rotating molecule visualization) in a panel with their chemical characterization and the predicted binding metric (preferably Ki).
