In [None]:
%load_ext autoreload 
%autoreload 2

# 1.Load 
## 1.1 Load libraries

In [25]:
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from src.scripts.load_and_save import load_data
from src.utils.exploration_and_clean import explore_column, quick_check_column, clean_metrics

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


## 1.2 Load data
The dataset is already cleaned and filtered to the family of interest.

In [None]:
# Load the working DataFrame through the project's `load_data` helper
# (imported from src.scripts.load_and_save in the imports cell).
df = load_data()
# Preview the first rows as the cell output.
df.head()

# 2. Development - Figures

## 2.1 Family characterisation or "Kinase looking for love"

In this part we will justify the choice of the family, maybe take the family file and justify why the kinases are very well represented, introduce them ...

In [27]:
from src.utils.embeddings_plots import reduce_family

# Reduce the number of distinct targets by mapping every target name through
# `reduce_family`, while keeping the original names in 'Target Name Detailed'.
# The backup column is written only once and the reduction always starts from
# it, so re-running this cell cannot overwrite the detailed names with the
# already-reduced ones.
if 'Target Name Detailed' not in df.columns:
    df['Target Name Detailed'] = df['Target Name']
df['Target Name'] = df['Target Name Detailed'].apply(reduce_family)

## 2.2 FEATURES - What is a good match?

### 2.2.1 Metrics 
In this part we will analyse the binding metrics that are available for our selected subset of the data.

In [None]:
# Useful metrics: for each candidate column, the fraction of rows (0-1) that
# hold a non-NaN value.
metric_columns = ['Ki (nM)', 'IC50 (nM)', 'Kd (nM)', 'kon (M-1-s-1)', 'koff (s-1)', 'EC50 (nM)', 'pH', 'Temp (C)']
metric_percent = {}
for col in metric_columns:
    # notna().mean() is the share of non-missing rows; unlike a manual
    # len()/len() ratio it does not raise on an empty frame.
    metric_percent[col] = df[col].notna().mean()

x = list(metric_percent.keys())
y = list(metric_percent.values())

fig = px.bar(
    x=x,
    y=y,
    labels={'x': 'Metric', 'y': 'Share of non-NaN values'},
    title='Percent of non nan values per metric among the selected familly',
)
# The stored values are fractions; format the axis ticks as percentages so the
# figure agrees with its title.
fig.update_yaxes(tickformat='.0%')

fig.show()

We removed kon, koff and EC50 because they are not objective measures and there is not a lot of data for them. \
Ki and Kd are more meaningful because they are not tied to a particular assay, but we mainly have Ki values. \
We keep pH and Temp for later analysis. \
We remove Kd because it has no values in common with IC50 and not enough values to be kept alone. \


In [29]:
# Binding-metric columns (negative log10 scale) analysed in the following cells.
metrics = ['pKi', 'pIC50'] 
# Experimental-condition columns kept for later analysis.
features = ['pH', 'Temp (C)']

In [30]:
# Raw concentration column (in nM) from which each p-scale metric is derived.
raw_column_of = {'pKi': 'Ki (nM)', 'pIC50': 'IC50 (nM)'}

# Build the 'pKi' / 'pIC50' columns used by the following cells:
# p = -log10(concentration in mol/L). The previous loop read df['pKi'] /
# df['pIC50'] and wrote 'ppKi' / 'ppIC50', so it never produced these columns
# from the raw data.
for metric in metrics:
    raw_nM = df[raw_column_of[metric]]
    # Mask zero / negative values before the log so they become NaN without
    # triggering divide-by-zero or invalid-value warnings.
    positive_nM = raw_nM.where(raw_nM > 0)
    # nM -> M (x 1e-9), then -log10.
    df[metric] = -np.log10(positive_nM * 1e-9)

In [None]:
# Author's note: even if IC50 can be less meaningful because it depends on the
# assay, it is highly correlated with Ki, which can be explained by uniformity
# between experiments. We can therefore extract meaningful information about
# the inhibitory potential from IC50.

# TODO : ASK THE REST OF THE GROUP
from sklearn.linear_model import LinearRegression

# Rows that report both metrics. The explicit copy guarantees that adding the
# prediction column below never writes into a view of `df`.
df_Ki_IC50 = df[['pKi', 'pIC50', 'Target Name']].dropna().copy()

print('Pearson Correlation coef:' , df_Ki_IC50[['pKi','pIC50']].corr(method='pearson'))

# Least-squares fit of pKi as a function of pIC50. The target is kept 2-D so
# `reg.coef_` has shape (1, 1), as the title formatting below expects.
reg = LinearRegression()
pic50_values = df_Ki_IC50['pIC50'].values.reshape(-1, 1)
reg.fit(pic50_values, df_Ki_IC50['pKi'].values.reshape(-1, 1))
# predict() returns an (n, 1) array; flatten it to a plain column.
df_Ki_IC50['pred_pki'] = reg.predict(pic50_values).ravel()

# Sort by pIC50 so the fitted line is drawn once from left to right instead of
# jumping back and forth through the rows in file order.
fig1 = px.line(df_Ki_IC50.sort_values('pIC50'), x='pIC50', y='pred_pki')
fig1.update_traces(line_color='red')
fig2 = px.scatter(df_Ki_IC50, x='pIC50', y='pKi', color='Target Name')
# go.Figure(data=...) only takes the traces, so the axis and legend titles
# produced by plotly express have to be set again on the combined figure.
fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.update_layout(
    title=f'Linear fit, coef :{reg.coef_[0][0].round(4)}',
    xaxis_title='pIC50',
    yaxis_title='pKi',
    legend_title_text='Target Name',
)

fig3.show()




### 2.2.2 Multivariate analysis with the chemical characterization
We want plots that explain the distribution of each feature.

In [32]:
# Ligand property columns plotted against the binding metrics below.
properties = ['Ligand MW', 'logP']	

# Counts reused to size the subplot grid.
n_metrics = len(metrics)
n_properties = len(properties)

In [None]:
# Rows that report an IC50 value, and rows that report a Ki value.
# `subset` is given as a list: older pandas releases only accept a list-like
# there, while current ones accept both forms.
df_ic50 = df.dropna(subset=['IC50 (nM)'])
df_ki = df.dropna(subset=['Ki (nM)'])

def get_metrics_df(df, metric):
    """Return the rows of ``df`` that have a value for ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    metric : str or iterable of str
        A single column name, or several column names. With several names a
        row is dropped as soon as one of those columns is NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows where ``metric`` is missing.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Always hand pandas a list: a bare string is rejected as `subset` by
    # older pandas releases.
    subset = [metric] if isinstance(metric, str) else list(metric)
    return df.dropna(subset=subset)

# Number of rows that report an IC50 value (shown as the cell output).
len(df_ic50)

In [None]:
# Figure: distribution of the ligand features extracted with RDKit, plus the
# distribution of each binding metric. One row per metric; the first columns
# hold the property histograms and the last column the metric's own histogram.
metric_col = n_properties + 1  # derived from the grid size instead of a hard-coded 3
fig = make_subplots(rows=n_metrics, cols=metric_col)
properties_colors = ['#EB89B5', '#330C73']

for i, metric in enumerate(metrics):
    row = i + 1
    # Filter once per metric and reuse the subset for every panel of the row.
    df_to_plot = get_metrics_df(df, metric)

    for j, prop in enumerate(properties):  # `prop`: do not shadow the built-in `property`
        # Cycle through the palette so adding a property cannot raise IndexError.
        color = properties_colors[j % len(properties_colors)]
        fig.add_trace(
            go.Histogram(x=df_to_plot[prop], name=prop, showlegend=False, marker_color=color),
            row=row, col=j + 1
        )
        fig.update_xaxes(title_text=prop, row=row, col=j + 1)

    fig.add_trace(
        go.Histogram(x=df_to_plot[metric], name=metric, showlegend=False, marker_color='wheat'),
        row=row, col=metric_col
    )
    fig.update_xaxes(title_text=metric, row=row, col=metric_col)

fig.update_layout(height=800, width=1200, title_text="Chemical Properties of the ligands in the pKi and pIC50 subsets")
fig.show()

In [None]:
# Functional-group columns of `df`, gathered in their own frame for the
# summary below.
functionnal_groups = [
    "Aliphatic OH",
    "Aromatic NH",
    "Ester",
    "Ether",
    "Amide",
    "Ketone",
    "Benzene Ring",
]
df_functionnal_groups = df[functionnal_groups]
# Preview of the selection (cell output).
df_functionnal_groups.head()

In [36]:
# Column-wise sum over the functional-group columns: one total per group.
counts = df_functionnal_groups.sum()

In [None]:
# `counts` already holds one total per functional group, so it is drawn
# directly as a bar chart with explicit axis labels and a title, so the figure
# can be read on its own.
fig = px.bar(
    x=counts.index,
    y=counts.values,
    labels={'x': 'Functional group', 'y': 'Sum over ligands'},
    title='Functional groups in the selected ligands',
)
fig.show()

In [None]:
# First metric against the first ligand property, coloured by target, with
# marginal histograms on both axes.
metric = metrics[0]
ligand_property = properties[0]  # named so the built-in `property` is not shadowed
fig = px.scatter(
    get_metrics_df(df, metric),
    x=ligand_property,
    y=metric,
    color='Target Name',
    marginal_x="histogram",
    marginal_y="histogram",
    title=f'{metric} vs {ligand_property} by target',
)
fig.update_layout(height=600, width=1200)
fig.show()

In [None]:
# Second metric against the first ligand property, coloured by target, with
# marginal histograms on both axes.
metric = metrics[1]
ligand_property = properties[0]  # named so the built-in `property` is not shadowed
fig = px.scatter(
    get_metrics_df(df, metric),
    x=ligand_property,
    y=metric,
    color='Target Name',
    marginal_x="histogram",
    marginal_y="histogram",
    title=f'{metric} vs {ligand_property} by target',
)
fig.update_layout(height=600, width=1700)
fig.show()

In [None]:
# First metric against the second ligand property, coloured by target, with
# marginal histograms on both axes.
metric = metrics[0]
ligand_property = properties[1]  # named so the built-in `property` is not shadowed
fig = px.scatter(
    get_metrics_df(df, metric),
    x=ligand_property,
    y=metric,
    color='Target Name',
    marginal_x="histogram",
    marginal_y="histogram",
    title=f'{metric} vs {ligand_property} by target',
)
fig.update_layout(height=600, width=1200)
fig.show()

In [None]:
# Second metric against the second ligand property, coloured by target, with
# marginal histograms on both axes.
metric = metrics[1]
ligand_property = properties[1]  # named so the built-in `property` is not shadowed
fig = px.scatter(
    get_metrics_df(df, metric),
    x=ligand_property,
    y=metric,
    color='Target Name',
    marginal_x="histogram",
    marginal_y="histogram",
    title=f'{metric} vs {ligand_property} by target',
)
fig.update_layout(height=600, width=1700)
fig.show()

## 2.3 Embedding space or A match made in heaven

Introduce the embedding space and the model that provides it, then visualize it. In the best case, we would like to apply conditions on the targets, metrics, and chemical characterization to color certain parts of the graph while the unselected points remain grey. If it is not possible to make this interactive, just produce several plots.

In [42]:
from src.run_reduction import run_analysis

# Relative paths to the zipped CSV embedding files, one per embedding type
# (descriptions inferred from the file names; the file contents are not
# visible here).
path_RD_df  = 'src/data/embeddings_RDKIT_descriptors.csv.zip'
path_Mol2vec_df = 'src/data/embeddings_Mol2Vec.csv.zip'
path_Morgan_df = 'src/data/embeddings_Morgan_Fingerprint.csv.zip'
path_full_df = 'src/data/embeddings_full.csv.zip'


In [None]:
# Run the project's `run_analysis` on `df` with the RDKit-descriptor embeddings
# file, with the `do_umap` option enabled.
run_analysis(df, path_RD_df, do_umap=True)

In [None]:
# Run the project's `run_analysis` on `df` with the Mol2Vec embeddings file,
# with the `do_umap` option enabled.
run_analysis(df, path_Mol2vec_df, do_umap=True)


In [None]:
# Run the project's `run_analysis` on `df` with the Morgan-fingerprint
# embeddings file, with the `do_umap` option enabled.
run_analysis(df, path_Morgan_df, do_umap=True)


In [None]:
# Run the project's `run_analysis` on `df` with the "full" embeddings file,
# with the `do_umap` option enabled.
run_analysis(df, path_full_df, do_umap=True)

## 2.4 Closer look to one target and prediction of the best match 

Select the target and plot it with a nice representation: the one that rotates would be great.
Plot the position of its ligands in the embedding space (highlight the main features on which it relies).
Then introduce the model that is used to predict a good target.
Plot some of the predicted targets (visualize the molecule that rotates, please) with a panel showing their chemical characterization + predicted binding metric (it would be better if it is Ki).
