In [11]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import pickle
from pprint import pprint as pp
from tqdm import tqdm
import itertools
BASE_PATH = '/content/drive/MyDrive/Generative_ML/current_data/' #@param {type:"string"}

# One sub-directory per pipeline stage, all relative to BASE_PATH.
PRETRAINING_PATH = BASE_PATH + '1. Pretraining/'
GENERATION_PATH = BASE_PATH + '2. Generation/'
SAMPLING_PATH = BASE_PATH + '3. Sampling/'
DIFFDOCK_PATH = BASE_PATH + '4. DiffDock/'
SCORING_PATH = BASE_PATH + '5. Scoring/'
AL_PATH = BASE_PATH + '6. ActiveLearning/'

# Directory holding the descriptor pickles. Later cells use PICKLES, but it was never
# assigned in this notebook, so they fail on a fresh kernel.
# NOTE(review): value inferred from the cells that read back files written through
# PICKLES (they load from SAMPLING_PATH + 'descriptors/') - confirm.
PICKLES = SAMPLING_PATH + 'descriptors/'


In [None]:
sub = set(pd.read_csv(f"{DIFFDOCK_PATH}sampled_mols/model1_softsub_al2_samples.csv")['smiles'])
div = set(pd.read_csv(f"{DIFFDOCK_PATH}sampled_mols/model1_softdiv_al2_samples.csv")['smiles'])
len(sub), len(div), len(sub & div)

(1000, 1000, 2)

# Dataset statistics

## Convert each dictionary to a dataframe

In [None]:
def split_the_dictionary(fname):
    """Split a pickled dictionary into two halves and pickle each half.

    Loads ``PICKLES + f'{fname}.pkl'``, keeps the first ``len // 2`` entries (in
    dictionary iteration order) for part 1 and the remaining entries for part 2,
    prints the sizes as a sanity check, and writes the parts to
    ``{fname}_subpt1.pkl`` and ``{fname}_subpt2.pkl`` inside ``PICKLES``.

    Args:
        fname: File stem (without the ``.pkl`` extension) of the source pickle.
    """
    with open(PICKLES+f'{fname}.pkl', 'rb') as f:
        smiles_to_descriptors = pickle.load(f)

    items = list(smiles_to_descriptors.items())
    half_index = len(items) // 2
    pt1 = dict(items[:half_index])
    pt2 = dict(items[half_index:])
    print(len(pt1), len(pt2), len(pt1)+len(pt2), len(smiles_to_descriptors))

    # Context managers make sure both output files are flushed and closed.
    for suffix, part in (('subpt1', pt1), ('subpt2', pt2)):
        with open(PICKLES + f'{fname}_{suffix}.pkl', 'wb') as out:
            pickle.dump(part, out)

In [None]:
split_the_dictionary('smile_to_descriptors_pt1')

In [None]:
def pickle_to_csv(fname):
    """Turn a pickled ``{smile: {descriptor: value}}`` dictionary into a DataFrame.

    Despite the name, no CSV is produced: the frame is saved with
    ``DataFrame.to_pickle``. The output file name is ``fname`` with its first two
    ``_``-separated tokens removed (``smile_to_descriptors_pt3`` becomes
    ``descriptors_pt3.pkl``), written inside ``PICKLES``.

    Args:
        fname: File stem (without ``.pkl``) of the dictionary pickle in ``PICKLES``.

    Returns:
        DataFrame with a ``smile`` column followed by one column per name stored
        in ``descriptors_list.pkl``.

    Raises:
        KeyError: If a molecule's descriptor dictionary lacks one of the listed names.
    """
    with open(PICKLES+f'{fname}.pkl', 'rb') as f:
        smiles_to_descriptors = pickle.load(f)
    # Closed explicitly; the original left this file handle open.
    with open(PICKLES + 'descriptors_list.pkl', 'rb') as f:
        keys = pickle.load(f)

    keyToData = {}
    pbar = tqdm(smiles_to_descriptors.items(), total=len(smiles_to_descriptors), desc=fname)
    for smile, descriptors in pbar:
        keyToData.setdefault('smile', []).append(smile)
        for key in keys:
            keyToData.setdefault(key, []).append(descriptors[key])
    df = pd.DataFrame(keyToData)
    df.to_pickle(PICKLES + '_'.join(fname.split('_')[2:]) + '.pkl')
    return df


In [None]:
df = pickle_to_csv('smile_to_descriptors_pt1_subpt1') # done

In [None]:
df = pickle_to_csv('smile_to_descriptors_pt1_subpt2') # done

In [None]:
df = pickle_to_csv('smile_to_descriptors_pt2_subpt1') # done

In [None]:
df = pickle_to_csv('smile_to_descriptors_pt2_subpt2')

In [None]:
df = pickle_to_csv('smile_to_descriptors_pt3') # done

## Combine dataframes

In [None]:
pt1_sbpt1 = pd.read_pickle(PICKLES + 'descriptors_pt1_subpt1.pkl')
pt1_sbpt2 = pd.read_pickle(PICKLES + 'descriptors_pt1_subpt2.pkl')
pt2_sbpt1 = pd.read_pickle(PICKLES + 'descriptors_pt2_subpt1.pkl')
pt2_sbpt2 = pd.read_pickle(PICKLES + 'descriptors_pt2_subpt2.pkl')
pt3 = pd.read_pickle(PICKLES + 'descriptors_pt3.pkl')
pt1_sbpt1.shape, pt1_sbpt2.shape, pt2_sbpt1.shape, pt2_sbpt2.shape, pt3.shape

In [None]:
# Stack the five partial descriptor frames and persist the combined table.
merged_df = pd.concat([pt1_sbpt1, pt1_sbpt2, pt2_sbpt1, pt2_sbpt2, pt3])
merged_df.to_pickle(PICKLES + 'descriptors_combined.pkl')
# Last expression of the cell so the shape is actually displayed.
merged_df.shape

In [None]:
# Restrict the combined descriptor table to molecules that occur in the processed
# train/validation CSVs and save the result.
descriptors_wsmiles = pd.read_pickle(PICKLES + 'descriptors_combined.pkl')
train_df = pd.read_csv(BASE_PATH + 'raw_data/processed_train.csv')
val_df = pd.read_csv(BASE_PATH + 'raw_data/processed_val.csv')
# Boolean masks: is the row's 'smile' value present in the 'smiles' column of each CSV?
is_in_train_df = descriptors_wsmiles['smile'].isin(train_df['smiles'])
is_in_val_df = descriptors_wsmiles['smile'].isin(val_df['smiles'])

# Combine the two masks with logical or (|)
is_in_either_df = is_in_train_df | is_in_val_df

# Keep only rows whose 'smile' is in either CSV. Despite the `_excluded` suffix this
# frame holds the rows that are KEPT; rows in neither CSV are the ones left out.
descriptors_wsmiles_excluded = descriptors_wsmiles[is_in_either_df]

descriptors_wsmiles_excluded.to_pickle(PICKLES + 'descriptors_combined_processed.pkl')

# NOTE(review): presumably the row counts of the val/train files - confirm where
# these numbers (and the "+ 1") come from.
val = 285448 + 1
train = 5423523 + 1
total = val + train
print(total, descriptors_wsmiles.shape, descriptors_wsmiles_excluded.shape)
# Recorded output: 5708973 (5770637, 210) (5708842, 210)
# The kept frame has fewer rows than `total` - apparently more invalid smiles?

In [None]:
# Same filtering as for the processed train/val files, but against the gzipped
# moses_and_binding_no_rare_tokens train/test CSVs.
descriptors_wsmiles = pd.read_pickle(PICKLES + 'descriptors_combined.pkl')
print(descriptors_wsmiles.shape)
train_df = pd.read_csv(BASE_PATH + 'raw_data/moses_and_binding_no_rare_tokens_train.csv.gz', compression='gzip')
test_df = pd.read_csv(BASE_PATH + 'raw_data/moses_and_binding_no_rare_tokens_test.csv.gz', compression='gzip')
# Boolean masks: is the row's 'smile' value present in the 'smiles' column of each CSV?
is_in_train_df = descriptors_wsmiles['smile'].isin(train_df['smiles'])
is_in_test_df = descriptors_wsmiles['smile'].isin(test_df['smiles'])

# Combine the two masks with logical or (|)
is_in_either_df = is_in_train_df | is_in_test_df

# Keep only rows whose 'smile' is in either CSV. Despite the `_excluded` suffix this
# frame holds the rows that are KEPT.
descriptors_wsmiles_excluded = descriptors_wsmiles[is_in_either_df]
print(descriptors_wsmiles_excluded.shape)
descriptors_wsmiles_excluded.to_pickle(PICKLES + 'descriptors_moses+bindingdb.pkl')

# These three values are not used in this cell: the print that consumed them is
# commented out below.
val = 285448 + 1
train = 5423523 + 1
total = val + train
# print(total, descriptors_wsmiles.shape, descriptors_wsmiles_excluded.shape)
# Output recorded for the processed train/val variant of this cell:
# 5708973 (5770637, 210) (5708842, 210) apparently more invalid smiles?

(5770637, 210)
(2894910, 210)


# Graph Styling

In [4]:
class Graph:
    """Bundle of Plotly styling settings that can be applied to a figure.

    Every setting is a plain instance attribute. Override any of them with
    ``update_parameters`` and apply them with ``style_figure``.
    """

    # Default value of each styling attribute, in assignment order.
    _DEFAULTS = {
        'title_size': 20,
        'axis_title_size': 14,
        'tick_font_size': 12,
        'text_color': "#333333",
        'background': "white",
        'grid_color': "#e2e2e2",
        'line_color': "#000000",
        'font_family': 'Helvetica',
        'width': 600,
        'height': 400,
        'title': '',
        'xaxis_title': '',
        'yaxis_title': '',
    }

    def __init__(self):
        for name, default in self._DEFAULTS.items():
            setattr(self, name, default)

    def update_parameters(self, params):
        """Set every key of the mapping ``params`` as an attribute of this object."""
        for name, value in params.items():
            setattr(self, name, value)

    def _font(self, size):
        """Font dictionary of the given size in the configured colour and family."""
        return {'size': size, 'color': self.text_color, 'family': self.font_family}

    def _axis_style(self):
        """Keyword arguments shared by the x- and y-axis updates."""
        return dict(
            title_font=self._font(self.axis_title_size),
            tickfont=self._font(self.tick_font_size),
            showgrid=True,
            gridwidth=1,
            gridcolor=self.grid_color,
            linecolor=self.line_color,  # draws the axis line itself
            linewidth=2,
        )

    def style_figure(self, figure):
        """Apply the current settings to ``figure`` and return that same figure.

        Calls ``update_layout`` with one dictionary, then ``update_xaxes`` and
        ``update_yaxes`` with keyword arguments.
        """
        layout = {
            'margin': dict(t=50, b=50, l=50, r=50),
            'plot_bgcolor': self.background,
            'paper_bgcolor': self.background,
            'title': {'text': self.title, 'font': self._font(self.title_size)},
            'height': self.height,
            'width': self.width,
            'font': self._font(self.tick_font_size),
            'legend': {'font': self._font(self.tick_font_size)},
        }
        figure.update_layout(layout)

        # Both axes share title/tick fonts, grid and axis-line settings.
        figure.update_xaxes(title=self.xaxis_title, **self._axis_style())
        figure.update_yaxes(title=self.yaxis_title, title_standoff=0, **self._axis_style())
        return figure

# PCA Analysis

## Definitions

In [5]:
def fit_pca(dataframe, n=3, sigma=None, whiten=False):
    """Standardise the data with ``StandardScaler`` and fit a ``PCA`` on the result.

    Args:
        dataframe: 2-D numeric data (one row per sample) passed to
            ``StandardScaler.fit_transform``.
        n: Number of principal components to fit.
        sigma: Optional outlier cut-off applied to the standardised values. When
            given, only rows whose standardised values ALL satisfy
            ``|z| <= sigma`` are used to fit the PCA. The scaler itself is still
            fitted on every row.
        whiten: Forwarded to ``PCA``.

    Returns:
        Tuple ``(scaler, pca)`` of the fitted objects.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(dataframe)
    if sigma is not None:
        # Two-sided filter: the previous `scaled_data <= sigma` only rejected large
        # positive values, so rows with extreme negative z-scores were kept.
        within_cutoff = (np.abs(scaled_data) <= sigma).all(axis=1)
        scaled_data = scaled_data[within_cutoff]
    pca = PCA(n_components=n, whiten=whiten)
    pca.fit(scaled_data)
    return scaler, pca

def pca_transform(pca, dataframe, n):
    """Project ``dataframe`` with a fitted PCA and return its first ``n`` components.

    Returns a list of ``n`` 1-D arrays, one per component (column ``i`` of
    ``pca.transform(dataframe)``). Asserts that the PCA has at least ``n`` components.
    """
    assert pca.n_components_ >= n, f"PCA was fitted on {pca.n_components_} components, but {n} were requested."
    projected = pca.transform(dataframe)
    components = []
    for index in range(n):
        components.append(projected[:, index])
    return components

def create_scatter_trace(x, y, color, name):
    """Build a marker-only ``go.Scatter`` trace named ``name`` for the points (x, y).

    ``color`` is passed to the marker as is. The colour bar is switched on only
    when ``color`` is a list or a NumPy array.
    """
    marker_style = dict(
        size=5,
        color=color,
        showscale=isinstance(color, (list, np.ndarray)),
        colorscale='Viridis',
        opacity=0.5,
    )
    return go.Scatter(x=x, y=y, mode='markers', name=name, marker=marker_style)

def _pad_bound(value, scale, direction):
    """Move an axis bound away from the data by the factor ``scale``.

    ``direction`` is -1 for a lower bound and +1 for an upper bound. A bound on the
    outward side of zero is multiplied by ``scale``; a bound on the other side is
    multiplied by ``2 - scale`` so it also moves away from the data.
    """
    if value * direction > 0:
        return value * scale
    return value * (2 - scale)


def plot_pca(datapoints, yscale=1.05):
    """Scatter-plot several 2-D point sets in one figure with a shared fixed range.

    Args:
        datapoints: Iterable of ``(x_values, y_values, color, label)`` tuples; each
            becomes one trace built by ``create_scatter_trace``.
        yscale: Padding factor applied to BOTH axes (the name is historical). For
            data straddling zero the range is ``[yscale*min, yscale*max]``.

    Returns:
        The ``go.Figure`` holding one trace per entry of ``datapoints``.
    """
    fig = go.Figure()
    min_x = min_y = float('inf')
    max_x = max_y = float('-inf')
    for xarr, yarr, color, label in datapoints:
        min_x = min(min_x, float(np.min(xarr)))
        min_y = min(min_y, float(np.min(yarr)))
        max_x = max(max_x, float(np.max(xarr)))
        max_y = max(max_y, float(np.max(yarr)))
        fig.add_trace(create_scatter_trace(xarr, yarr, color, label))

    xaxis = dict(title='PCA Component 1')
    yaxis = dict(title='PCA Component 2')
    # Fix the ranges only when real extrema exist. With no data the bounds are still
    # +/-inf, so the axes are left to autorange.
    if np.isfinite([min_x, max_x, min_y, max_y]).all():
        # Multiplying by yscale alone would cut into the data whenever a minimum is
        # positive or a maximum is negative; _pad_bound always pads outward.
        xaxis.update(autorange=False, range=[_pad_bound(min_x, yscale, -1), _pad_bound(max_x, yscale, 1)])
        yaxis.update(autorange=False, range=[_pad_bound(min_y, yscale, -1), _pad_bound(max_y, yscale, 1)])

    fig.update_layout(xaxis=xaxis, yaxis=yaxis)
    return fig

In [6]:
def get_data_boundaries(data_list):
    """Return per-column bounds over all datasets, snapped outward to multiples of 10.

    The datasets are stacked row-wise. The column minima are floored and the column
    maxima are ceiled to a multiple of 10. Returns ``(min_val, max_val)``.
    """
    stacked = np.vstack(data_list)
    lower = np.floor(stacked.min(axis=0) / 10.0) * 10
    upper = np.ceil(stacked.max(axis=0) / 10.0) * 10
    return lower, upper

def discretize_data(data, boundaries, bin_size):
    """Count the points of a 2-D dataset on a regular grid.

    Args:
        data: Array whose first two columns are the x and y coordinates.
        boundaries: Pair ``(min_values, max_values)``; element ``[0][i]`` / ``[1][i]``
            is the lower / upper bound of axis ``i``.
        bin_size: Edge length of a bin, the same on both axes.

    Returns:
        Tuple ``(counts, xcenters, ycenters)`` where ``counts`` is the transposed
        2-D histogram (rows follow y, columns follow x) and the centres are the
        midpoints of the bins.
    """
    bins = []
    for axis in range(2):
        low, high = boundaries[0][axis], boundaries[1][axis]
        # The edges must reach the upper bound. np.arange(low, high, bin_size) stops
        # short of `high`, which dropped every point above the last edge and failed
        # when only one edge was produced.
        n_bins = max(1, int(np.ceil((high - low) / bin_size)))
        bins.append(low + bin_size * np.arange(n_bins + 1))
    hist_data, xedges, yedges = np.histogram2d(data[:,0], data[:,1], bins=bins)

    # Compute bin centers
    xcenters = (xedges[:-1] + xedges[1:]) / 2
    ycenters = (yedges[:-1] + yedges[1:]) / 2

    return hist_data.T, xcenters, ycenters

def plot_heatmap(args_list, difference=False, bin_size=10, width=1280, height=720, all_differences=False):
    """Plot 2-D point densities, or differences between them, as heatmaps.

    All datasets are binned on one common grid derived from
    ``get_data_boundaries``, so their histograms can be subtracted bin by bin.

    Args:
        args_list: Sequence of ``(data, name)`` pairs; ``data`` is an array whose
            first two columns are the coordinates.
        difference: If False, one count heatmap per dataset. If True, heatmaps of
            ``counts(after) - counts(before)``.
        bin_size: Bin edge length on both axes.
        width, height: Figure size in pixels.
        all_differences: With ``difference=True``, plot every pair of datasets
            (first pair visible, the others toggled through the legend) using a fixed
            colour range of +/-110. Otherwise exactly two datasets are required.

    Returns:
        The ``go.Figure`` with the heatmap traces.
    """
    data_list, name_list = zip(*args_list)
    if difference and not all_differences:
        assert len(data_list) == 2, "To plot a difference, please provide only 2 data sources"
    boundaries = get_data_boundaries(data_list)
    fig = go.Figure()

    # Bin every dataset exactly once. They share the boundaries and the bin size, so
    # the bin centres are identical for all of them.
    hists = []
    xcenters = ycenters = None
    for data in data_list:
        hist, xcenters, ycenters = discretize_data(data, boundaries, bin_size)
        hists.append(hist)

    if difference:
        if all_differences:
            index_pairs = itertools.combinations(range(len(hists)), 2)
            for i, (before, after) in enumerate(index_pairs):
                diff = hists[after] - hists[before]
                label = f"|{name_list[after]}|<br>-|{name_list[before]}|"
                fig.add_trace(go.Heatmap(x=xcenters, y=ycenters, z=diff, zmid=0, zmax=110, zmin=-110, colorscale='RdBu', name=label, showlegend=True, visible=True if i == 0 else 'legendonly'))
        else:
            diff = hists[1] - hists[0]
            label = f"|{name_list[1]}|<br>-|{name_list[0]}|"
            fig.add_trace(go.Heatmap(x=xcenters, y=ycenters, z=diff, zmid=0, colorscale='RdBu', name=label, showlegend=True))
    else:
        # One colour range for all traces so the heatmaps are comparable.
        zmax = max(hist.max() for hist in hists)
        zmin = min(hist.min() for hist in hists)
        for i, (hist, name) in enumerate(zip(hists, name_list)):
            # NOTE(review): datasets whose name contains 'al1 good' get a 50x smaller
            # upper colour limit - presumably because they are much smaller; confirm.
            mult = 1/50 if 'al1 good' in name else 1
            fig.add_trace(go.Heatmap(x=xcenters, y=ycenters, z=hist, name=name, zmin=zmin, zmax=mult*zmax, showlegend=True, visible=True if i == 0 else 'legendonly'))

    # The title used to say "Difference" even when plain counts were shown.
    if difference:
        title = f'Difference in distribution: # of datapoints per bin ({bin_size=})'
    else:
        title = f'Distribution: # of datapoints per bin ({bin_size=})'
    fig.update_layout(title=title,
                      xaxis_title='X',
                      yaxis_title='Y',
                      width=width, height=height,
                      legend=dict(x=1.2, y=1))

    return fig


## Preprocessing & fitting

In [None]:
# Load the descriptor table filtered to the MOSES + BindingDB molecules.
descriptors = pd.read_pickle(SAMPLING_PATH + 'descriptors/descriptors_moses+bindingdb.pkl')
# Columns removed before PCA: the SMILES string itself plus a fixed set of descriptor
# columns. NOTE(review): presumably these descriptors can be NaN/inf or numerically
# extreme - confirm why each is excluded.
blacklist = ['smile', 'Ipc', 'AvgIpc', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'MaxAbsPartialCharge', 'MaxPartialCharge', 'MinAbsPartialCharge', 'MinPartialCharge']
descriptors = descriptors.drop(columns=blacklist)
print(descriptors.shape)


(2894910, 195)


In [None]:
pickle.dump(descriptors.columns.to_list(), open(PICKLES + 'descriptors_moses+bindingdb_columnlist.pkl', 'wb'))

In [None]:
scaler, pca = fit_pca(descriptors, n=100)
# Save under the exact path the visualisation cell loads from. The previous file name
# was misspelled ('bingingdb') and pointed at a different directory, so a fresh run
# could not find the weights it had just written.
with open(f"{SAMPLING_PATH}pca_weights/scaler_pca_moses+bindingdb.pkl", 'wb') as f:
    pickle.dump((scaler, pca), f)

## Visualization

In [None]:
# Fitted scaler/PCA and the descriptor column order they were fitted on.
with open(f"{SAMPLING_PATH}pca_weights/scaler_pca_moses+bindingdb.pkl", 'rb') as f:
    scaler, pca = pickle.load(f)
with open(f"{SAMPLING_PATH}descriptors/descriptors_moses+bindingdb_columnlist.pkl", 'rb') as f:
    columns = pickle.load(f)


def load_baseline(fname):
    """Load a descriptor pickle of generated molecules, one row per unique 'smiles'."""
    return pd.read_pickle(f"{SAMPLING_PATH}descriptors/{fname}.pkl").drop_duplicates(subset='smiles')


gpt_base = load_baseline('model1_baseline_temp1.0')
gpt_sub1 = load_baseline('model1_softsub_al1_temp1.0')
gpt_div1 = load_baseline('model1_softdiv_al1_temp1.0')
gpt_sub2 = load_baseline('model1_softsub_al2_temp1.0')
gpt_div2 = load_baseline('model1_softdiv_al2_temp1.0')

# File-name prefixes of the active-learning training sets of each model.
pre_base = 'model1_baseline_threshold11'
pre_sub1 = 'model1_softsub_al1_threshold11'
pre_div1 = 'model1_softdiv_al1_threshold11'
pre_sub2 = 'model1_softsub_al2_threshold11'
pre_div2 = 'model1_softdiv_al2_threshold11'


def make_training_set_loader(generated, prefix):
    """Build a loader that restricts ``generated`` to one AL training set.

    The returned function takes ``fname``, reads
    ``{AL_PATH}training_sets/{prefix}_{fname}.csv`` and returns the rows of
    ``generated`` whose 'smiles' value occurs in that file.
    """
    def load(fname):
        selected = pd.read_csv(f"{AL_PATH}training_sets/{prefix}_{fname}.csv")['smiles'].unique()
        return generated[generated['smiles'].isin(selected)]
    return load


# One loader per generated set (replaces five copy-pasted lambdas).
l_base = make_training_set_loader(gpt_base, pre_base)
l_sub1 = make_training_set_loader(gpt_sub1, pre_sub1)
l_div1 = make_training_set_loader(gpt_div1, pre_div1)
l_sub2 = make_training_set_loader(gpt_sub2, pre_sub2)
l_div2 = make_training_set_loader(gpt_div2, pre_div2)


In [None]:
print(l_base('linear').shape, l_base('linear_noscore').shape, l_base('softmax_divf0.25').shape, l_base('softmax_sub').shape)
print(l_sub1('softmax_sub').shape, l_sub1('softmax_sub_noscore').shape, l_div1('softmax_divf0.25').shape, l_div1('softmax_divf0.25_noscore').shape,)
print(l_sub2('softmax_sub').shape, l_sub2('softmax_sub_noscore').shape, l_div2('softmax_divf0.25').shape, l_div2('softmax_divf0.25_noscore').shape,)

(5304, 210) (5304, 210) (5304, 210) (5304, 210)
(5406, 210) (5406, 210) (5363, 210) (5363, 210)
(5527, 210) (5527, 210) (5466, 210) (5466, 210)


In [None]:
# Number of molecules drawn from each generated set, and the seed that makes the draw
# reproducible.
sample = 5_000
seed = 42
# Named hex colours used for the scatter traces.
colors = {"gunmetal": "#31393C", "blue": "#4361EE", "plum": "#8E338C", "grape": "#7209B7", "red": "#D90429", "orange": "#FF7B00", "yellow": "#FFBA08", "mindaro": "#CBFF8C"}

# scatter(loader, fname): first two PCA components of the training set `fname` obtained
# through one of the l_* loaders.
scatter = lambda loader, fname: pca_transform(pca, scaler.transform(loader(fname)[columns]), n=2)
# Overlay random samples of the generated sets with one training set in PCA space.
plot_pca(
    [
    (*pca_transform(pca, scaler.transform(gpt_base[columns].sample(n=sample, random_state=seed)), n=2), colors["gunmetal"], f'GPT baseline'),
    (*pca_transform(pca, scaler.transform(gpt_sub1[columns].sample(n=sample, random_state=seed)), n=2), colors["grape"], f'GPT Softmax Sub'),
    (*pca_transform(pca, scaler.transform(gpt_div1[columns].sample(n=sample, random_state=seed)), n=2), colors["red"], f'GPT Softmax Div0.25'),
    # Disabled traces (scatter takes a loader as its first argument):
    #(*scatter(l_base, 'linear'), colors["blue"], f'AL1 Linear'),
    (*scatter(l_base, 'linear_noscore'), colors["mindaro"], f'AL1 Diffusion'),
    #(*scatter(l_base, 'softmax_divf0.25'), colors["grape"], f'AL1 Softmax Div0.25'),
    #(*scatter(l_base, 'softmax_sub'), colors["red"], f'AL1 Softmax Sub'),
    ], yscale=1.5
).show()

In [None]:
#@title
import matplotlib.pyplot as plt

# Explained-variance overview of the fitted PCA.
var_arr = pca.explained_variance_ratio_
print(sum(var_arr))      # share of variance captured by all fitted components
print(sum(var_arr[:2]))  # share captured by the two components used in the 2-D plots
cum_varr = np.cumsum(var_arr)
plt.plot(var_arr)                                   # per-component ratio
plt.plot([1 for _ in range(len(var_arr))])          # horizontal reference line at 1
plt.plot(cum_varr)                                  # cumulative ratio

In [None]:
# heat(loader, fname): (n_rows, 2) array with the first two PCA components of the
# training set `fname` obtained through one of the l_* loaders.
heat = lambda loader, fname: pca.transform(scaler.transform(loader(fname)[columns]))[:, :2]
# Pairwise differences of per-bin counts between a baseline sample and the training
# sets selected from the baseline model. n=5_304 equals the row count printed for the
# l_base training sets.
plot_heatmap([
    (pca.transform(scaler.transform(gpt_base[columns].sample(n=5_304, random_state=seed)))[:, :2], f'GPT baseline'),
    (heat(l_base, 'linear_noscore'), f'Base Diffusion'),
    # Disabled entries; al1_softmax_div_nosc / al1_softmax_sub_nosc are not defined in
    # the visible notebook:
    # (pca.transform(scaler.transform(al1_softmax_div_nosc[columns]))[:, :2], f'AL1 Diffusion Softmax Div0.25'),
    # (pca.transform(scaler.transform(al1_softmax_sub_nosc[columns]))[:, :2], f'AL1 Diffusion Softmax Sub'),
    (heat(l_base, 'linear'), f'AL1 Linear'),
    (heat(l_base, 'softmax_divf0.25'), f'AL1 Softmax Div0.25'),
    (heat(l_base, 'softmax_sub'), f'AL1 Softmax Sub'),
], difference=True, all_differences=True, bin_size=1.5, width=900, height=500)



In [None]:
# Pairwise count differences for the first "softsub" round. The sample sizes 5_304 and
# 5_406 equal the row counts printed for the l_base and l_sub1 training sets.
plot_heatmap([
    (pca.transform(scaler.transform(gpt_base[columns].sample(n=5_304, random_state=seed)))[:, :2], f'GPT baseline'),
    (pca.transform(scaler.transform(gpt_sub1[columns].sample(n=5_406, random_state=seed)))[:, :2], f'GPT Sub1'),
    (heat(l_sub1, 'softmax_sub_noscore'), f'Sub1 Diffusion'),
    (heat(l_sub1, 'softmax_sub'), f'AL2 Softmax Sub'),
], difference=True, all_differences=True, bin_size=1.5, width=900, height=500)

In [None]:
# Pairwise count differences for the first "softdiv" round. The sample sizes 5_304 and
# 5_363 equal the row counts printed for the l_base and l_div1 training sets.
plot_heatmap([
    (pca.transform(scaler.transform(gpt_base[columns].sample(n=5_304, random_state=seed)))[:, :2], f'GPT baseline'),
    (pca.transform(scaler.transform(gpt_div1[columns].sample(n=5_363, random_state=seed)))[:, :2], f'GPT Div1'),
    (heat(l_div1, 'softmax_divf0.25_noscore'), f'Div1 Diffusion'),
    (heat(l_div1, 'softmax_divf0.25'), f'AL2 Softmax Div'),
], difference=True, all_differences=True, bin_size=1.5, width=900, height=500)

In [None]:
# Pairwise count differences for the second "softsub" round.
# The raw GPT sample is sized to match its training set so the per-bin differences
# compare equally large populations. The previous hard-coded n=5_406 was the size of
# the Sub1 training set, not of the Sub2 one.
sub2_diffusion = heat(l_sub2, 'softmax_sub_noscore')
plot_heatmap([
    (pca.transform(scaler.transform(gpt_base[columns].sample(n=5_304, random_state=seed)))[:, :2], f'GPT baseline'),
    (pca.transform(scaler.transform(gpt_sub2[columns].sample(n=len(sub2_diffusion), random_state=seed)))[:, :2], f'GPT Sub2'),
    (sub2_diffusion, f'Sub2 Diffusion'),
    (heat(l_sub2, 'softmax_sub'), f'AL3 Softmax Sub'),
], difference=True, all_differences=True, bin_size=1.5, width=900, height=500)

In [None]:
# Leftover inspection cell: evaluating the bare name only displays the loader's repr.
l_div2

In [None]:
# Pairwise count differences for the second "softdiv" round.
# The raw GPT sample is sized to match its training set so the per-bin differences
# compare equally large populations. The previous hard-coded n=5_363 was the size of
# the Div1 training set, not of the Div2 one.
div2_diffusion = heat(l_div2, 'softmax_divf0.25_noscore')
plot_heatmap([
    (pca.transform(scaler.transform(gpt_base[columns].sample(n=5_304, random_state=seed)))[:, :2], f'GPT baseline'),
    (pca.transform(scaler.transform(gpt_div2[columns].sample(n=len(div2_diffusion), random_state=seed)))[:, :2], f'GPT Div2'),
    (div2_diffusion, f'Div2 Diffusion'),
    (heat(l_div2, 'softmax_divf0.25'), f'AL3 Softmax Div'),
], difference=True, all_differences=True, bin_size=1.5, width=900, height=500)

# Distribution Analysis

## Definitions

In [7]:
import numpy as np
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

def load_dist(fname):
    """Return the 'score' column of ``scored_dataframes/{fname}.csv`` as a NumPy array."""
    scored = pd.read_csv(f"{SCORING_PATH}scored_dataframes/{fname}.csv")
    return scored['score'].to_numpy()

def compute_cluster_scores(fname):
    """Return the mean score of every cluster in a scored dataframe.

    Reads ``{SCORING_PATH}scored_dataframes/{fname}.csv`` and averages the 'score'
    values that share a 'cluster_id'.

    Args:
        fname: File stem (without ``.csv``) of the scored dataframe.

    Returns:
        1-D NumPy array with one mean per cluster, ordered by the first appearance
        of each cluster in the file.
    """
    good_data = pd.read_csv(f"{SCORING_PATH}scored_dataframes/{fname}.csv")
    cluster_to_scores = {}
    # Walk the two needed columns directly. DataFrame.iterrows() builds a full Series
    # for every row, which is needlessly slow here.
    for cluster_id, score in zip(good_data['cluster_id'], good_data['score']):
        cluster_to_scores.setdefault(cluster_id, []).append(score)
    return np.array([np.mean(scores) for scores in cluster_to_scores.values()])

# Two-shade palettes: each colour name maps to a (dark, light) pair of hex codes.
# create_hist_trace fills the histogram bars with the light shade (index 1) and draws
# the density curve in the dark shade (index 0).
biscale = {"blue": ("#03045e", "#023e8a"), "purple": ("#7b2cbf", "#c77dff"), "green": ("#008000", "#70e000"), "red": ("#a4133c", "#ff4d6d"), "brown": ("#6f4518", "#bc8a5f")}

def create_hist_trace(i, data, label, color, threshold, bin_step):
    """Build the histogram bar trace, KDE curve and summary annotation for one dataset.

    Args:
        i: Position of the dataset in the plot; shifts the annotation down by
            0.04 (paper units) per position.
        data: 1-D NumPy array of scores.
        label: Name shown in the legend and in the annotation.
        color: Key into ``biscale``.
        threshold: Score threshold for the "% > threshold" statistic.
        bin_step: Integer width of the histogram bins; bins start at 0.

    Returns:
        Tuple ``([bar_trace, density_trace], annotation_dict)``.
    """
    # bw_method=.25 fixes the covariance factor through the public API, replacing the
    # override of `covariance_factor` plus the private `_compute_covariance()` call.
    density = gaussian_kde(data, bw_method=.25)
    xs = np.linspace(np.min(data), np.max(data), 200)

    # The last edge must lie above max(data). With `int(max) + 2` as the stop value,
    # bin_step > 1 could end the edges below the maximum and drop the top scores.
    # For bin_step == 1 the edges are unchanged.
    bin_stop = int(np.max(data)) + bin_step + 1
    hist_vals, bin_edges = np.histogram(data, bins=range(0, bin_stop, bin_step), density=True)
    hist = go.Bar(x=bin_edges[:-1], y=hist_vals, name=label, opacity=0.6, marker=dict(color=biscale[color][1]),
                  hovertemplate=[f'[{int(edge)}, {int(edge + bin_step)})' for edge in bin_edges[:-1]])
    density_curve = go.Scatter(x=xs, y=density(xs), mode='lines', name=label + ' Density', line=dict(color=biscale[color][0]))

    above_threshold_pct = np.sum(data > threshold) / len(data) * 100
    q25, q50, q75, q90 = np.percentile(data, [25, 50, 75, 90])
    max_score = data.max()  # named so the builtin `max` is not shadowed

    annotation = dict(
        x=0.95, y=1.0 - 0.04*i, xref='paper', yref='paper',
        text=f"{label}: % > threshold = {above_threshold_pct:.2f}, Q25 = {q25:.2f}, Q50 = {q50:.2f}, Q75 = {q75:.2f}, Q90 = {q90:.2f}, max = {max_score:.2f}",
        showarrow=False, font=dict(size=12))

    return [hist, density_curve], annotation

def plot_hist_density(data_list, title_spec, threshold=11, bin_step=2):
    """Show overlaid score histograms with density curves for several datasets.

    Args:
        data_list: Sequence of ``(data, label, color)`` tuples, unpacked into
            ``create_hist_trace``.
        title_spec: Word inserted into the figure title.
        threshold: Score at which a dashed vertical line is drawn; also passed to
            ``create_hist_trace``.
        bin_step: Histogram bin width passed to ``create_hist_trace``.

    The figure is styled with ``Graph`` and displayed; nothing is returned.
    """
    graph = Graph()
    traces = []
    annotations = []
    for position, entry in enumerate(data_list):
        trace_pair, note = create_hist_trace(position, *entry, threshold, bin_step)
        traces += trace_pair
        annotations.append(note)

    # Dashed vertical line at the threshold, spanning the full plot height.
    threshold_line = dict(
        type="line", x0=threshold, x1=threshold, y0=0, y1=1,
        yref='paper',
        line=dict(color="grey", width=2, dash='longdash'),
    )
    layout = go.Layout(bargap=0.2, barmode='overlay', shapes=[threshold_line],
                       annotations=annotations)
    fig = go.Figure(data=traces, layout=layout)

    style_overrides = dict(
        title=f'Distribution of {title_spec} scores based on DiffDock Poses',
        xaxis_title='Prolif Score',
        yaxis_title='Rel. Frequency',
        width=1280,
        height=600,
    )
    graph.update_parameters(style_overrides)
    graph.style_figure(fig)
    fig.show()

## Individual Scores

In [21]:
plot_hist_density([
    (load_dist('model1_baseline'), 'Baseline', 'blue'),
    #(load_dist('model1_baseline_random'), 'Baseline Random', 'red'),
    #(load_dist('model1_random_al1'), 'Random Al1', 'purple'),
    #(load_dist('model1_softdiv_al1'), 'SoftDiv Al1', 'purple'), #(load_dist('model1_softdiv_al2'), 'SoftDiv Al2', 'red'),
    (load_dist('model1_softsub_al1'), 'SoftSub Al1', "green"), #(load_dist('model1_softsub_al2'), 'SoftSub Al2', "brown")
], title_spec='ligand', bin_step=1, threshold=11)

In [None]:
plot_hist_density([
    (compute_cluster_scores('model1_baseline'), 'Baseline', 'blue'),
    (compute_cluster_scores('model1_baseline_random'), 'Baseline Random', 'red'),
    # (compute_cluster_scores('model1_softdiv_al1'), 'SoftDiv Al1', 'purple'), (compute_cluster_scores('model1_softsub_al1'), 'SoftSub Al1', "green"),
    # (compute_cluster_scores('model1_softdiv_al2'), 'SoftDiv Al2', 'red'), (compute_cluster_scores('model1_softsub_al2'), 'SoftSub Al2', "brown")
], title_spec='cluster', bin_step=1, threshold=11)

ValueError: ignored

In [None]:
import plotly.graph_objs as go

def plot_correlation_circle(pca, features):
    """Show the first two rows of ``pca.components_`` against a unit circle.

    Each feature is placed at (value in component 1, value in component 2) and
    labelled with the matching entry of ``features``. The axis titles carry the
    explained-variance percentage of the two components. The figure is displayed;
    nothing is returned.
    """
    loadings = pca.components_
    angles = np.linspace(0, 2*np.pi, 100)

    def axis_settings(component):
        # Identical settings for both axes apart from the component in the title.
        return dict(
            title=f"PC{component + 1} ({pca.explained_variance_ratio_[component]*100:.2f}%)",
            range=[-1.1, 1.1],
            zeroline=False,
            showgrid=True,
            domain=[0, 1],
        )

    # One labelled point per feature.
    feature_trace = go.Scatter(
        x=loadings[0, :],
        y=loadings[1, :],
        mode='lines+markers+text',
        text=features,
        textposition='top center',
        line=dict(color='red'),
        marker=dict(size=10, color='blue'),
        textfont=dict(size=8)
    )

    # Unit circle for reference.
    circle_trace = go.Scatter(
        x=np.cos(angles),
        y=np.sin(angles),
        mode='lines',
        line=dict(color='blue', width=1),
        showlegend=False
    )

    figure_layout = go.Layout(
        title = 'Correlation Circle',
        autosize=False,
        width=800,
        height=800,
        showlegend=False,
        xaxis=axis_settings(0),
        yaxis=axis_settings(1)
    )

    go.Figure(data=[feature_trace, circle_trace], layout=figure_layout).show()

# Requires `pca` (the fitted sklearn PCA) and `descriptors` (the descriptor DataFrame
# from the preprocessing cell) to exist in the kernel; its column names label the points.
# NOTE(review): the saved `columns` list presumably holds the same names and would avoid
# reloading the full descriptor table - confirm.
plot_correlation_circle(pca, descriptors.columns.values)


In [None]:
pca.explained_variance_ratio_

array([0.18405689, 0.05313747, 0.05101234, 0.03719958, 0.03377713,
       0.02607806, 0.02453578, 0.02168244, 0.01957461, 0.01764923,
       0.01653251, 0.01558626, 0.01462157, 0.01440143, 0.01360093,
       0.01318905, 0.0122997 , 0.01160566, 0.01057536, 0.01029826,
       0.01015372, 0.00996571, 0.00888317, 0.00863959, 0.00848849,
       0.00841659, 0.00778453, 0.00752196, 0.00730309, 0.00690913,
       0.00679754, 0.00642682, 0.00639788, 0.00625259, 0.00621522,
       0.00600859, 0.00589496, 0.00575097, 0.00570769, 0.00565048,
       0.00563242, 0.00553329, 0.00549129, 0.00546999, 0.00539551,
       0.00537414, 0.00532685, 0.00529351, 0.00526975, 0.0052222 ,
       0.00520911, 0.00520831, 0.00520489, 0.00518984, 0.00518221,
       0.00515622, 0.00514502, 0.00510856, 0.00507628, 0.0050335 ,
       0.00500594, 0.00495525, 0.00492759, 0.00488869, 0.00482122,
       0.00477146, 0.0047189 , 0.00457945, 0.00455428, 0.00445394,
       0.00440963, 0.00428209, 0.00416083, 0.00411194, 0.00395