In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import DataLoader
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from model.featurisation import smiles2graph
from model.CL_model_vas_info import GNNModelWithNewLoss
import pandas as pd


In [2]:
df = pd.read_csv("./data/vsa.csv")  
smiles_list = df["SMILES"].tolist()
smr_vsa_list = [list(map(float, row.split())) for row in df["SMR_VSA"]]

In [3]:
def read_vsa_data(vsa_file):
    df = pd.read_csv(vsa_file)

    def parse_vsa(s):
        try:
            return list(map(float, s.strip('[]').split()))
        except:
            return []

    smr_arrays = df["SMR_VSA"].apply(parse_vsa).tolist()          
    slogp_arrays = df["SlogP_VSA"].apply(parse_vsa).tolist()     
    peoe_arrays = df["PEOE_VSA"].apply(parse_vsa).tolist()       

    properties = list(zip(smr_arrays, slogp_arrays, peoe_arrays))
    
    return df["SMILES"].tolist(), properties

x_smiles, properties = read_vsa_data("./data/vsa.csv")

In [4]:
data_list = smiles2graph(
    x_smiles, y=None, cluster=None, properties=properties, test=False
)
data_list[0]

Data(x=[21, 79], edge_index=[2, 44], edge_attr=[44, 10], global_features=[5], smiles='Cc1cccc(C2=CCN(C(=O)NCCCC#N)CC2)c1', property_0=[1, 10], property_1=[1, 10], property_2=[1, 14])

In [5]:
import torch
from torch_geometric.data import DataLoader
devices = ["cuda" if torch.cuda.is_available() else "cpu"]
model1 = GNNModelWithNewLoss(
        num_node_features=data_list[0].x.shape[1],
        num_edge_features=data_list[0].edge_attr.shape[1],
        num_global_features=data_list[0].global_features.shape[0],
        hidden_dim=512,
        dropout_rate=0.1,
        property_index=1 ,
        save_path= 'premodels/0' 
    ).to(devices[0])

In [6]:
device = devices[0]
ckpt = torch.load(f'premodels/{0}/best_model.pth', map_location=device)
model1.load_state_dict(ckpt['encoder_state_dict'])
model1.eval()

  ckpt = torch.load(f'premodels/{0}/best_model.pth', map_location=device)


GNNModelWithNewLoss(
  (conv1): GATConv(79, 512, heads=1)
  (conv2): GATConv(512, 512, heads=1)
  (conv3): GATConv(512, 512, heads=1)
  (bn1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (bn2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (bn3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (projection_head): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=64, bias=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
)

In [7]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def compute_distances(model, dataloader, anchor_smiles="O", vsa_property='property_0', weight_vsa=0.5, weight_embedding=0.5):
    """
    计算每个分子与锚点（水分子）之间的表示距离
    vsa_property: 选择的VSA分量，可以是 'property_0', 'property_1', 或 'property_2'
    """
    anchor_embedding = None
    anchor_vsa = None
    for data in dataloader:
        data = data.to(model.device)
        prop = model.get_property(data)
        embedding = model._project(model.forward(data))

        if "O" in data.smiles:  # 如果找到水分子
            anchor_embedding = embedding
            anchor_vsa = prop[vsa_property]
            break
    else:
        # 如果没有找到水分子（'O'），则选择第一个分子，并打印它的SMILES
        data = dataloader.dataset[0]
        anchor_smiles = data.smiles
        print(f"Water molecule (O) not found. Using the first molecule: {anchor_smiles}")
        anchor_embedding = model._project(model.forward(data.to(model.device)))
        anchor_vsa = model.get_property(data)
    embedding_distances = []
    vsa_distances = []
    for data in dataloader:
        data = data.to(model.device)
        prop = model.get_property(data)
        embedding = model._project(model.forward(data))

        # 计算embedding距离（余弦相似度）
        embedding_dist = 1 - cosine_similarity(anchor_embedding.cpu().detach().numpy(), embedding.cpu().detach().numpy())
        embedding_distances.append(embedding_dist.flatten())

        # 计算VSA距离（余弦相似度）
        anchor_vsa_cpu = anchor_vsa.cpu().detach().numpy().reshape(1, -1)
        prop_vsa_cpu = prop.cpu().detach().numpy().reshape(1, -1)
        vsa_dist = 1 - cosine_similarity(anchor_vsa_cpu, prop_vsa_cpu)
        vsa_distances.append(vsa_dist.flatten())

    # 将embedding距离和VSA距离进行标准化
    scaler = StandardScaler()
    embedding_distances_scaled = scaler.fit_transform(np.array(embedding_distances).reshape(-1, 1)).flatten()
    vsa_distances_scaled = scaler.fit_transform(np.array(vsa_distances).reshape(-1, 1)).flatten()


    return  np.array(vsa_distances_scaled), np.array(embedding_distances_scaled )
   

# 改进散点图的可视化
def plot_distances_vs_vsa(embedding_distances, vsa_distances, vsa_label):
    """
    绘制距离与VSA分量的关系图
    vsa_label: VSA分量的标签，用于绘制标题
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(embedding_distances, vsa_distances, alpha=0.7, edgecolors='w', s=30, cmap='viridis')
    plt.xlabel('Embedding Distance to Anchor Molecule (Water)')
    plt.ylabel(f'VSA Component ({vsa_label}) Distance')
    plt.title(f'Embedding Distance vs VSA Component ({vsa_label}) Distance')
    plt.colorbar(label='Cluster')
    plt.grid(True)
    plt.show()

# 聚类并改进图形显示
# 聚类并打印最靠近中心的10个分子的SMILES
def cluster_and_plot(embedding_distances, vsa_distances, n_clusters=20, dataloader=None):
    # 标准化距离以进行聚类
    scaler = StandardScaler()
    distances_scaled = scaler.fit_transform(embedding_distances.reshape(-1, 1))

    # 使用KMeans进行聚类
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(distances_scaled)

    # 绘制聚类结果
    plt.figure(figsize=(8, 6))
    plt.scatter(embedding_distances, vsa_distances, c=kmeans.labels_, cmap='viridis', alpha=0.6, edgecolors='w', s=30)
    plt.xlabel('Embedding Distance to Anchor Molecule (Water)')
    plt.ylabel('VSA Distance')
    plt.title('Molecule Clusters Based on Embedding Distance and VSA Distance')
    plt.colorbar(label='Cluster')
    plt.grid(True)
    plt.show()

    # 对于每个聚类，获取最靠近中心的10个分子并输出其SMILES
    for cluster_id in range(n_clusters):
        cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
        cluster_center = kmeans.cluster_centers_[cluster_id]
        cluster_distances = embedding_distances[cluster_indices]
        
        # 找到最靠近中心的10个分子
        closest_indices = cluster_indices[np.argsort(np.abs(cluster_distances - cluster_center))[:10]]
        print(f"Cluster {cluster_id}:")
        
        # 输出每个最靠近中心的分子的SMILES和其他信息
        for idx in closest_indices:
            smiles = dataloader.dataset[idx].smiles  # 获取该分子的SMILES
            print(f"  Molecule index: {idx} (SMILES: {smiles}, Embedding Distance: {embedding_distances[idx]}, VSA Distance: {vsa_distances[idx]})")






In [8]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler

# 计算Embedding和VSA之间的皮尔逊相关系数
def compute_correlation(embedding_distances, vsa_distances):
    correlation_matrix = np.corrcoef(embedding_distances, vsa_distances)
    print(f"Correlation coefficient between Embedding and VSA: {correlation_matrix[0, 1]}")

# 使用PCA降维并可视化
def apply_pca_and_visualize(embedding_distances, vsa_distances):
    # 将数据标准化
    scaler = StandardScaler()
    data = np.vstack((embedding_distances, vsa_distances)).T
    data_scaled = scaler.fit_transform(data)

    # 应用PCA降维
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(data_scaled)

    # 可视化PCA结果
    plt.figure(figsize=(8, 6))
    plt.scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.6)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title('PCA Visualization of Embedding and VSA Distances')
    plt.grid(True)
    plt.show()

# 示例
embedding_distances = np.array(embedding_distances)
vsa_distances = np.array(vsa_distances)

# 计算相关性
compute_correlation(embedding_distances, vsa_distances)

# 使用PCA降维并可视化
apply_pca_and_visualize(embedding_distances, vsa_distances)


NameError: name 'embedding_distances' is not defined

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataloader = DataLoader(data_list, batch_size=32, shuffle=False)

vsa_property = 'property_0'  # 可以选择 'property_0', 'property_1', 或 'property_2'
embedding_distances, vsa_distances = compute_distances(model1, dataloader, anchor_smiles="O", vsa_property=vsa_property)
plot_distances_vs_vsa(embedding_distances, vsa_distances, vsa_label=vsa_property)
cluster_and_plot(embedding_distances, vsa_distances)


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataloader = DataLoader(data_list, batch_size=32, shuffle=False)
model1.get_distribution(dataloader)

