In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

import numpy as np
from itertools import combinations
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split


# 1. Load the molecules that will form the training and test sets.
# The SMILES strings and their outcome labels are read from a CSV file;
# replace the path below with the location of your own dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs on the machine where this file exists.
df = pd.read_csv(r"C:\Users\Francisco\Downloads\HEK_curated_reduced_1-5.csv")

# Double brackets keep both selections as one-column DataFrames.
x = df[['SMILES']]
y = df[['Outcome']]

# 80/20 split, stratified on the outcome column, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Parse every training SMILES string into an RDKit molecule.
molecules_train = [Chem.MolFromSmiles(smile) for smile in X_train['SMILES']]

# Chem.MolFromSmiles returns None (instead of raising) when a SMILES string
# cannot be parsed. Passing None to the fingerprint function fails with an
# opaque argument-type error, so stop here with a message that names the
# offending entries.
invalid_train_smiles = [
    smile
    for smile, mol in zip(X_train['SMILES'], molecules_train)
    if mol is None
]
if invalid_train_smiles:
    raise ValueError(
        f"{len(invalid_train_smiles)} training SMILES could not be parsed by RDKit, "
        f"e.g. {invalid_train_smiles[:5]}"
    )

# 2. Compute the fingerprints
# Morgan fingerprints (the ECFP equivalent) with radius 2, folded to 1024 bits.
fingerprints_train = [
    AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    for mol in molecules_train
]

# 3. Compute the similarities (Tanimoto coefficient) between all pairs of molecules
def calculate_tanimoto(fp_list):
    """Build the symmetric pairwise Tanimoto similarity matrix.

    Parameters
    ----------
    fp_list : sequence of RDKit bit-vector fingerprints
        All fingerprints are expected to have the same length (as produced
        with a fixed ``nBits``), because the bulk similarity routine compares
        bit vectors directly.

    Returns
    -------
    numpy.ndarray
        An ``(n, n)`` float array where entry ``[i, j]`` is the Tanimoto
        similarity between fingerprints ``i`` and ``j``. The diagonal is 1.0
        (every fingerprint is identical to itself). An empty input yields an
        array of shape ``(0, 0)``.
    """
    fps = list(fp_list)
    n = len(fps)

    # Start from the identity matrix so the self-similarities on the
    # diagonal are 1.0 rather than 0.0.
    sim_matrix = np.eye(n)

    # One bulk call per row compares fingerprint i against all earlier
    # fingerprints at once, instead of one Python-level call per pair.
    for i in tqdm(range(1, n), desc="Calculando similaridades"):
        row = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        sim_matrix[i, :i] = row  # lower triangle
        sim_matrix[:i, i] = row  # mirrored into the upper triangle
    return sim_matrix

# Full pairwise similarity matrix of the training fingerprints.
similarity_matrix = calculate_tanimoto(fingerprints_train)

# 4. Keep only the strictly upper triangle so each pair is counted once
#    and the diagonal is excluded.
similarity_values = similarity_matrix[np.triu_indices_from(similarity_matrix, k=1)]

# 5. Summarise the distribution of pairwise similarities.
mean_sim = similarity_values.mean()
median_sim = np.median(similarity_values)
std_sim = similarity_values.std()

print(f"Média de Similaridade: {mean_sim:.4f}")
print(f"Mediana de Similaridade: {median_sim:.4f}")
print(f"Desvio Padrão: {std_sim:.4f}")

# 6. Define the threshold.
# The 5th percentile of the pairwise similarities is used as an example cut-off.
# NOTE(review): this cut-off comes from the all-pairs distribution, while the
# domain check further down compares it with each test compound's *maximum*
# similarity to the training set -- confirm this is the intended criterion.
threshold = np.percentile(similarity_values, 5)
print(f"Tanimoto Similarity Threshold (5th percentile): {threshold:.4f}")

# 7. Plot the histogram of similarities, with the threshold as a dashed line.
plt.hist(similarity_values, bins=50, edgecolor='black')
plt.axvline(
    threshold,
    color='red',
    linestyle='dashed',
    linewidth=2,
    label=f'Threshold = {threshold:.2f}',
)
plt.xlabel('Tanimoto Similarity')
plt.ylabel('Frequency')
plt.title('Distribution of Tanimoto Similarities (HEK 293)')
plt.legend()
plt.show()

In [None]:
# Persist the pairwise similarities to disk. np.save appends ".npy" because
# the file name has no extension.
# NOTE(review): despite the "similarity_matrix" file name, the array written
# is the flattened upper-triangle vector `similarity_values`, not the matrix.
np.save(
    file=r'C:\Users\Francisco\Downloads\HEKsimilarity_matrix',
    arr=similarity_values,
)

In [None]:
def calculate_percentage_within_domain(test_fps, train_fps, threshold):
    """Return the percentage of test compounds inside the applicability domain.

    A test compound counts as inside the domain when its highest Tanimoto
    similarity to any training compound is greater than or equal to
    ``threshold``.

    Parameters
    ----------
    test_fps : iterable of RDKit bit-vector fingerprints
        Fingerprints of the compounds to check.
    train_fps : iterable of RDKit bit-vector fingerprints
        Fingerprints of the training compounds. They are expected to have the
        same length as the test fingerprints, because the bulk similarity
        routine compares bit vectors directly.
    threshold : float
        Minimum nearest-neighbour similarity for a compound to be in-domain.

    Returns
    -------
    float
        Percentage (0-100) of test compounds within the domain. Returns 0.0
        when ``test_fps`` is empty, instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``train_fps`` is empty while there are test compounds to check,
        since no nearest neighbour exists in that case.
    """
    # Materialise both inputs so they can be sized and iterated repeatedly.
    test_fps = list(test_fps)
    train_fps = list(train_fps)

    if not test_fps:
        return 0.0
    if not train_fps:
        raise ValueError(
            "train_fps is empty: cannot compute similarity to the training set"
        )

    within_domain_count = 0
    for test_fp in tqdm(test_fps, desc="Checking domain applicability"):
        # One bulk call gives the similarity to every training compound;
        # only the nearest neighbour matters for the domain check.
        max_similarity = max(DataStructs.BulkTanimotoSimilarity(test_fp, train_fps))
        if max_similarity >= threshold:
            within_domain_count += 1

    return (within_domain_count / len(test_fps)) * 100

# Parse every test SMILES string into an RDKit molecule.
molecules_test = [Chem.MolFromSmiles(smile) for smile in X_test['SMILES']]

# Chem.MolFromSmiles returns None (instead of raising) when a SMILES string
# cannot be parsed. Passing None to the fingerprint function fails with an
# opaque argument-type error, so stop here with a message that names the
# offending entries.
invalid_test_smiles = [
    smile
    for smile, mol in zip(X_test['SMILES'], molecules_test)
    if mol is None
]
if invalid_test_smiles:
    raise ValueError(
        f"{len(invalid_test_smiles)} test SMILES could not be parsed by RDKit, "
        f"e.g. {invalid_test_smiles[:5]}"
    )

# Same fingerprint settings as the training set (Morgan, radius 2, 1024 bits)
# so that the similarities are comparable.
test_fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
    for mol in molecules_test
]

# Share of test compounds whose nearest training neighbour reaches the threshold.
percentage_within = calculate_percentage_within_domain(test_fingerprints, fingerprints_train, threshold)
print(f"{percentage_within:.2f}% of the test compounds are within the domain of applicability.")
