- Handling warnings

In [2]:
import warnings
warnings.filterwarnings("ignore")

- Load modules

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import (silhouette_score, davies_bouldin_score, calinski_harabasz_score)
from sklearn.cluster import (KMeans, AffinityPropagation, AgglomerativeClustering, Birch, HDBSCAN,
                            BisectingKMeans, DBSCAN, MeanShift, MiniBatchKMeans, OPTICS, SpectralClustering)
import numpy as np
import json
from common_funtions import *

- Defining auxiliar functions

In [4]:
def get_profile(row, label_columns):
    active_labels = [col for col in label_columns if row[col] == 1]
    return '-'.join(active_labels)

- Loading and preparing data for word/sentence clustering

In [5]:
df_updated = pd.read_csv("../raw_data/curated_PrimaryOdor_1.csv")
df_updated.head(2)

Unnamed: 0,standardized_smiles,CAS-Id,fcfp_compact,fcfp_environments,odor_1,odor_2,odor_3,odor_4,odor_5,odor_6,...,odor_27,odor_28,odor_29,odor_30,odor_31,odor_32,odor_33,odor_34,odor_35,odor_36
0,BrC=Cc1ccccc1,103-64-0,4P///wAEAAANAAAAAAYGdnbepHxSzmQcKQQ8,"{""0"": [[1, 0], [2, 0]], ""4"": [[3, 0], [4, 0], ...",Green,Fruity,Vegetation,Fragrant,,,...,,,,,,,,,,
1,C#CC(C)(O)CCC=C(C)C,29171-20-8,4P///wAEAAAQAAAAAARaFrUBLEQhAEbEVkKoME59AAw=,"{""0"": [[0, 0], [1, 0], [2, 0], [3, 0], [5, 0],...",Sweet,Plants,Waxy,Ambrosial,Resinous,Grassy,...,,,,,,,,,,


In [6]:
df_updated.drop(columns = ["fcfp_compact", "fcfp_environments"], inplace=True)
df_updated.head(2)

Unnamed: 0,standardized_smiles,CAS-Id,odor_1,odor_2,odor_3,odor_4,odor_5,odor_6,odor_7,odor_8,...,odor_27,odor_28,odor_29,odor_30,odor_31,odor_32,odor_33,odor_34,odor_35,odor_36
0,BrC=Cc1ccccc1,103-64-0,Green,Fruity,Vegetation,Fragrant,,,,,...,,,,,,,,,,
1,C#CC(C)(O)CCC=C(C)C,29171-20-8,Sweet,Plants,Waxy,Ambrosial,Resinous,Grassy,Dry,Woody,...,,,,,,,,,,


In [7]:
cols = ["CAS-Id"] + [col for col in df_updated.columns if col != "CAS-Id"]
df_updated = df_updated[cols]
df_updated.columns = df_updated.columns.str.lower().str.replace("-", "_")
df_updated.head(2)

Unnamed: 0,cas_id,standardized_smiles,odor_1,odor_2,odor_3,odor_4,odor_5,odor_6,odor_7,odor_8,...,odor_27,odor_28,odor_29,odor_30,odor_31,odor_32,odor_33,odor_34,odor_35,odor_36
0,103-64-0,BrC=Cc1ccccc1,Green,Fruity,Vegetation,Fragrant,,,,,...,,,,,,,,,,
1,29171-20-8,C#CC(C)(O)CCC=C(C)C,Sweet,Plants,Waxy,Ambrosial,Resinous,Grassy,Dry,Woody,...,,,,,,,,,,


In [8]:
# df_data = pd.read_csv("../raw_data/curated_PrimaryOdor.csv")
df_data = df_updated.copy()
df_data.head(2)

Unnamed: 0,cas_id,standardized_smiles,odor_1,odor_2,odor_3,odor_4,odor_5,odor_6,odor_7,odor_8,...,odor_27,odor_28,odor_29,odor_30,odor_31,odor_32,odor_33,odor_34,odor_35,odor_36
0,103-64-0,BrC=Cc1ccccc1,Green,Fruity,Vegetation,Fragrant,,,,,...,,,,,,,,,,
1,29171-20-8,C#CC(C)(O)CCC=C(C)C,Sweet,Plants,Waxy,Ambrosial,Resinous,Grassy,Dry,Woody,...,,,,,,,,,,


In [9]:
df_data.columns

Index(['cas_id', 'standardized_smiles', 'odor_1', 'odor_2', 'odor_3', 'odor_4',
       'odor_5', 'odor_6', 'odor_7', 'odor_8', 'odor_9', 'odor_10', 'odor_11',
       'odor_12', 'odor_13', 'odor_14', 'odor_15', 'odor_16', 'odor_17',
       'odor_18', 'odor_19', 'odor_20', 'odor_21', 'odor_22', 'odor_23',
       'odor_24', 'odor_25', 'odor_26', 'odor_27', 'odor_28', 'odor_29',
       'odor_30', 'odor_31', 'odor_32', 'odor_33', 'odor_34', 'odor_35',
       'odor_36'],
      dtype='object')

- Get Unique odors

In [10]:
list_unique_odors = []

for i in range(1, 37):
    column = f"odor_{i}"
    unique_values = df_data[column].unique().tolist()
    unique_values = [str(value).replace(" ", "_").lower() for value in unique_values]
    list_unique_odors += unique_values

list_unique_odors = list(set(list_unique_odors))
len(list_unique_odors)

114

- Loading pre-trained model from sentenceTransformer: See https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 for more details

In [11]:
model = SentenceTransformer('all-MiniLM-L6-v2')

- Get embeddings for applying clustering strategies

In [12]:
matrix_for_clustering = []
list_words_for_clustering = []

for word in list_unique_odors:
    if str(word) != "nan":
        vector = model.encode(word)
        matrix_for_clustering.append(vector)
        list_words_for_clustering.append(word)

header = [f"p_{i+1}" for i in range(len(matrix_for_clustering[0]))]
df_for_clustering = pd.DataFrame(data=matrix_for_clustering, columns=header)
df_for_clustering.insert(0, 'word', list_words_for_clustering)
df_for_clustering

Unnamed: 0,word,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_375,p_376,p_377,p_378,p_379,p_380,p_381,p_382,p_383,p_384
0,septic,-0.053634,0.008882,0.028514,-0.025462,-0.090610,-0.046136,-0.001819,0.079025,0.015717,...,0.036398,0.030189,0.023783,-0.002196,-0.025275,0.054594,0.064645,0.011690,0.085835,-0.006902
1,boiled,-0.026973,-0.066337,-0.028082,0.114139,-0.047949,-0.057046,0.077117,-0.062182,-0.052604,...,0.066964,0.029938,0.042809,-0.006201,-0.047204,-0.023041,0.072196,0.061460,0.065978,-0.009908
2,woody,0.020421,-0.085284,0.054503,0.012303,0.019870,0.041864,0.121591,0.030242,0.037035,...,0.026619,-0.106251,-0.118209,0.009011,-0.034562,-0.048551,0.097677,-0.033269,0.047515,-0.017099
3,unripe,-0.072524,0.021161,-0.039231,0.030346,-0.082156,-0.030096,0.070469,0.003769,-0.028843,...,0.065919,0.012318,-0.002311,0.013394,-0.068031,0.064124,0.084176,0.040310,0.131497,-0.112128
4,marshy,-0.014556,-0.017524,0.010451,-0.029076,-0.001990,-0.086743,0.118538,-0.049482,-0.031996,...,0.028811,0.023573,-0.045643,-0.021003,-0.002776,0.029222,0.159024,0.005198,0.039764,0.025904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,bakery,0.020641,0.008084,-0.035948,0.018343,-0.076443,0.027311,0.088122,-0.015092,-0.008860,...,-0.013564,-0.017501,-0.017970,-0.065373,0.058383,0.049341,0.060496,-0.047795,0.032961,-0.073921
109,soft_floral,-0.052057,-0.058722,0.058388,0.047712,0.025695,-0.003336,0.082344,-0.009636,-0.002263,...,-0.041426,-0.004066,0.028492,0.078200,0.007123,-0.020773,-0.019956,0.043181,0.103909,0.014855
110,putrid,0.013540,0.041110,-0.063318,-0.010889,-0.036073,0.025942,0.076649,0.058998,0.054185,...,0.120393,0.046948,0.026376,0.038434,-0.085071,0.041670,0.089506,0.047948,0.078274,-0.022794
111,people_&_animals,-0.015200,0.002265,0.015997,0.082923,-0.061700,-0.010635,0.103583,-0.023647,0.014615,...,-0.066994,-0.090940,0.016231,0.012510,0.006004,-0.051919,0.043773,0.027442,0.025624,0.052591


In [13]:
df_for_clustering.to_csv("../data/odors_embedded.csv", index=False)