In [10]:
#!/usr/bin/env python3
"""
Compute Mol2Vec embeddings for each SMILES in a CSV.

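Usage:
    pip install rdkit-pypi mol2vec gensim pandas numpy
    python compute_mol2vec.py input.csv model_300dim.pkl output.csv

Note: only ``main()`` reads these command-line arguments. The
``if __name__ == "__main__":`` block at the bottom of this script does not
call ``main()``; it runs the same pipeline with hard-coded paths instead.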
"""

import sys
import pandas as pd
import numpy as np
from rdkit import Chem
from mol2vec.features import mol2alt_sentence, sentences2vec
from gensim.models import Word2Vec

def load_smiles_csv(path: str) -> pd.DataFrame:
    """Read a CSV file and return only its 'SMILES' and 'TASTE' columns.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A DataFrame holding the 'SMILES' and 'TASTE' columns, in that order.

    Raises:
        KeyError: If either required column is absent from the file.
    """
    required = ['SMILES', 'TASTE']
    frame = pd.read_csv(path)
    if not all(column in frame.columns for column in required):
        raise KeyError("Input CSV must contain 'SMILES' and 'TASTE' columns")
    return frame[required]

def inject_gensim4_shim(model: Word2Vec):
    """Attach a ``vocab`` mapping to ``model.wv`` for code that expects one.

    The mapping has one entry per key in ``model.wv.index_to_key``; every
    value is ``None``, so only key membership is meaningful.

    Args:
        model: Object whose ``wv`` attribute exposes ``index_to_key``.
            Modified in place; nothing is returned.
    """
    # Mol2Vec expects model.wv.vocab; rebuild it from the list of known keys.
    model.wv.vocab = dict.fromkeys(model.wv.index_to_key)

def smiles_to_sentences(smiles_list, radius: int = 1, unseen: str = 'UNK'):
    """Turn each SMILES entry into a Mol2Vec "sentence" of substructure tokens.

    Args:
        smiles_list: Iterable of SMILES values. Entries that are not strings
            (for example NaN from a blank CSV cell) are treated the same as
            SMILES that RDKit cannot parse.
        radius: Morgan radius forwarded to ``mol2alt_sentence``.
        unseen: Token used as a one-word placeholder sentence for entries
            that cannot be converted. It should match the ``unseen`` token
            later given to ``sentences2vec`` so those rows receive the
            unknown-token vector.

    Returns:
        A list with one sentence per input entry, in input order.
    """
    sentences = []
    for smi in smiles_list:
        # Chem.MolFromSmiles only accepts strings; guard against NaN/None so a
        # single missing value does not abort the whole run.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            # An empty sentence does not give the UNK vector: mol2vec's
            # sentences2vec sums the word vectors of a sentence, and the sum
            # of nothing is the scalar 0, which breaks the 2-D embedding
            # array. A one-token sentence yields a proper vector instead.
            # NOTE(review): confirm against the installed mol2vec version.
            sentences.append([unseen])
        else:
            sentences.append(mol2alt_sentence(mol, radius=radius))
    return sentences

def compute_embeddings(sentences, model: Word2Vec) -> np.ndarray:
    """Embed fragment sentences with Mol2Vec's ``sentences2vec``.

    Args:
        sentences: Fragment sentences, one per molecule.
        model: Trained model passed straight through to ``sentences2vec``.

    Returns:
        Whatever ``sentences2vec`` returns when called with ``unseen='UNK'``.
    """
    embeddings = sentences2vec(sentences, model, unseen='UNK')
    return embeddings


def main():
    """Run the Mol2Vec pipeline using paths taken from the command line.

    Expects exactly three arguments after the script name: the input CSV,
    the model file and the output CSV. With any other argument count it
    prints a usage line and exits with status 1. The output CSV contains
    the 'SMILES' and 'TASTE' columns followed by one ``mol2vec_<i>`` column
    per embedding dimension.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python compute_mol2vec.py input.csv model.pkl output.csv")
        sys.exit(1)
    input_csv, model_path, output_csv = cli_args

    print(f"Loading SMILES from {input_csv}...")
    smiles_frame = load_smiles_csv(input_csv)

    print(f"Loading Mol2Vec model from {model_path}...")
    w2v_model = Word2Vec.load(model_path)

    print("Converting SMILES to fragment sentences...")
    fragment_sentences = smiles_to_sentences(smiles_frame['SMILES'], radius=1)

    print("Computing embeddings...")
    embedding_matrix = compute_embeddings(fragment_sentences, w2v_model)
    # One named column per embedding dimension.
    embedding_frame = pd.DataFrame(
        embedding_matrix,
        columns=[f"mol2vec_{dim}" for dim in range(embedding_matrix.shape[1])],
    )

    print(f"Concatenating embeddings and saving to {output_csv}...")
    # Reset the index so rows line up positionally with the embeddings.
    combined = pd.concat(
        [smiles_frame.reset_index(drop=True), embedding_frame], axis=1
    )
    combined.to_csv(output_csv, index=False)

    print("Done.")

if __name__ == "__main__":
    # Script entry point. This block does not call main(); it repeats the same
    # pipeline with hard-coded paths, so command-line arguments are ignored.
    # Unlike main(), it drops the SMILES column before writing the output.
    input_csv   = "/home/pavit21178/BTP/redoing_work/datasets_base/combined.csv"
    model_path  = "model_300dim.pkl"  # relative path: resolved against the current working directory
    output_csv  = "/home/pavit21178/BTP/redoing_work/datasets_base/combined_with_mol2vec.csv"

    print(f"Loading SMILES from {input_csv}...")
    df = load_smiles_csv(input_csv)

    print(f"Loading Mol2Vec model from {model_path}...")
    # NOTE(review): inject_gensim4_shim() is defined above but never applied to
    # the loaded model — confirm whether the installed gensim/mol2vec need it.
    model = Word2Vec.load(model_path)

    print("Converting SMILES to fragment sentences...")
    sentences = smiles_to_sentences(df['SMILES'], radius=1)

    print("Computing embeddings...")
    vectors = compute_embeddings(sentences, model)
    # One output column per embedding dimension, named mol2vec_0, mol2vec_1, ...
    n_dims = vectors.shape[1]
    col_names = [f"mol2vec_{i}" for i in range(n_dims)]
    emb_df = pd.DataFrame(vectors, columns=col_names)

    print(f"Concatenating embeddings and saving to {output_csv}...")
    # Reset the index so the input rows align positionally with emb_df.
    out_df = pd.concat([df.reset_index(drop=True), emb_df], axis=1)
    # Keep only TASTE plus the embedding columns in the written file.
    out_df=out_df.drop(columns=["SMILES"])
    out_df.to_csv(output_csv, index=False)

    print("Done.")

Loading SMILES from /home/pavit21178/BTP/redoing_work/datasets_base/combined.csv...
Loading Mol2Vec model from model_300dim.pkl...
Converting SMILES to fragment sentences...
Computing embeddings...
Concatenating embeddings and saving to /home/pavit21178/BTP/redoing_work/datasets_base/combined_with_mol2vec.csv...
Done.


In [13]:
#!/usr/bin/env python3
"""
compute_rdkit_features.py

Reads a CSV with 'SMILES' and 'TASTE', computes RDKit 2D descriptors
and Morgan fingerprints, and writes out two CSVs:
  - descriptors + TASTE
  - Morgan bits + TASTE

Dependencies:
    pip install rdkit-pypi pandas numpy
"""

import sys
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

def load_smiles_csv(path: str) -> pd.DataFrame:
    """Load a CSV and keep just the 'SMILES' and 'TASTE' columns.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A two-column DataFrame: 'SMILES' first, then 'TASTE'.

    Raises:
        KeyError: If the file lacks a 'SMILES' or a 'TASTE' column.
    """
    table = pd.read_csv(path)
    has_smiles = 'SMILES' in table.columns
    has_taste = 'TASTE' in table.columns
    if not (has_smiles and has_taste):
        raise KeyError("Input CSV must contain 'SMILES' and 'TASTE' columns")
    wanted = ['SMILES', 'TASTE']
    return table[wanted]

def compute_rdkit_descriptors(df: pd.DataFrame) -> pd.DataFrame:
    """Compute every descriptor in ``Descriptors.descList`` for each SMILES.

    Args:
        df: DataFrame with 'SMILES' and 'TASTE' columns.

    Returns:
        A DataFrame indexed like ``df`` with one column per descriptor name,
        followed by the 'TASTE' column. Rows whose SMILES is missing
        (non-string, e.g. NaN from a blank CSV cell) or cannot be parsed by
        RDKit are filled with NaN.
    """
    # Split the (name, function) pairs once instead of on every molecule.
    desc_list = Descriptors.descList
    names = [name for name, _ in desc_list]
    funcs = [func for _, func in desc_list]

    records = []
    for smi in df['SMILES']:
        # Chem.MolFromSmiles only accepts strings; treat anything else as
        # unparseable rather than letting one missing value abort the run.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            records.append([np.nan] * len(names))
        else:
            records.append([func(mol) for func in funcs])

    # Reuse df's index so the concat below aligns rows with their TASTE label.
    desc_df = pd.DataFrame(records, columns=names, index=df.index)
    return pd.concat([desc_df, df['TASTE']], axis=1)

def compute_morgan_fingerprints(df: pd.DataFrame,
                                radius: int = 2,
                                n_bits: int = 2048) -> pd.DataFrame:
    """Compute a Morgan bit-vector fingerprint for each SMILES.

    Args:
        df: DataFrame with 'SMILES' and 'TASTE' columns.
        radius: Morgan radius passed to RDKit.
        n_bits: Length of the bit vector, i.e. the number of ``fp_bit_<i>``
            columns produced.

    Returns:
        A DataFrame indexed like ``df`` with columns ``fp_bit_0`` ..
        ``fp_bit_{n_bits - 1}`` followed by 'TASTE'. Rows whose SMILES is
        missing (non-string, e.g. NaN from a blank CSV cell) or cannot be
        parsed by RDKit are left as all zeros.
    """
    # Rows start as zeros, which is also the fallback for unusable SMILES.
    bits = np.zeros((len(df), n_bits), dtype=int)
    for row, smi in enumerate(df['SMILES']):
        # Chem.MolFromSmiles only accepts strings; skip anything else instead
        # of letting one missing value abort the run.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            continue
        bv = rdMolDescriptors.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=n_bits
        )
        bits[row] = np.array(bv, dtype=int)

    bitnames = [f"fp_bit_{i}" for i in range(n_bits)]
    # Reuse df's index so the concat below aligns rows with their TASTE label.
    fp_df = pd.DataFrame(bits, columns=bitnames, index=df.index)
    return pd.concat([fp_df, df['TASTE']], axis=1)


if __name__ == "__main__":
    # Script entry point: reads one input CSV and writes two feature CSVs
    # (RDKit descriptors + TASTE, and Morgan fingerprint bits + TASTE).
    # ─── Hard‑coded paths ───────────────────────────────────────────────────
    input_csv       = "/home/pavit21178/BTP/redoing_work/datasets_base/peptides.csv"
    descriptors_csv = "/home/pavit21178/BTP/redoing_work/datasets_base/peptides_rdkit_descriptors.csv"
    fingerprints_csv= "/home/pavit21178/BTP/redoing_work/datasets_base/peptides_morgan_fps.csv"
    # ────────────────────────────────────────────────────────────────────────

    print(f"[1/3] Loading SMILES+TASTE from {input_csv}...")
    data = load_smiles_csv(input_csv)

    print("[2/3] Computing RDKit 2D descriptors...")
    desc_df = compute_rdkit_descriptors(data)
    # compute_rdkit_descriptors() returns only descriptor columns plus TASTE,
    # so this drop is a no-op; errors='ignore' keeps it from raising.
    desc_df = desc_df.drop(columns=['SMILES'], errors='ignore')
    desc_df.to_csv(descriptors_csv, index=False)
    print(f"    -> Wrote descriptors to {descriptors_csv}")

    print("[3/3] Computing Morgan fingerprints...")
    fp_df = compute_morgan_fingerprints(data, radius=2, n_bits=2048)
    # Same as above: the fingerprint frame has no SMILES column to drop.
    fp_df = fp_df.drop(columns=['SMILES'], errors='ignore')
    fp_df.to_csv(fingerprints_csv, index=False)
    print(f"    -> Wrote fingerprints to {fingerprints_csv}")

    print("Done.")


[1/3] Loading SMILES+TASTE from /home/pavit21178/BTP/redoing_work/datasets_base/peptides.csv...
[2/3] Computing RDKit 2D descriptors...
    -> Wrote descriptors to /home/pavit21178/BTP/redoing_work/datasets_base/peptides_rdkit_descriptors.csv
[3/3] Computing Morgan fingerprints...
    -> Wrote fingerprints to /home/pavit21178/BTP/redoing_work/datasets_base/peptides_morgan_fps.csv
Done.
