# UniMol2 Feature Generation

This notebook shows how the UniMol2 representations were created for the training and test datasets.

In [2]:
import numpy as np
from unimol_tools import UniMolRepr

2025-08-13 11:38:25 | unimol_tools\weights\weighthub.py | 17 | INFO | Uni-Mol Tools | Weights will be downloaded to default directory: c:\Users\cbrzakala\AppData\Local\anaconda3\Lib\site-packages\unimol_tools\weights


In [3]:
# Build the Uni-Mol representation model used to embed molecules.
#   model_name: available options are unimolv1 and unimolv2.
#   model_size: only takes effect when model_name is unimolv2;
#               available options are 84m, 164m, 310m, 570m, 1.1B.
clf = UniMolRepr(
    model_name="unimolv2",
    model_size="84m",
    data_type="molecule",
    remove_hs=False,
)

2025-08-13 11:38:29 | unimol_tools\models\unimolv2.py | 144 | INFO | Uni-Mol Tools | Loading pretrained weights from c:\Users\cbrzakala\AppData\Local\anaconda3\Lib\site-packages\unimol_tools\weights\modelzoo/84M/checkpoint.pt


In [5]:
import pandas as pd

# Read the host and guest tables (host first, then guest).
host_df, guest_df = (
    pd.read_csv(csv_name) for csv_name in ("host_smiles.csv", "guest_smiles.csv")
)

# Pull the "smiles" column of each table out as a plain Python list.
host_smiles, guest_smiles = (
    frame["smiles"].tolist() for frame in (host_df, guest_df)
)

In [4]:
# Encode SMILES in fixed-size batches and return the representations as a DataFrame
def process_in_chunks(smiles_list, chunk_size=100, model=None):
    """
    Encode SMILES strings batch by batch and collect the "cls_repr" vectors.

    Args:
        smiles_list (list): SMILES strings to encode; output rows follow this order.
        chunk_size (int): Maximum number of SMILES passed to a single
            ``get_repr`` call. Must be positive.
        model: Object exposing ``get_repr(smiles, return_atomic_reprs=False)``
            whose result has a ``"cls_repr"`` entry holding one vector per
            molecule. When omitted, the notebook-level ``clf`` is used.

    Returns:
        pd.DataFrame: One row per input SMILES, one column per representation value.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if a batch yields a
            different number of representations than SMILES submitted.
    """
    # A negative step would make range() empty and silently return no rows.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}.")

    # Only fall back to the global model when the caller did not supply one.
    encoder = clf if model is None else model

    repr_list = []
    for start in range(0, len(smiles_list), chunk_size):
        chunk = smiles_list[start : start + chunk_size]
        chunk_repr = encoder.get_repr(chunk, return_atomic_reprs=False)["cls_repr"]
        # Guard row alignment: downstream code pairs rows by position.
        if len(chunk_repr) != len(chunk):
            raise ValueError(
                f"Expected {len(chunk)} representations for the batch starting at "
                f"index {start}, got {len(chunk_repr)}."
            )
        repr_list.extend(chunk_repr)

    # Convert the list of representations to a DataFrame
    return pd.DataFrame(repr_list)

In [None]:
# Process host_smiles in chunks. process_in_chunks already returns a DataFrame,
# so no second conversion is needed.
host_repr = process_in_chunks(host_smiles)
host_repr_df = host_repr  # keep the host_repr_df name bound, as before

# Save the host representations to CSV (without the index column)
host_repr_df.to_csv("host_representations.csv", index=False)

In [6]:
# Process guest_smiles in chunks. process_in_chunks already returns a DataFrame,
# so no second conversion is needed.
guest_repr = process_in_chunks(guest_smiles)
guest_repr_df = guest_repr  # keep the guest_repr_df name bound, as before

# Save the guest representations to CSV (without the index column)
guest_repr_df.to_csv("guest_representations.csv", index=False)

2025-02-05 11:13:29 | unimol_tools\data\conformer.py | 333 | INFO | Uni-Mol Tools | Start generating conformers...
100%|██████████| 100/100 [00:02<00:00, 43.57it/s]
2025-02-05 11:13:31 | unimol_tools\data\conformer.py | 342 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 100.00% of molecules.
2025-02-05 11:13:31 | unimol_tools\data\conformer.py | 349 | INFO | Uni-Mol Tools | Succeeded in generating 3d conformers for 100.00% of molecules.
100%|██████████| 4/4 [00:12<00:00,  3.14s/it]
2025-02-05 11:13:44 | unimol_tools\data\conformer.py | 333 | INFO | Uni-Mol Tools | Start generating conformers...
100%|██████████| 100/100 [00:01<00:00, 68.17it/s]
2025-02-05 11:13:45 | unimol_tools\data\conformer.py | 342 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 100.00% of molecules.
2025-02-05 11:13:45 | unimol_tools\data\conformer.py | 349 | INFO | Uni-Mol Tools | Succeeded in generating 3d conformers for 100.00% of molecules.
100%|██████████| 4/4 [00:08<00:00,  2.

In [7]:
# Reload the saved representation tables from disk (host first, then guest).
host_repr_df, guest_repr_df = (
    pd.read_csv(csv_name)
    for csv_name in ("host_representations.csv", "guest_representations.csv")
)

# Sanity check: report the (rows, columns) of each table.
print("Host representations shape:", host_repr_df.shape)
print("Guest representations shape:", guest_repr_df.shape)

Host representations shape: (3459, 768)
Guest representations shape: (3459, 768)


In [8]:
# Turn each representation table into a 2-D NumPy array (host first, then guest).
host_mol_repr, guest_mol_repr = (
    np.stack(frame.to_numpy()) for frame in (host_repr_df, guest_repr_df)
)

print("Host molecular representations shape:", host_mol_repr.shape)
print("Guest molecular representations shape:", guest_mol_repr.shape)

# Join column-wise so each row holds the host features followed by the guest features.
combined_repr = np.hstack((host_mol_repr, guest_mol_repr))

print("\nCombined representations shape:", combined_repr.shape)

Host molecular representations shape: (3459, 768)
Guest molecular representations shape: (3459, 768)

Combined representations shape: (3459, 1536)


In [11]:
# Wrap the concatenated host+guest array in a DataFrame ...
combined_repr_df = pd.DataFrame(data=combined_repr)

# ... and write it to disk without the index column.
combined_repr_df.to_csv(path_or_buf="unimol_rep.csv", index=False)

In [None]:
import pandas as pd
import numpy as np


# Function to process a file with Host_SMILES and Guest_SMILES columns and save concatenated unimol representations
def process_host_guest_file(input_file: str, output_file: str) -> None:
    """
    Build concatenated host+guest unimol representations for one CSV file.

    Reads ``input_file``, encodes the ``Host_SMILES`` and ``Guest_SMILES``
    columns with ``process_in_chunks``, joins the two feature blocks
    column-wise (host features first), and writes the result to
    ``output_file`` without an index column.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.

    Raises:
        TypeError: If either path is not a string.
        ValueError: If a required column is absent, the file has no rows,
            or any SMILES value is missing.
    """
    # Type validation
    if not isinstance(input_file, str) or not isinstance(output_file, str):
        raise TypeError("Both input_file and output_file must be strings.")

    # Read the input file
    df = pd.read_csv(input_file)
    if "Host_SMILES" not in df.columns or "Guest_SMILES" not in df.columns:
        raise ValueError(
            "Input file must contain 'Host_SMILES' and 'Guest_SMILES' columns."
        )

    # Fail early with a clear message instead of an opaque array error later.
    if df.empty:
        raise ValueError("Input file contains no rows.")

    # Blank cells are read back as NaN; reject them rather than handing
    # non-string values to the encoder.
    has_missing = df[["Host_SMILES", "Guest_SMILES"]].isna().any(axis=1)
    if has_missing.any():
        raise ValueError(
            f"Input file has missing SMILES in {int(has_missing.sum())} row(s); "
            f"first affected row index: {has_missing.idxmax()}."
        )

    # Generate unimol representations for hosts and guests
    host_repr_df = process_in_chunks(df["Host_SMILES"].tolist())
    guest_repr_df = process_in_chunks(df["Guest_SMILES"].tolist())

    # Concatenate along feature axis (axis 1); the frames are already 2-D,
    # so they convert directly to arrays.
    combined_repr = np.concatenate(
        [host_repr_df.to_numpy(), guest_repr_df.to_numpy()], axis=1
    )

    # Save to CSV
    pd.DataFrame(combined_repr).to_csv(output_file, index=False)


# Build unimol features for the two validation sets.
process_host_guest_file(
    input_file="cd_val_canonical.csv",
    output_file="cd_val_unimol.csv",
)
process_host_guest_file(
    input_file="pfas_val_canonical.csv",
    output_file="pfas_val_unimol.csv",
)

NameError: name 'process_in_chunks' is not defined

In [5]:
# Build unimol features for the base CD/PFAS validation set.
process_host_guest_file(
    input_file="base_cdpfas_val_canonical.csv",
    output_file="base_cdpfas_val_unimol.csv",
)

2025-08-13 11:38:43 | unimol_tools\data\conformer.py | 333 | INFO | Uni-Mol Tools | Start generating conformers...
100%|██████████| 11/11 [00:14<00:00,  1.31s/it]
2025-08-13 11:38:57 | unimol_tools\data\conformer.py | 342 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 100.00% of molecules.
2025-08-13 11:38:57 | unimol_tools\data\conformer.py | 349 | INFO | Uni-Mol Tools | Succeeded in generating 3d conformers for 100.00% of molecules.
100%|██████████| 1/1 [00:41<00:00, 41.64s/it]
2025-08-13 11:39:39 | unimol_tools\data\conformer.py | 333 | INFO | Uni-Mol Tools | Start generating conformers...
100%|██████████| 11/11 [00:00<00:00, 47.61it/s]
2025-08-13 11:39:39 | unimol_tools\data\conformer.py | 342 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 100.00% of molecules.
2025-08-13 11:39:39 | unimol_tools\data\conformer.py | 349 | INFO | Uni-Mol Tools | Succeeded in generating 3d conformers for 100.00% of molecules.
100%|██████████| 1/1 [00:06<00:00,  6.77s/