## Parallel (Multi-Process) Image Generation with Indigo from SMILES Strings Obtained from PubChem

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from indigo import Indigo
from indigo.renderer import IndigoRenderer

# Step 1: Configuration -- input CSV, output directory, image size, pool size
csv_file, output_folder = "pubchem/train_1m.csv", "images"
image_size = 384  # used for both the rendered image width and height
num_threads = 16  # max_workers of the executor used below (a process pool, despite the name)

# Make sure the destination directory exists before anything is rendered
os.makedirs(output_folder, exist_ok=True)

# One Indigo session and a renderer bound to it, kept as module-level globals
indigo = Indigo()
renderer = IndigoRenderer(indigo)

# Load the whole table, then keep only rows that have both a CID and a SMILES
data = pd.read_csv(csv_file)
cids_smiles = data.loc[:, ['pubchem_cid', 'SMILES']].dropna()


# Define the function to render a single image
def render_image(cid, smiles):
    """Render one molecule to ``<output_folder>/<cid>.png``.

    The SMILES string is parsed with the module-level ``indigo`` session, a 2D
    layout is computed, the render options are (re)applied to the session and
    the picture is written with the module-level ``renderer``.

    Args:
        cid: Identifier used for the output file name and the status message.
        smiles: SMILES string of the molecule to draw.

    Returns:
        ``"CID <cid>: Success"`` once the file has been written, otherwise
        ``"CID <cid>: Error (<exception>)"``. Any ``Exception`` raised along
        the way is caught and reported through the return value.
    """
    try:
        molecule = indigo.loadMolecule(smiles)
        molecule.layout()  # Generate 2D coordinates

        # Options are set on every call, in this order, before rendering
        render_options = (
            ('render-output-format', 'png'),
            ('render-background-color', '1,1,1'),
            ('render-stereo-style', 'none'),
            ('render-label-mode', 'hetero'),
            ("render-image-width", image_size),
            ("render-image-height", image_size),
        )
        for option_name, option_value in render_options:
            indigo.setOption(option_name, option_value)

        # Write the picture next to the others, named after the CID
        target_path = os.path.join(output_folder, f"{cid}.png")
        renderer.renderToFile(molecule, target_path)
        status = f"CID {cid}: Success"
    except Exception as err:
        status = f"CID {cid}: Error ({err})"
    return status

# Step 2: Render in parallel with a ProcessPoolExecutor (worker *processes*,
# not threads).
# NOTE(review): worker processes must be able to import/unpickle render_image;
# the Python docs warn that ProcessPoolExecutor does not work from an
# interactive interpreter under some start methods -- confirm on the target OS.
images_processed = 0

print("Data Loaded!")

batch_size = 1000  # Number of tasks per batch

# One pool serves every batch. Building a pool inside the loop would start and
# tear down all worker processes once per batch without changing the results.
with ProcessPoolExecutor(max_workers=num_threads) as executor:
    for start_idx in tqdm(range(0, len(cids_smiles), batch_size), desc="Batch Processing"):
        batch = cids_smiles.iloc[start_idx:start_idx + batch_size]

        # Submitting one batch at a time keeps at most batch_size futures
        # pending. Iterating the two columns directly avoids building a Series
        # object per row, which is what iterrows() does.
        futures = [
            executor.submit(render_image, cid, smiles)
            for cid, smiles in zip(batch['pubchem_cid'], batch['SMILES'])
        ]

        # render_image reports failures through its return string, so a result
        # without "Success" is printed rather than counted.
        for future in as_completed(futures):
            result = future.result()
            if "Success" in result:
                images_processed += 1
            else:
                print(result)

print(f"Images saved to folder: {output_folder}")
print(f"Total images processed: {images_processed}")


Data Loaded!


Batch Processing:   6%|███▊                                                             | 59/1000 [01:22<26:39,  1.70s/it]

### Sequential (Single-Process) Image Generation for Dataset

In [1]:
import os
import pandas as pd
from tqdm import tqdm
from indigo import Indigo
from indigo.renderer import IndigoRenderer

# Step 1: Configuration -- input CSV, output directory and image size
csv_file, output_folder = "pubchem/train_1m.csv", "images"
image_size = 384

# Ensure the destination directory is present
os.makedirs(output_folder, exist_ok=True)

# Indigo session plus a renderer attached to it
indigo = Indigo()
renderer = IndigoRenderer(indigo)

# Rendering settings are applied once, up front: PNG output on a square canvas
for _option_name, _option_value in (
    ("render-output-format", "png"),
    ("render-image-width", image_size),
    ("render-image-height", image_size),
):
    indigo.setOption(_option_name, _option_value)

# Load the table and keep only rows where both CID and SMILES are present
data = pd.read_csv(csv_file)
cids_smiles = data.loc[:, ['pubchem_cid', 'SMILES']].dropna()

def render_image(cid, smiles):
    """Render one molecule to ``<output_folder>/<cid>.png``.

    Uses the module-level ``indigo`` session to parse and lay out the molecule
    and the module-level ``renderer`` to write the file.

    Args:
        cid: Identifier used for the output file name and the status message.
        smiles: SMILES string of the molecule to draw.

    Returns:
        ``"CID <cid>: Success"`` once the file has been written, otherwise
        ``"CID <cid>: Error (<exception>)"``. Any ``Exception`` raised along
        the way is caught and reported through the return value.
    """
    try:
        molecule = indigo.loadMolecule(smiles)
        molecule.layout()  # Generate 2D coordinates

        destination = os.path.join(output_folder, f"{cid}.png")
        renderer.renderToFile(molecule, destination)
        outcome = f"CID {cid}: Success"
    except Exception as exc:
        outcome = f"CID {cid}: Error ({exc})"
    return outcome

images_processed = 0

# Walk the table one row at a time; tqdm needs the total because iterrows()
# is a generator without a length.
row_iterator = tqdm(cids_smiles.iterrows(), total=len(cids_smiles), desc="Rendering Images")
for _, row in row_iterator:
    cid, smiles = row['pubchem_cid'], row['SMILES']
    result = render_image(cid, smiles)
    # Anything that is not a success message is surfaced instead of counted
    if "Success" not in result:
        print(result)
        continue
    images_processed += 1

print(f"Images saved to folder: {output_folder}")
print(f"Total images processed: {images_processed}")

KeyboardInterrupt: 

## Modify and Balance Dataset

In [5]:
import pandas as pd
import random
import json

# Build a balanced dataset from the labelled one:
#   * rows with label 0 are copied through unchanged;
#   * other rows are copied through until `match_bound` of them were kept;
#   * in addition, until `non_match_bound` is reached, each such row also
#     yields a synthetic label-0 row whose bond list was randomly perturbed.
data = pd.read_csv("dataset_with_labels.csv") # 80k samples

balanced_data = []

non_matches = 0
matches = 0

non_match_bound = 30000
match_bound = 40000

# Dedicated seeded generator: the output can be regenerated exactly and the
# global `random` state is left untouched.
RANDOM_SEED = 42
rng = random.Random(RANDOM_SEED)


def _perturb_bonds(bonds, rng):
    """Return a randomly edited copy of ``bonds``, or ``None`` if it is empty.

    Each entry of ``bonds`` is a list whose first two items are endpoint
    indices (the third item is presumably the bond order/type -- new entries
    are created with ``1``). Between 1 and 10 edits are applied; each edit
    appends a new entry, removes a random entry, or overwrites endpoints of
    existing entries.

    Args:
        bonds: Parsed bond list; it is not modified.
        rng: ``random.Random`` instance supplying all random choices.

    Returns:
        The edited copy, or ``None`` when ``bonds`` is empty and therefore
        offers no index range to draw from.
    """
    if not bonds:
        return None

    perturbed = [list(bond) for bond in bonds]

    # Draw indices from the range spanned by BOTH endpoints of every bond;
    # looking at the first endpoint only would exclude indices that occur
    # solely as a second endpoint.
    endpoints = [endpoint for bond in bonds for endpoint in bond[:2]]
    min_index, max_index = min(endpoints), max(endpoints)

    for _ in range(rng.randint(1, 10)):
        decision = rng.randint(1, 3)
        random_1 = rng.randint(min_index, max_index)
        random_2 = rng.randint(min_index, max_index)
        min_random = min(random_1, random_2)
        max_random = max(random_1, random_2)
        if decision == 1 or len(perturbed) == 0:
            # Also taken when earlier removals emptied the list
            perturbed.append([min_random, max_random, 1])
        elif decision == 2:
            perturbed.pop(rng.randint(0, len(perturbed) - 1))
        else:
            # NOTE(review): the two endpoints are written to independently
            # chosen entries, as in the original code -- confirm this is intended.
            perturbed[rng.randint(0, len(perturbed) - 1)][0] = min_random
            perturbed[rng.randint(0, len(perturbed) - 1)][1] = max_random
    return perturbed


for index, row in data.iterrows():
    if row['label'] == 0:
        balanced_data.append(row.to_dict())
    else:
        if matches < match_bound:
            balanced_data.append(row.to_dict())
            matches += 1
        if non_matches < non_match_bound:
            original_bonds = json.loads(row['bonds'])
            perturbed_bonds = _perturb_bonds(original_bonds, rng)
            # Skip rows that cannot be perturbed (no bonds) and perturbations
            # that happened to reproduce the original list: labelling those 0
            # would add an identical copy of a matching sample as a non-match.
            if perturbed_bonds is not None and perturbed_bonds != original_bonds:
                record = row.to_dict()
                record['bonds'] = json.dumps(perturbed_bonds)
                record['label'] = 0
                balanced_data.append(record)
                non_matches += 1

    if index % 2000 == 0:
        print(f"Processed {index} entries")

# Save combined results
df_out = pd.DataFrame(balanced_data, columns = data.columns)
df_out.to_csv("dataset_balanced.csv", index=False)

with open('dataset_balanced.json', 'w') as f:
    json.dump(balanced_data, f, indent=4)

with open('dataset_balanced_stats.txt', 'w') as f:
    f.write(f"Changed {non_matches} entries to label 0 and {matches} entries with label 1\n")

print("All done!")


Processed 0 entries
Processed 2000 entries
Processed 4000 entries
Processed 6000 entries
Processed 8000 entries
Processed 10000 entries
Processed 12000 entries
Processed 14000 entries
Processed 16000 entries
Processed 18000 entries
Processed 20000 entries
Processed 22000 entries
Processed 24000 entries
Processed 26000 entries
Processed 28000 entries
Processed 30000 entries
Processed 32000 entries
Processed 34000 entries
Processed 36000 entries
Processed 38000 entries
Processed 40000 entries
Processed 42000 entries
Processed 44000 entries
Processed 46000 entries
Processed 48000 entries
Processed 50000 entries
Processed 52000 entries
Processed 54000 entries
Processed 56000 entries
Processed 58000 entries
Processed 60000 entries
Processed 62000 entries
Processed 64000 entries
Processed 66000 entries
Processed 68000 entries
Processed 70000 entries
Processed 72000 entries
Processed 74000 entries
Processed 76000 entries
Processed 78000 entries
All done!


In [None]:
import pandas as pd
import random
import json

# Augment every row of the balanced dataset: each bond is also listed with its
# two endpoints swapped, and the combined list is put in random order.
data = pd.read_csv("dataset_balanced.csv")  # CSV produced by the balancing step

balanced_data = []

for index, row in data.iterrows():
    forward_bonds = json.loads(row['bonds'])
    # Mirror image of each bond: endpoints exchanged, third field kept as is
    mirrored_bonds = [[b[1], b[0], b[2]] for b in forward_bonds]
    bonds = forward_bonds + mirrored_bonds
    random.shuffle(bonds)

    record = row.to_dict()
    record['bonds'] = json.dumps(bonds)
    balanced_data.append(record)

    if index % 2000 == 0:
        print(f"Processed {index} entries")

# Persist the augmented rows as CSV (original column order) and as JSON
df_out = pd.DataFrame(balanced_data, columns = data.columns)
df_out.to_csv("dataset_balanced_2.csv", index=False)

with open('dataset_balanced_2.json', 'w') as f:
    json.dump(balanced_data, f, indent=4)

print("All done!")
