In [None]:
from rdkit import Chem
import pandas as pd

# Tabular part of the dataset: a semicolon-separated CSV file.
SMRT_dataset_df = pd.read_csv("SMRT_dataset.csv", sep=";")

# Molecule supplier over the SDF file; it is iterated in a later cell.
SMRT_dataset_sdf = Chem.SDMolSupplier("SMRT_dataset.sdf")


In [None]:
from tqdm import tqdm
# Map each molecule's integer "_Name" property to its canonical SMILES.
# Entries the supplier yields as None are skipped.
id_smiles_dict = {
    int(mol.GetProp("_Name")): Chem.CanonSmiles(Chem.MolToSmiles(mol))
    for mol in tqdm(SMRT_dataset_sdf)
    if mol is not None
}

In [None]:
import pubchempy as pcp

# Back-fill SMILES from PubChem for rows whose SMILES value is missing.
#
# Only the affected CIDs are visited instead of walking every row with
# iterrows(). iterrows() builds one Series per row and does not preserve
# column dtypes, so an integer CID can come back as a float; casting to int
# keeps the keys consistent with the int keys already in id_smiles_dict.
missing_cids = SMRT_dataset_df.loc[SMRT_dataset_df["SMILES"].isnull(), "pubchem"]

for cid in missing_cids:
    cid = int(cid)
    print(f"Missing SMILES for pubchem ID: {cid}")
    # Look the compound up through pubchempy by its CID.
    compound = pcp.Compound.from_cid(cid)
    id_smiles_dict[cid] = Chem.CanonSmiles(compound.smiles)

In [None]:
# Look up every row's SMILES by its PubChem CID and overwrite the SMILES column.
smiles_by_cid = SMRT_dataset_df["pubchem"].map(id_smiles_dict)
SMRT_dataset_df.loc[:, "SMILES"] = smiles_by_cid

# Number of rows that still have no SMILES after the lookup.
SMRT_dataset_df["SMILES"].isna().sum()

def remove_atom_map_numbers(s):
    """Return the canonical SMILES of ``s`` with every atom-map number reset.

    Args:
        s: SMILES string to clean.

    Returns:
        The canonical SMILES string produced after setting each atom's map
        number to 0.

    Raises:
        ValueError: If RDKit cannot parse ``s`` into a molecule.
    """
    mol = Chem.MolFromSmiles(s)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with
        # the offending input instead of an AttributeError on None.
        raise ValueError(f"Could not parse SMILES: {s!r}")
    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(0)
    return Chem.CanonSmiles(Chem.MolToSmiles(mol))
# Strip atom-map numbers from every SMILES and write the cleaned values back.
cleaned_smiles = SMRT_dataset_df["SMILES"].apply(remove_atom_map_numbers)
SMRT_dataset_df.loc[:, "SMILES"] = cleaned_smiles

np.int64(0)

In [32]:
# Persist the processed table; index=False leaves the row index out of the file.
SMRT_dataset_df.to_csv(path_or_buf="SMRT_dataset_processed.csv", index=False)

## Generate pseudo date labels based on scaffold

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Generate pseudo dates per scaffold: each scaffold gets its own 1-3 year window (starting 0-5 years after the base date), and dates inside that window follow a clipped normal distribution

# Load the processed dataset.
# NOTE(review): the code below needs a 'scaffold' column, but no visible cell
# creates one - confirm it is present in the CSV.
dataset_clean = pd.read_csv("SMRT_dataset_processed.csv")

# Seed NumPy's global generator so the pseudo dates are reproducible.
np.random.seed(42)

# Distinct scaffold values (missing values are skipped in the loop below).
unique_scaffolds = dataset_clean['scaffold'].unique()

# Earliest possible pseudo date.
base_date = datetime(2015, 1, 1)

# scaffold -> {'start_date': datetime, 'year_range': float}
scaffold_date_params = {}
for scaffold in unique_scaffolds:
    if not pd.isna(scaffold):
        # Draw order is part of the reproducible sequence:
        # first the window length (1-3 years), then the start offset
        # (0-5 years after base_date, in days).
        window_years = np.random.uniform(1, 3)
        offset_days = np.random.uniform(0, 5 * 365)
        scaffold_date_params[scaffold] = {
            'start_date': base_date + timedelta(days=offset_days),
            'year_range': window_years,
        }

# Generate dates for each row
def generate_date_for_row(row):
    """Draw a pseudo date for one row from its scaffold's date window.

    Reads the module-level ``scaffold_date_params`` and ``base_date``.

    Args:
        row: Row-like object exposing ``row['scaffold']``.

    Returns:
        A ``datetime``. For a scaffold with an entry in
        ``scaffold_date_params`` it lies between that scaffold's start date
        and start date + ``year_range`` years (365-day years); otherwise it
        is drawn uniformly from the 15 years (15 * 365 days) after
        ``base_date``.
    """
    key = row['scaffold']
    has_window = not pd.isna(key) and key in scaffold_date_params
    if not has_window:
        # Missing or unknown scaffold: fall back to a uniform random date.
        return base_date + timedelta(days=np.random.uniform(0, 15 * 365))

    window = scaffold_date_params[key]
    span_days = window['year_range'] * 365

    # Normal draw centred on the middle of the window; sigma = span / 6 puts
    # about 99.7% (+/- 3 sigma) of draws inside it, and the rest are clipped
    # onto the window edges.
    drawn = np.random.normal(span_days / 2, span_days / 6)
    bounded = np.clip(drawn, 0, span_days)

    return window['start_date'] + timedelta(days=bounded)

print("Generating pseudo dates based on scaffold...")
dataset_clean['date'] = dataset_clean.apply(generate_date_for_row, axis=1)

# Convert to date format (remove time component)
dataset_clean['date'] = pd.to_datetime(dataset_clean['date']).dt.date

print(f"Generated dates for {len(dataset_clean)} records")
print(f"\nDate range: {dataset_clean['date'].min()} to {dataset_clean['date'].max()}")

# Show example of dates for one scaffold.
# Rows without a scaffold are valid input for generate_date_for_row, so the
# dataset may contain no scaffold at all; guard the example instead of letting
# .iloc[0] raise IndexError on an empty selection.
scaffolds_present = dataset_clean['scaffold'].dropna()
if scaffolds_present.empty:
    print("\nNo non-null scaffolds found; skipping per-scaffold example.")
else:
    example_scaffold = scaffolds_present.iloc[0]
    scaffold_dates = dataset_clean[dataset_clean['scaffold'] == example_scaffold]['date']
    print(f"\nExample: Scaffold '{example_scaffold[:50]}...'")
    print(f"  Number of chemicals: {len(scaffold_dates)}")
    print(f"  Date range: {scaffold_dates.min()} to {scaffold_dates.max()}")
    print(f"  Span: {(pd.to_datetime(scaffold_dates.max()) - pd.to_datetime(scaffold_dates.min())).days / 365:.2f} years")

# Show sample
print("\nSample data with dates:")
print(dataset_clean[['SMILES', 'rt', 'scaffold', 'date']].head(10))