In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")


In [19]:
# Load GoodScents dataset
print("=== Loading GoodScents Dataset ===")
goodscents_dir = os.path.join('pyrfume-data', 'goodscents')
goodscents_behavior = pd.read_csv(os.path.join(goodscents_dir, 'behavior.csv'))
goodscents_molecules = pd.read_csv(os.path.join(goodscents_dir, 'molecules.csv'))

print(f"GoodScents molecules: {len(goodscents_molecules)}")
print(f"GoodScents behavior entries: {len(goodscents_behavior)}")

print(f"\nMolecules columns: {list(goodscents_molecules.columns)}")
print(f"Behavior columns: {list(goodscents_behavior.columns)}")

# Show example entries from both
print("\nExample molecule entries:")
print(goodscents_molecules.head())

print("\nExample behavior entries:")
goodscents_behavior.head()


=== Loading GoodScents Dataset ===
GoodScents molecules: 4565
GoodScents behavior entries: 4626

Molecules columns: ['CID', 'MolecularWeight', 'IsomericSMILES', 'IUPACName', 'name']
Behavior columns: ['Stimulus', 'Descriptors']

Example molecule entries:
   CID  MolecularWeight         IsomericSMILES                     IUPACName  \
0    4            75.11                CC(CN)O            1-aminopropan-2-ol   
1   49           116.11       CC(C)C(=O)C(=O)O   3-methyl-2-oxobutanoic acid   
2   51           146.10  C(CC(=O)O)C(=O)C(=O)O        2-oxopentanedioic acid   
3   58           102.09          CCC(=O)C(=O)O            2-oxobutanoic acid   
4   70           130.14      CC(C)CC(=O)C(=O)O  4-methyl-2-oxopentanoic acid   

                           name  
0            1-aminopropan-2-ol  
1   3-methyl-2-oxobutanoic acid  
2           2-ketoglutaric acid  
3            2-oxobutanoic acid  
4  4-methyl-2-oxopentanoic acid  

Example behavior entries:


Unnamed: 0,Stimulus,Descriptors
0,100-06-1,sweet;vanilla;cherry maraschino cherry;powdery...
1,100-09-4,phenolic;animal;fecal;medicinal
2,100-42-5,sweet;plastic;floral;balsamic
3,100-51-6,sweet;floral;rose;fruity;phenolic;balsamic;che...
4,100-52-7,sweet;cherry;cherry maraschino cherry;nutty;fr...


In [20]:
# Create labeled dataset from GoodScents following the proper chain:
# Stimulus (CAS) → CID → SMILES
print("=== Creating Labeled Dataset with Proper CAS → CID → SMILES Chain ===")

# Load the stimuli file to get the CAS → CID mapping
stimuli_path = os.path.join('pyrfume-data', 'goodscents', 'stimuli.csv')
goodscents_stimuli = pd.read_csv(stimuli_path)

print(f"Loaded stimuli mapping: {len(goodscents_stimuli)} entries")
print(f"Stimuli columns: {list(goodscents_stimuli.columns)}")

# Show the data chain
print(f"\n=== Data Chain Example ===")
print("Stimuli (CAS → CID mapping):")
print(goodscents_stimuli[['Stimulus', 'CID']].head())

print(f"\nBehavior (CAS → Descriptors):")
print(goodscents_behavior[['Stimulus', 'Descriptors']].head())

print(f"\nMolecules (CID → SMILES):")
print(goodscents_molecules[['CID', 'IsomericSMILES']].head())

def create_goodscents_dataset(molecules_df, behavior_df, stimuli_df):
    """Create labeled dataset using the proper CAS → CID → SMILES chain"""
    
    print(f"\n=== Step 1: Merge Behavior (CAS) with Stimuli (CAS → CID) ===")
    # First join: behavior + stimuli (both use Stimulus/CAS as key)
    behavior_with_cid = behavior_df.merge(stimuli_df[['Stimulus', 'CID']], 
                                         on='Stimulus', how='inner')
    print(f"Merged behavior with stimuli: {len(behavior_with_cid)} entries")
    
    print(f"\n=== Step 2: Merge with Molecules (CID → SMILES) ===")
    # Second join: behavior_with_cid + molecules (both use CID as key)
    full_dataset = behavior_with_cid.merge(molecules_df[['CID', 'IsomericSMILES']], 
                                          on='CID', how='inner')
    print(f"Final merged dataset: {len(full_dataset)} entries with complete CAS→CID→SMILES chain")
    
    # Check for missing SMILES
    missing_smiles = full_dataset['IsomericSMILES'].isna().sum()
    print(f"Entries with missing SMILES: {missing_smiles}")
    
    if missing_smiles > 0:
        print("Removing entries without SMILES...")
        full_dataset = full_dataset.dropna(subset=['IsomericSMILES'])
        print(f"Dataset after removing missing SMILES: {len(full_dataset)} entries")
    
    merged = full_dataset
    
    # Debug: check some merged descriptors
    print(f"\nSample merged descriptors:")
    for i, desc in enumerate(merged['Descriptors'].head(5)):
        print(f"{i+1}. {desc}")
    
    # Create binary labels for our targets - let's test each one
    target_labels = ['sweet', 'floral', 'minty', 'pungent']
    
    print(f"\nTesting label detection:")
    for label in target_labels:
        # Test the search
        matches = merged['Descriptors'].str.contains(label, case=False, na=False)
        count = matches.sum()
        print(f"{label}: {count} matches found")
        
        # Show some examples if found
        if count > 0:
            examples = merged[matches]['Descriptors'].head(3)
            for j, example in enumerate(examples):
                print(f"  Example {j+1}: {example}")
        
        # Create the binary column
        merged[label] = matches.astype(int)
    
    # Select relevant columns and clean data
    result = merged[['Stimulus', 'CID', 'IsomericSMILES', 'Descriptors'] + target_labels].copy()
    
    # All SMILES should be real now (not placeholders)
    before_smiles = len(result)
    result = result.dropna(subset=['IsomericSMILES'])
    print(f"\nRemoved {before_smiles - len(result)} molecules without SMILES")
    
    # Show label distribution before filtering
    print(f"\nLabel distribution before filtering:")
    for label in target_labels:
        count = result[label].sum()
        print(f"{label}: {count}")
    
    # Remove rows with no labels (all zeros) - but let's be more careful
    label_cols = ['sweet', 'floral', 'minty', 'pungent']
    before_labels = len(result)
    has_labels = result[label_cols].sum(axis=1) > 0
    result = result[has_labels]
    print(f"Removed {before_labels - len(result)} molecules with no target labels")
    
    return result

# Create the dataset using the 3-way join
goodscents_data = create_goodscents_dataset(goodscents_molecules, goodscents_behavior, goodscents_stimuli)
print(f"\nFinal dataset: {len(goodscents_data)} labeled molecules")

if len(goodscents_data) > 0:
    # Show label distribution
    print("\n=== Final Label Distribution ===")
    for label in ['sweet', 'floral', 'minty', 'pungent']:
        count = goodscents_data[label].sum()
        print(f"{label.upper()}: {count} molecules")
    
    goodscents_data.head()
else:
    print("No labeled molecules found! Need to debug the descriptor format.")


=== Creating Labeled Dataset with Proper CAS → CID → SMILES Chain ===
Loaded stimuli mapping: 4626 entries
Stimuli columns: ['Stimulus', 'TGSC ID', 'CID', 'Concentration %', 'Solvent']

=== Data Chain Example ===
Stimuli (CAS → CID mapping):
   Stimulus   CID
0  100-06-1  7476
1  100-09-4  7478
2  100-42-5  7501
3  100-51-6   244
4  100-52-7   240

Behavior (CAS → Descriptors):
   Stimulus                                        Descriptors
0  100-06-1  sweet;vanilla;cherry maraschino cherry;powdery...
1  100-09-4                    phenolic;animal;fecal;medicinal
2  100-42-5                      sweet;plastic;floral;balsamic
3  100-51-6  sweet;floral;rose;fruity;phenolic;balsamic;che...
4  100-52-7  sweet;cherry;cherry maraschino cherry;nutty;fr...

Molecules (CID → SMILES):
   CID         IsomericSMILES
0    4                CC(CN)O
1   49       CC(C)C(=O)C(=O)O
2   51  C(CC(=O)O)C(=O)C(=O)O
3   58          CCC(=O)C(=O)O
4   70      CC(C)CC(=O)C(=O)O

=== Step 1: Merge Behavior (CAS) 

In [22]:
goodscents_data.head(5)

Unnamed: 0,Stimulus,CID,IsomericSMILES,Descriptors,sweet,floral,minty,pungent
0,100-06-1,7476,CC(=O)C1=CC=C(C=C1)OC,sweet;vanilla;cherry maraschino cherry;powdery...,1,0,0,0
2,100-42-5,7501,C=CC1=CC=CC=C1,sweet;plastic;floral;balsamic,1,1,0,0
3,100-51-6,244,C1=CC=C(C=C1)CO,sweet;floral;rose;fruity;phenolic;balsamic;che...,1,1,0,0
4,100-52-7,240,C1=CC=C(C=C1)C=O,sweet;cherry;cherry maraschino cherry;nutty;fr...,1,0,0,0
5,100-53-8,7509,C1=CC=C(C=C1)CS,sulfurous;garlic;horseradish;alliaceous;coffee...,0,0,1,0


In [23]:
# Create train/test split with stratification for multi-label data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

print("=== Creating Train/Test Split ===")

# Create multi-label combinations for stratification
label_cols = ['sweet', 'floral', 'minty', 'pungent']
label_combinations = goodscents_data[label_cols].apply(lambda x: tuple(x), axis=1)

print(f"Total molecules: {len(goodscents_data)}")
print(f"Unique label combinations: {len(set(label_combinations))}")

# Show most common label combinations
from collections import Counter
combo_counts = Counter(label_combinations)
print(f"\nTop 10 label combinations:")
for combo, count in combo_counts.most_common(10):
    labels = [label_cols[i] for i, val in enumerate(combo) if val == 1]
    print(f"  {labels}: {count} molecules")

# Stratified split (80/20)
X = goodscents_data[['Stimulus', 'CID', 'IsomericSMILES', 'Descriptors']]
y = goodscents_data[label_cols]

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=label_combinations
    )
    print(f"\nStratified split successful!")
except ValueError as e:
    print(f"\nStratified split failed (some combinations too rare): {e}")
    print("Using random split instead...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

print(f"Train set: {len(X_train)} molecules")
print(f"Test set: {len(X_test)} molecules")

# Combine X and y for easier handling
train_data = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
test_data = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

# Show label distribution in train/test sets
print(f"\n=== Label Distribution ===")
for split_name, data in [("Train", train_data), ("Test", test_data)]:
    print(f"\n{split_name} set:")
    for label in label_cols:
        count = data[label].sum()
        pct = (count / len(data)) * 100
        print(f"  {label.upper()}: {count} molecules ({pct:.1f}%)")
    
    # Multi-label stats
    labels_per_molecule = data[label_cols].sum(axis=1)
    print(f"  Avg labels per molecule: {labels_per_molecule.mean():.2f}")
    print(f"  Molecules with multiple labels: {(labels_per_molecule > 1).sum()}")

print(f"\nTrain data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
train_data.head()


=== Creating Train/Test Split ===
Total molecules: 1826
Unique label combinations: 13

Top 10 label combinations:
  ['sweet']: 645 molecules
  ['floral']: 585 molecules
  ['sweet', 'floral']: 300 molecules
  ['minty']: 135 molecules
  ['pungent']: 75 molecules
  ['sweet', 'minty']: 30 molecules
  ['sweet', 'pungent']: 27 molecules
  ['floral', 'minty']: 13 molecules
  ['sweet', 'floral', 'minty']: 8 molecules
  ['floral', 'pungent']: 3 molecules

Stratified split failed (some combinations too rare): The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
Using random split instead...
Train set: 1460 molecules
Test set: 366 molecules

=== Label Distribution ===

Train set:
  SWEET: 806 molecules (55.2%)
  FLORAL: 726 molecules (49.7%)
  MINTY: 155 molecules (10.6%)
  PUNGENT: 89 molecules (6.1%)
  Avg labels per molecule: 1.22
  Molecules with multiple labels: 306

Test set:
  SWEET: 206 molecules (56.3%)
  FL

Unnamed: 0,Stimulus,CID,IsomericSMILES,Descriptors,sweet,floral,minty,pungent
0,143764-28-7,6428938,CC(CC(=O)C)SC,pungent,0,0,0,1
1,185429-83-8,17839537,CC1C2=C(CCCC2(C)C)CCC1(C)C(=O)C,floral;woody;amber,0,1,0,0
2,109682-87-3,528707,CC1C(C(=O)C(=C1C)O)C,sweet;spicy;caramellic;maple;coffee;burnt;brea...,1,0,0,0
3,108861-12-7,11435352,CCCCCC[C@@H]1CCCC(=O)O1,sweet;fruity;creamy,1,0,0,0
4,122-72-5,31226,CC(=O)OCCCC1=CC=CC=C1,sweet;spicy;cinnamon;honey;fruity;cinnamyl;bal...,1,0,0,0


In [28]:
# Save datasets for FART training and testing
print("=== Saving Datasets ===")

# Save train/test splits
train_data.to_csv('goodscents_train.csv', index=False)
test_data.to_csv('goodscents_test.csv', index=False)
print(f"✅ Saved training set: goodscents_train.csv ({len(train_data)} molecules)")
print(f"✅ Saved test set: goodscents_test.csv ({len(test_data)} molecules)")



=== Saving Datasets ===
✅ Saved training set: goodscents_train.csv (1460 molecules)
✅ Saved test set: goodscents_test.csv (366 molecules)


In [None]:
# Prepare bushdid dataset for prediction (molecules without known odors)
print(f"\n=== Preparing Bushdid Prediction Set ===")
# Load the bushdid_2014 dataset
bushdid_dir = os.path.join('pyrfume-data', 'bushdid_2014')
mol_path = os.path.join(bushdid_dir, 'molecules.csv')

# Load molecules
bushdid_molecules = pd.read_csv(mol_path)
print(f"Loaded {len(bushdid_molecules)} molecules from bushdid_2014 dataset")
print(f"Columns: {list(bushdid_molecules.columns)}")
bushdid_molecules.head()
bushdid_molecules.to_csv('bushdid_molecules.csv', index=False)
print(f"✅ Saved prediction set: bushdid_molecules.csv ({len(bushdid_molecules)} molecules)")

# Keep only molecules with SMILES for prediction
bushdid_predict = bushdid_molecules[['CID', 'IsomericSMILES']].dropna(subset=['IsomericSMILES'])
print(f"Bushdid molecules for prediction: {len(bushdid_predict)}")

# Add placeholder columns for the target labels (will be filled by predictions)
for label in label_cols:
    bushdid_predict[label] = -1  # -1 indicates "to be predicted"


# Summary of all datasets
print(f"\n=== Dataset Summary ===")
print(f"📁 goodscents_train.csv: {len(train_data):,} labeled molecules for training")
print(f"📁 goodscents_test.csv: {len(test_data):,} labeled molecules for testing")
print(f"📁 bushdid_predict.csv: {len(bushdid_predict):,} unlabeled molecules for prediction")
print(f"📊 Total molecules: {len(train_data) + len(test_data) + len(bushdid_predict):,}")

# Show file sizes
import os
for filename in ['goodscents_train.csv', 'goodscents_test.csv', 'bushdid_molecules.csv']:
    if os.path.exists(filename):
        size_mb = os.path.getsize(filename) / (1024 * 1024)
        print(f"💾 {filename}: {size_mb:.1f} MB")

print(f"\n🎯 Ready to FART training!")
print(f"Next steps:")
print(f"1. Load FART model and tokenizer")
print(f"2. Create data loaders for train/test sets")
print(f"3. Fine-tune FART on goodscents_train.csv")
print(f"4. Evaluate on goodscents_test.csv")
print(f"5. Predict odor labels for bushdid_predict.csv")



=== Preparing Bushdid Prediction Set ===
Loaded 128 molecules from bushdid_2014 dataset
Columns: ['CID', 'MolecularWeight', 'IsomericSMILES', 'IUPACName', 'name', 'Odorant name', 'C.A.S.', '% odorant', 'Solvent']
✅ Saved prediction set: bushdid_molecules.csv (128 molecules)
Bushdid molecules for prediction: 128

=== Dataset Summary ===
📁 goodscents_train.csv: 1,460 labeled molecules for training
📁 goodscents_test.csv: 366 labeled molecules for testing
📁 bushdid_predict.csv: 128 unlabeled molecules for prediction
📊 Total molecules: 1,954
💾 goodscents_train.csv: 0.1 MB
💾 goodscents_test.csv: 0.0 MB
💾 bushdid_molecules.csv: 0.0 MB

🎯 Ready for FART training!
Next steps:
1. Load FART model and tokenizer
2. Create data loaders for train/test sets
3. Fine-tune FART on goodscents_train.csv
4. Evaluate on goodscents_test.csv
5. Predict odor labels for bushdid_predict.csv


In [25]:
# Load the bushdid_2014 dataset
bushdid_dir = os.path.join('pyrfume-data', 'bushdid_2014')
mol_path = os.path.join(bushdid_dir, 'molecules.csv')

# Load molecules
mols = pd.read_csv(mol_path)
print(f"Loaded {len(mols)} molecules from bushdid_2014 dataset")
print(f"Columns: {list(mols.columns)}")
mols.head()


Loaded 128 molecules from bushdid_2014 dataset
Columns: ['CID', 'MolecularWeight', 'IsomericSMILES', 'IUPACName', 'name', 'Odorant name', 'C.A.S.', '% odorant', 'Solvent']


Unnamed: 0,CID,MolecularWeight,IsomericSMILES,IUPACName,name,Odorant name,C.A.S.,% odorant,Solvent
0,176,60.05,CC(=O)O,acetic acid,acetic acid,acetic acid,64-19-7,10.0,mineral oil
1,177,44.05,CC=O,acetaldehyde,acetaldehyde,acetaldehyde,75-07-0,5.0,water
2,179,88.11,CC(C(=O)C)O,3-hydroxybutan-2-one,acetoin,acetoin,513-86-0,0.1,"1,2-propanediol"
3,180,58.08,CC(=O)C,propan-2-one,acetone,propan-2-one,67-64-1,25.0,water
4,240,106.12,C1=CC=C(C=C1)C=O,benzaldehyde,benzaldehyde,benzaldehyde,100-52-7,0.25,mineral oil
