# DREAM Target 2035: AutoGluon Model with Public Ligand Diversity Filtering

In [None]:
!pip install autogluon pandas numpy pyarrow rdkit-pypi

In [None]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

## Load Training Data and Fingerprints

In [None]:
# Read the WDR91 training table from parquet.
df = pd.read_parquet('data/WDR91.parquet')

# Fingerprint columns used as features; the order here fixes the order in
# which their blocks are concatenated into the feature matrix.
fps = [
    'ECFP4', 'ECFP6', 'FCFP4', 'FCFP6', 'MACCS',
    'RDK', 'AVALON', 'ATOMPAIR', 'TOPTOR',
]

# Labels come straight from the TARGET_VALUE column.
y = df['TARGET_VALUE'].values

# Stack each fingerprint column into its own array, then join the blocks
# horizontally so every row of X holds all fingerprints of one compound.
# NOTE(review): assumes each cell holds a fixed-length numeric vector — confirm.
fp_blocks = []
for fp_name in fps:
    fp_blocks.append(np.stack(df[fp_name].values))
X = np.hstack(fp_blocks)
del fp_blocks  # drop the per-fingerprint copies once X is assembled

## Prepare Imbalanced Validation Set

In [None]:
# Build an imbalanced validation set (about 1 positive per 250 negatives) and
# train on the REMAINING rows only, so no validation row is seen in training.
VAL_POS_TARGET = 500     # desired number of label-1 rows in validation
VAL_NEG_PER_POS = 250    # label-0 rows drawn per validation positive

X_pos, X_neg = X[y == 1], X[y == 0]
y_pos, y_neg = y[y == 1], y[y == 0]
if len(X_pos) < 2 or len(X_neg) < 2:
    raise ValueError(
        'Need at least two positive and two negative rows to split into '
        f'train/validation (got {len(X_pos)} positive, {len(X_neg)} negative).'
    )

# Cap each validation class at half of what is available so the training set
# always keeps examples of both classes, even on small data sets.
n_val_pos = min(VAL_POS_TARGET, len(X_pos) // 2)
n_val_neg = min(n_val_pos * VAL_NEG_PER_POS, len(X_neg) // 2)

# An integer train_size yields exactly that many rows in the first returned
# split; the second split is the complement, which becomes training data.
X_val_pos, X_train_pos, y_val_pos, y_train_pos = train_test_split(
    X_pos, y_pos, train_size=n_val_pos, random_state=42
)
X_val_neg, X_train_neg, y_val_neg, y_train_neg = train_test_split(
    X_neg, y_neg, train_size=n_val_neg, random_state=42
)

X_val = np.vstack([X_val_pos, X_val_neg])
y_val = np.hstack([y_val_pos, y_val_neg])
X_train = np.vstack([X_train_pos, X_train_neg])
y_train = np.hstack([y_train_pos, y_train_neg])

# Wrap both splits as DataFrames with the label stored in an 'Activity' column.
train_df = pd.DataFrame(X_train)
train_df['Activity'] = y_train
val_df = pd.DataFrame(X_val)
val_df['Activity'] = y_val

In [None]:
# Configure the AutoGluon predictor: 'Activity' is the label column and
# models are evaluated by ROC AUC.
predictor = TabularPredictor(label='Activity', eval_metric='roc_auc')

# Fit on the training frame with a 600-second time limit; keep the object
# returned by fit() as the predictor used by later cells.
predictor = predictor.fit(train_df, time_limit=600)

## Load Public Ligands and Compute ECFP4 Fingerprints

In [None]:
# Load the public-domain WDR91 ligands and discard rows without a SMILES string.
public_df = pd.read_csv('data/14_public_domain_WDR91_ligands.csv').dropna(subset=['smiles'])

# Parse each SMILES lazily; entries RDKit cannot parse come back falsy and are
# skipped. Every remaining molecule gets a 2048-bit Morgan fingerprint
# (radius 2), stored as a bit vector for the Tanimoto comparison later on.
public_mols = (Chem.MolFromSmiles(smiles) for smiles in public_df['smiles'])
ligand_fps = [
    AllChem.GetMorganFingerprintAsBitVect(public_mol, radius=2, nBits=2048)
    for public_mol in public_mols
    if public_mol
]

## Load Test Set and Generate Predictions

In [None]:
# Read the challenge test table and keep its identifiers for the submission.
test_df = pd.read_parquet('data/Step1_TestData_Target2035.parquet')
random_ids = test_df['RandomID'].values

# Assemble the test feature matrix from the same fingerprint columns, in the
# same order, that were used to build the training matrix.
test_blocks = []
for fp_name in fps:
    test_blocks.append(np.stack(test_df[fp_name].values))
X_test = np.hstack(test_blocks)
del test_blocks  # drop the per-fingerprint copies once X_test is assembled
X_test_df = pd.DataFrame(X_test)

# Take the probability column labelled 1 as the score.
# NOTE(review): presumably the positive/active class — confirm label encoding.
class_probabilities = predictor.predict_proba(X_test_df)
test_scores = class_probabilities[1]

submission = pd.DataFrame({'RandomID': random_ids, 'Score': test_scores})

## Filter Top-500 for Diversity Against Public Ligands

In [None]:
import warnings

from rdkit import RDLogger

# Silence RDKit parser messages for test SMILES that fail to parse.
RDLogger.DisableLog('rdApp.*')

MAX_PUBLIC_SIMILARITY = 0.8   # reject candidates at or above this Tanimoto value
N_SEL_500, N_SEL_200 = 500, 200

# Rank all test compounds from highest to lowest predicted score.
submission_sorted = submission.sort_values('Score', ascending=False).copy()

# Select top candidates avoiding high similarity
selected = []
if 'smiles' in test_df.columns:
    # Pull the column out once; index labels of `submission_sorted` are used
    # as row positions in `test_df`, exactly as the per-row iloc lookup did.
    test_smiles = test_df['smiles'].values
    for idx in submission_sorted.index:
        smi = test_smiles[idx]
        # Missing / non-string SMILES are skipped like unparsable ones.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if not mol:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
        sim_to_public = DataStructs.BulkTanimotoSimilarity(fp, ligand_fps)
        # default=0.0: with no public ligands nothing can be "too similar".
        if max(sim_to_public, default=0.0) < MAX_PUBLIC_SIMILARITY:
            selected.append(idx)
            if len(selected) >= N_SEL_500:
                break
else:
    warnings.warn(
        "test_df has no 'smiles' column; the diversity filter selected nothing."
    )

if len(selected) < N_SEL_500:
    warnings.warn(
        f'Only {len(selected)} compounds passed the diversity filter '
        f'(wanted {N_SEL_500}).'
    )

# `selected` is already in descending score order, so the first 200 entries
# are the highest-scoring subset of the 500.
submission_sorted['Sel_500'] = 0
submission_sorted.loc[selected[:N_SEL_500], 'Sel_500'] = 1
submission_sorted['Sel_200'] = 0
submission_sorted.loc[selected[:N_SEL_200], 'Sel_200'] = 1
submission_sorted[['RandomID', 'Sel_200', 'Sel_500', 'Score']].to_csv('TeamMyTeamName.csv', index=False)