# DREAM Target 2035 Step 1: Model Interpretation
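This notebook trains an AutoGluon model on ECFP4 fingerprints, evaluates its feature importance on a held-out validation split, then fits a separate XGBoost model to compute SHAP values, and finally uses RDKit to visualize the substructure behind a fingerprint bit.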

In [None]:
# Install required packages
!pip install autogluon shap rdkit-pypi

In [None]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Load and Prepare Data

In [None]:
# Load ECFP4 fingerprint training data
df = pd.read_parquet('TrainData-Target2035.parquet')

# Stack the per-row fingerprint vectors of the 'ECFP4' column into one array
# (one row per record) and pull the labels out as a plain array.
X = np.stack(df['ECFP4'].values)
y = df['Activity'].values

# Build the feature table directly from the array. Going through X.tolist()
# would first materialize every bit as a Python object, which is slow and
# memory-hungry for a large fingerprint matrix. Columns keep their default
# integer labels (0..n_bits-1), i.e. the fingerprint bit index.
df_feat = pd.DataFrame(X)
df_feat['Activity'] = y

# Stratified 80/20 split so both parts keep the same Activity class balance;
# the fixed random_state makes the split reproducible.
train_data, val_data = train_test_split(
    df_feat,
    stratify=df_feat['Activity'],
    test_size=0.2,
    random_state=42,
)

## Train AutoGluon

In [None]:
# Configure the predictor: 'Activity' is the label column and models are
# scored by ROC AUC.
predictor = TabularPredictor(
    label='Activity',
    eval_metric='roc_auc',
)

# Fit on the training split only (val_data stays held out for the
# feature-importance step); time_limit caps the total training budget.
predictor = predictor.fit(train_data, time_limit=600)

## Feature Importance

In [None]:
# Feature importance of the fitted predictor, evaluated on the held-out split.
# The result is a DataFrame indexed by feature, with an 'importance' column
# plus accompanying statistic columns.
importances = predictor.feature_importance(val_data)

# Plot only the 'importance' column. Plotting the whole frame would draw every
# statistic column as its own bar series on a shared axis and bury the scores.
top_importances = importances['importance'].head(30)

fig, ax = plt.subplots(figsize=(10, 8))
top_importances.plot(kind='barh', ax=ax)
ax.set_title('Top 30 Important ECFP4 Bits')
ax.set_xlabel('Importance')
ax.set_ylabel('ECFP4 bit')
# barh draws the first row at the bottom; invert so the first row is on top.
ax.invert_yaxis()
plt.show()

## SHAP Analysis (via XGBoost)

In [None]:
import shap
import xgboost as xgb

# Number of rows used both as SHAP background data and as the explained set.
SHAP_SAMPLE_SIZE = 1000
# Same seed value as the random_state of the train/validation split.
SHAP_SEED = 42

# Fit a standalone XGBoost classifier on the full fingerprint matrix; SHAP is
# computed for this model, not for the AutoGluon predictor.
X_np = np.stack(df['ECFP4'].values)
xgb_model = xgb.XGBClassifier(n_jobs=-1, random_state=SHAP_SEED).fit(X_np, y)

# Explain a seeded random subset rather than the first rows of the file: if the
# file happens to be ordered (e.g. by label or source), the head of the table
# is not representative. min() keeps this valid for small datasets.
rng = np.random.default_rng(SHAP_SEED)
sample_idx = rng.choice(
    len(X_np),
    size=min(SHAP_SAMPLE_SIZE, len(X_np)),
    replace=False,
)
X_shap = X_np[sample_idx]

explainer = shap.Explainer(xgb_model, X_shap)
shap_values = explainer(X_shap)

# Beeswarm summary of per-sample SHAP values for the 20 top-ranked features.
shap.plots.beeswarm(shap_values, max_display=20)

## Optional: Visualize Important Bits with RDKit

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import SimilarityMaps

# Example molecule (benzoic acid)
mol = Chem.MolFromSmiles('c1ccccc1C(=O)O')
if mol is None:
    # MolFromSmiles returns None instead of raising when parsing fails.
    raise ValueError('RDKit could not parse the example SMILES')

# RDKit fills bitInfo during fingerprinting with, for each set bit, the atom
# environments that produced it.
# NOTE(review): radius=2 / nBits=2048 are assumed to match how the 'ECFP4'
# column of the training data was generated - confirm before mapping model
# bit indices onto structures.
bitInfo = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048, bitInfo=bitInfo)
onbits = list(fp.GetOnBits())

# Demo: take the first bit set in this molecule. To inspect a bit ranked highly
# above, use its index here with a molecule in which that bit is set.
bit = onbits[0]

# Draw the atom environment responsible for the bit. SimilarityMaps has no
# GetBitFingerprint function, so the previous call raised AttributeError;
# Draw.DrawMorganBit is the RDKit API for rendering a Morgan bit.
Draw.DrawMorganBit(mol, bit, bitInfo)