# Notebook 3: Annotation & Export

**Cell Annotation Pipeline - Part 3 of 3**

**Stages:** 8-9
**📥 Input:** `outputs/clustered_data.h5ad`
**📤 Output:** `outputs/annotated_data.h5ad` (FINAL)

---


In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the clustered AnnData object written by Notebook 2
print("Loading data from Notebook 2...")
adata = sc.read_h5ad('outputs/clustered_data.h5ad')

# Validate everything the later cells rely on. `adata.raw` is checked here
# because the scoring step reads `adata.raw.var_names` and calls
# `sc.tl.score_genes(..., use_raw=True)`.
checks = {
    'UMAP': 'X_umap' in adata.obsm,
    'Clusters': 'leiden' in adata.obs.columns,
    'Markers': 'rank_genes_groups' in adata.uns,
    'Raw': adata.raw is not None,
}

# Print the status of every check before failing, so all missing pieces are
# visible in a single run instead of being discovered one at a time.
for check, passed in checks.items():
    print(f"  {'✓' if passed else '✗'} {check}")

missing = [check for check, passed in checks.items() if not passed]
if missing:
    raise ValueError(f"Missing {', '.join(missing)} - run Notebook 2!")

print(f"\n✓ Loaded: {adata.n_obs:,} cells, {adata.obs['leiden'].nunique()} clusters")


In [None]:
# Directory that receives every figure saved by this notebook
PLOTS_DIR = Path('plots') / 'notebook3'
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Settings for the annotation step:
#   margin     - compared against (best score - runner-up score) per cell
#   label_mode - stored alongside the margin in the parameter dict
ANNOTATION_PARAMS = dict(
    margin=0.05,
    label_mode='cell',
)

# Marker genes (mouse brain), keyed by cell-type label
MARKER_GENES = {
    'Excit': ['Slc17a7', 'Camk2a', 'Satb2'],
    'Inhib': ['Gad1', 'Gad2', 'Slc6a1'],
    'Astro': ['Slc1a2', 'Aqp4', 'Gfap'],
    'Oligo': ['Plp1', 'Mog', 'Mbp'],
    'OPC': ['Pdgfra', 'Cspg4'],
    'Micro': ['P2ry12', 'Cx3cr1', 'Csf1r'],
    'Endo': ['Pecam1', 'Kdr', 'Flt1'],
}

print("Annotation parameters set")


## Stage 8: Cell Type Annotation

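Score each cell against every cell type's marker genes and assign the top-scoring type. Cells whose top score does not exceed the runner-up score by more than the `margin` parameter are marked `Unlabeled`.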

In [None]:
# Score cells
print("Scoring cells...")

# Marker expression is read from `adata.raw`; fail with a clear message
# instead of an AttributeError when it is absent.
if adata.raw is None:
    raise ValueError("adata.raw is not set - cannot score markers with use_raw=True")

raw_genes = adata.raw.var_names

# Track exactly which score columns this cell creates, mapped to their cell
# type. Selecting columns by the '_score' suffix alone would also pick up
# unrelated obs columns (any other '*_score' column already present, or stale
# scores from an earlier run) and treat them as candidate cell types.
score_to_celltype = {}
for cell_type, genes in MARKER_GENES.items():
    available = [g for g in genes if g in raw_genes]
    if not available:
        # Make the skip visible rather than silently dropping the cell type
        print(f"  ✗ {cell_type}: 0/{len(genes)} markers found - skipped")
        continue
    score_name = f'{cell_type}_score'
    sc.tl.score_genes(adata, available,
                      score_name=score_name,
                      use_raw=True)
    score_to_celltype[score_name] = cell_type
    print(f"  ✓ {cell_type}: {len(available)}/{len(genes)} markers")

if not score_to_celltype:
    raise ValueError("No marker genes found in adata.raw.var_names - nothing to score")

# Assign cell types
score_cols = list(score_to_celltype)
scores = adata.obs[score_cols]

# Apply margin: a cell is confidently labeled only when its best score beats
# the runner-up by more than ANNOTATION_PARAMS['margin'].
scores_sorted = np.sort(scores.to_numpy(), axis=1)
max_scores = scores_sorted[:, -1]
if len(score_cols) > 1:
    second_scores = scores_sorted[:, -2]
else:
    # With a single scored type there is no runner-up; the difference below
    # is zero for every cell.
    second_scores = max_scores
confident = (max_scores - second_scores) > ANNOTATION_PARAMS['margin']

# Map the winning score column straight back to its cell-type label
adata.obs['celltype'] = scores.idxmax(axis=1).map(score_to_celltype)
adata.obs.loc[~confident, 'celltype'] = 'Unlabeled'

print(f"\n✓ {confident.sum():,} / {len(confident):,} confidently labeled")
print(f"\nCell type distribution:")
print(adata.obs['celltype'].value_counts())


In [None]:
# Side-by-side UMAPs: annotated cell types (left) and Leiden clusters (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
celltype_ax, cluster_ax = axes

sc.pl.umap(adata, color='celltype', title='Cell type annotations',
           ax=celltype_ax, show=False)
sc.pl.umap(adata, color='leiden', title='Clusters', legend_loc='on data',
           ax=cluster_ax, show=False)

fig.tight_layout()
fig.savefig(PLOTS_DIR / 'cell_type_umap.png', dpi=300, bbox_inches='tight')
plt.show()

# Composition heatmap: each row is one cluster, normalized so that the
# cell-type fractions within a cluster sum to 1
composition = pd.crosstab(
    index=adata.obs['leiden'],
    columns=adata.obs['celltype'],
    normalize='index',
)

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(composition, annot=True, fmt='.2f', cmap='YlOrRd', ax=heatmap_ax)
heatmap_ax.set_title('Cell type composition per cluster')
heatmap_fig.tight_layout()
heatmap_fig.savefig(PLOTS_DIR / 'composition_heatmap.png', dpi=300,
                    bbox_inches='tight')
plt.show()


## Stage 9: Final Export

Save the annotated data (`.h5ad`), the per-cell metadata (CSV), and a summary table (CSV).

In [None]:
# Save annotated data
output_file = 'outputs/annotated_data.h5ad'

# Create `pipeline_params` if the loaded file does not carry it, rather than
# failing with a KeyError right before the final write.
pipeline_params = adata.uns.setdefault('pipeline_params', {})
pipeline_params['notebook'] = '3_annotation_export'
# Store a copy so later edits to ANNOTATION_PARAMS do not alter what is
# recorded on `adata`.
pipeline_params['annotation'] = dict(ANNOTATION_PARAMS)

adata.write(output_file)

# Export metadata CSV. Only columns actually present in `adata.obs` are
# written; a missing column would otherwise raise a KeyError after the h5ad
# has already been saved, leaving the export half-finished.
metadata_cols = ['leiden', 'celltype', 'orig.ident', 'Genotype', 'Sex',
                'n_genes_by_counts', 'total_counts', 'percent_mt']
available_cols = [col for col in metadata_cols if col in adata.obs.columns]
missing_cols = [col for col in metadata_cols if col not in adata.obs.columns]
if missing_cols:
    print(f"  ⚠ Metadata columns not found, skipped: {', '.join(missing_cols)}")
adata.obs[available_cols].to_csv('outputs/cell_metadata.csv')


def _median_label(column):
    """Return the median of an obs column formatted as an integer string, or 'NA' if absent."""
    if column not in adata.obs.columns:
        return 'NA'
    return f"{adata.obs[column].median():.0f}"


# Summary
summary = pd.DataFrame({
    'Metric': ['Total cells', 'Clusters', 'Cell types',
               'Median genes/cell', 'Median UMIs/cell'],
    'Value': [
        f"{adata.n_obs:,}",
        adata.obs['leiden'].nunique(),
        adata.obs['celltype'].nunique(),
        _median_label('n_genes_by_counts'),
        _median_label('total_counts'),
    ]
})
summary.to_csv('outputs/analysis_summary.csv', index=False)

print("\n" + "="*60)
print("PIPELINE COMPLETE!")
print("="*60)
print(f"✓ Annotated data: {output_file}")
print(f"✓ Metadata: outputs/cell_metadata.csv")
print(f"✓ Summary: outputs/analysis_summary.csv")
print(f"\n  {adata.n_obs:,} cells")
print(f"  {adata.obs['leiden'].nunique()} clusters")
print(f"  {adata.obs['celltype'].nunique()} cell types")
print("\n🎉 Ready for downstream analysis!")

# `display` is provided by the notebook (IPython) environment
display(summary)
