# Notebook 2: Clustering & Markers

**Cell Annotation Pipeline - Part 2 of 3**

**Stages:** 5-7
**📥 Input:** `outputs/qc_filtered_data.h5ad`
**📤 Output:** `outputs/clustered_data.h5ad`
**➡️ Next:** `3_annotation_export.ipynb`

---


## Load Data from Notebook 1

Load the QC-filtered data written by Notebook 1 and check that the `obs` columns this notebook depends on are present.

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Load the QC-filtered AnnData object written by Notebook 1.
print("Loading data from Notebook 1...")
adata = sc.read_h5ad('outputs/qc_filtered_data.h5ad')

# Validate
print("\n" + "="*60)
print("DATA VALIDATION")
print("="*60)

# Each entry maps a human-readable label to whether the required `obs`
# column is present.
checks = {
    'QC metrics': 'percent_mt' in adata.obs.columns,
    'Doublet scores': 'doublet_score' in adata.obs.columns,
    'Sample info': 'orig.ident' in adata.obs.columns,
    # The Stage 6 UMAP panels colour by these two columns, so check them here
    # instead of failing after PCA/UMAP/clustering have already been computed.
    # NOTE(review): presumably these are attached by Notebook 1 - confirm.
    'Genotype': 'Genotype' in adata.obs.columns,
    'Sex': 'Sex' in adata.obs.columns,
}

# Print the full report first so every missing item is visible, then fail
# once with all of them listed (previously only the first failure was shown).
for check, passed in checks.items():
    print(f"  {'✓' if passed else '✗'} {check}")

missing = [check for check, passed in checks.items() if not passed]
if missing:
    raise ValueError(f"Missing {', '.join(missing)} - run Notebook 1!")

print(f"\n✓ Loaded: {adata.n_obs:,} cells × {adata.n_vars:,} genes")


## Parameter Configuration

Set parameters for clustering.

In [None]:
# Output directory for every figure produced by this notebook
# (created together with any missing parent directories).
PLOTS_DIR = Path('plots/notebook2')
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Tunable settings read by the Stage 6 cells:
#   N_PCS             - number of principal components passed to the neighbor graph
#   N_NEIGHBORS       - neighborhood size for the neighbor graph
#   CLUSTERING_PARAMS - Leiden settings (currently only the resolution)
N_PCS = 15
N_NEIGHBORS = 10
CLUSTERING_PARAMS = dict(resolution=0.8)

# Echo the active configuration so it is recorded in the notebook output.
summary_lines = [
    "Parameters:",
    f"  N_PCS: {N_PCS}",
    f"  N_NEIGHBORS: {N_NEIGHBORS}",
    f"  Resolution: {CLUSTERING_PARAMS['resolution']}",
]
for summary_line in summary_lines:
    print(summary_line)


## Stage 5: Normalization & Scaling

Normalize counts to account for sequencing depth differences and identify highly variable genes.

**Steps:**
1. Save raw counts
2. Normalize total counts per cell to 10,000
3. Log-transform (log1p)
4. Identify highly variable genes (HVGs)
5. Scale data (zero mean, unit variance)

**No parameters to tune in this stage.**

In [None]:
def normalize_and_scale(adata):
    """Normalize, log-transform, select HVGs and scale an AnnData object.

    Steps, in order:
      1. Store the incoming (untransformed) matrix in ``adata.raw``.
      2. Normalize every cell to 10,000 total counts.
      3. Apply ``log1p``.
      4. Flag highly variable genes (HVGs) and save a diagnostic plot to
         ``PLOTS_DIR / 'highly_variable_genes.png'``.
      5. Restrict the object to the HVGs and scale it (values clipped at 10).

    Progress is printed for every step.

    Note:
        Steps 1-4 modify the object that is passed in. The object returned
        is a separate, HVG-only copy, so callers must continue with the
        return value rather than with the argument.

    Args:
        adata: AnnData object to process (presumably the QC-filtered count
            matrix from Notebook 1 - confirm against that notebook).

    Returns:
        AnnData restricted to the highly variable genes with scaled ``X``;
        its ``.raw`` still refers to the matrix stored in step 1.
    """
    print("\n" + "="*60)
    print("NORMALIZATION AND SCALING")
    print("="*60)

    # Save raw counts (must happen before any in-place transformation below)
    print("[1/5] Saving raw counts...")
    adata.raw = adata
    print("      ✓ Raw data saved\n")

    # Normalize to 10,000 reads per cell
    print("[2/5] Normalizing to 10,000 counts per cell...")
    sc.pp.normalize_total(adata, target_sum=1e4)
    print("      ✓ Normalized\n")

    # Log transform
    print("[3/5] Log-transforming (log1p)...")
    sc.pp.log1p(adata)
    print("      ✓ Log-transformed\n")

    # Find highly variable genes
    print("[4/5] Identifying highly variable genes...")
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    n_hvg = adata.var['highly_variable'].sum()
    print(f"      ✓ Identified {n_hvg:,} highly variable genes\n")

    # Plot highly variable genes
    sc.pl.highly_variable_genes(adata, show=False)
    plt.savefig(PLOTS_DIR / 'highly_variable_genes.png', dpi=300, bbox_inches='tight')
    plt.show()
    print(f"      ✓ HVG plot saved to {PLOTS_DIR}/highly_variable_genes.png\n")

    # Keep only HVGs for downstream analysis.
    # Slicing an AnnData yields a view; copy it explicitly so that the
    # in-place scaling below works on an independent object instead of
    # relying on an implicit view-to-copy conversion (which also warns).
    adata = adata[:, adata.var['highly_variable']].copy()
    print(f"      Subset to {adata.n_vars:,} HVGs for downstream analysis\n")

    # Scale data
    print("[5/5] Scaling data (zero mean, unit variance, max=10)...")
    sc.pp.scale(adata, max_value=10)
    print("      ✓ Data scaled\n")

    print("="*60)
    print(f"FINAL: {adata.n_obs:,} cells × {adata.n_vars:,} HVGs (scaled)")
    print(f"Raw data preserved: {adata.raw.n_obs:,} cells × {adata.raw.n_vars:,} genes")
    print("="*60)

    return adata

# Run Stage 5 and rebind `adata` to the returned object: the function
# returns the HVG-subset, scaled data that the following stages work on.
adata = normalize_and_scale(adata)

## Stage 6: PCA, UMAP & Clustering

Reduce dimensionality and identify cell clusters.

In [None]:
# Stage 6a: principal component analysis (50 components, ARPACK solver).
print("Running PCA...")
sc.tl.pca(adata, n_comps=50, svd_solver='arpack')

# Elbow plot: variance ratio of the first 50 PCs on a log scale, with a
# dashed red marker at the currently configured N_PCS.
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, show=False)
elbow_ax = plt.gca()
elbow_ax.axvline(x=N_PCS, linestyle='--', color='r', label=f'Selected: {N_PCS}')
elbow_ax.legend()
plt.savefig(PLOTS_DIR / 'pca_elbow_plot.png', bbox_inches='tight', dpi=300)
plt.show()

print(f"✓ Using {N_PCS} PCs")


### 🎛️ Parameter Tuning: PCA

<details>
<summary>📊 Clear elbow at PC 15-20</summary>

**Action:** Keep the current setting, `N_PCS = 15`.
</details>

<details>
<summary>📊 Elbow at PC 30-40</summary>

**Action:**
```python
N_PCS = 35
```
**Then:** Re-run from Stage 6
</details>


In [None]:
# Stage 6b: neighbor graph -> UMAP embedding -> Leiden clustering.
# The graph is built on the first N_PCS principal components; both UMAP and
# Leiden then run on that graph.
print("Computing neighborhood graph...")
sc.pp.neighbors(adata, n_pcs=N_PCS, n_neighbors=N_NEIGHBORS)

print("Running UMAP...")
sc.tl.umap(adata)

print("Clustering...")
leiden_resolution = CLUSTERING_PARAMS['resolution']
sc.tl.leiden(adata, resolution=leiden_resolution)

# Cluster labels are written to adata.obs['leiden'].
print(f"✓ Identified {adata.obs['leiden'].nunique()} clusters")


In [None]:
# Draw a 2x2 grid of UMAP embeddings, one panel per annotation.
# Each entry: (obs column, panel title, extra keyword arguments).
umap_panels = [
    ('leiden', 'Clusters', {'legend_loc': 'on data'}),  # labels drawn on the clusters
    ('orig.ident', 'Sample', {}),
    ('Genotype', 'Genotype', {}),
    ('Sex', 'Sex', {}),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left,
# bottom-right - the same order as the panel list above.
for panel_ax, (obs_key, panel_title, extra_kwargs) in zip(axes.flat, umap_panels):
    sc.pl.umap(adata, color=obs_key, title=panel_title,
               ax=panel_ax, show=False, **extra_kwargs)

plt.tight_layout()
plt.savefig(PLOTS_DIR / 'umap_embeddings.png', bbox_inches='tight', dpi=300)
plt.show()


### 🎛️ Parameter Tuning: Clustering

<details>
<summary>📊 Fewer than 5 clusters</summary>

**Action:**
```python
CLUSTERING_PARAMS['resolution'] = 1.0  # Increase
```
**Then:** Re-run Stage 6 from the neighbors/UMAP/clustering cell onward
</details>

<details>
<summary>📊 More than 30 clusters</summary>

**Action:**
```python
CLUSTERING_PARAMS['resolution'] = 0.4  # Decrease
```
**Then:** Re-run Stage 6 from the neighbors/UMAP/clustering cell onward
</details>


## Stage 7: Marker Gene Analysis

Identify differentially expressed genes per cluster.

In [None]:
# Compute markers: Wilcoxon rank-sum test for each Leiden cluster.
# use_raw=True makes scanpy test on adata.raw (all genes) instead of the
# scaled HVG matrix in adata.X.
# NOTE(review): adata.raw is assigned in Stage 5 *before* normalization and
# log1p, while scanpy documents rank_genes_groups as expecting logarithmized
# data - confirm that testing on the untransformed matrix is intended.
print("Computing marker genes...")
sc.tl.rank_genes_groups(adata, groupby='leiden',
                        method='wilcoxon', use_raw=True)

# Plot the top 20 ranked genes per cluster (independent y-axes per panel).
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, show=False)
plt.savefig(PLOTS_DIR / 'top_marker_genes.png', dpi=300, bbox_inches='tight')
plt.show()

# Export to CSV: two columns per cluster (gene names and test scores),
# limited to the top-ranked genes.
N_TOP_MARKERS = 30
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names  # cluster labels = record-array field names

markers_dict = {}
for group in groups:
    markers_dict[f'Cluster_{group}_genes'] = result['names'][group][:N_TOP_MARKERS]
    markers_dict[f'Cluster_{group}_scores'] = result['scores'][group][:N_TOP_MARKERS]

markers_df = pd.DataFrame(markers_dict)
# The table is written next to (not inside) the notebook's plot directory.
markers_path = PLOTS_DIR.parent / 'top_markers_by_cluster.csv'
markers_df.to_csv(markers_path, index=False)

# Report the real location; the bare file name previously printed hid the
# fact that the file lives under PLOTS_DIR.parent.
print(f"✓ Saved markers to {markers_path}")


In [None]:
# Save clustered data
output_file = 'outputs/clustered_data.h5ad'

# Record the settings used by this notebook alongside the data.
# 'pipeline_params' is presumably created by Notebook 1, but its presence is
# not validated at load time; create it if absent so a missing key cannot
# raise KeyError after all the computation and before the file is written.
if 'pipeline_params' not in adata.uns:
    adata.uns['pipeline_params'] = {}

pipeline_params = adata.uns['pipeline_params']
pipeline_params['notebook'] = '2_clustering_markers'
pipeline_params['n_pcs'] = N_PCS
pipeline_params['n_neighbors'] = N_NEIGHBORS
# Store a copy so later edits to CLUSTERING_PARAMS in this session do not
# silently change the metadata attached to the in-memory object.
pipeline_params['clustering'] = dict(CLUSTERING_PARAMS)

adata.write(output_file)

print("\n" + "="*60)
print("NOTEBOOK 2 COMPLETE")
print("="*60)
print(f"✓ Saved: {output_file}")
print(f"  Clusters: {adata.obs['leiden'].nunique()}")
print("\n➡️  NEXT: Open 3_annotation_export.ipynb")
