# Exploring Concordance Analysis Results

This notebook explores the results from the concordance-based differential expression analysis.

**Key concepts:**
- **Concordant cells**: Cancer cells where expression matches expected CNV pattern (low embedding distance)
- **Discordant cells**: Cancer cells where expression deviates from CNV pattern (high embedding distance)
- **Escape genes**: Upregulated in discordant cells (expressed higher than expected given CNV)
- **Compensation genes**: Downregulated in discordant cells (expressed lower than expected given CNV)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Settings
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 100
sns.set_style('whitegrid')

# Change to project root
os.chdir('/Users/gg027/Desktop/CLCC')

# Patient to analyze
PATIENT = 'P0006'
RESULTS_DIR = f'data/de_results/{PATIENT}'

print("Basic imports done. Scanpy will be imported later when needed.")

Basic imports done. Scanpy will be imported later when needed.


## 1. Load Results

In [2]:
# Check what files are available
print("Available result files:")
for f in sorted(os.listdir(RESULTS_DIR)):
    size = os.path.getsize(os.path.join(RESULTS_DIR, f))
    print(f"  {f} ({size/1024:.1f} KB)")

Available result files:
  cell_concordance.csv (332.1 KB)
  chromosome_cis_effects.csv (6.0 KB)
  chromosome_cis_effects.png (66.1 KB)
  de_cancer_vs_normal.csv (1938.6 KB)
  de_cnv_subclusters.csv (5928.8 KB)
  de_concordance_within_subcluster.csv (29380.2 KB)
  de_concordant_vs_discordant.csv (1964.4 KB)
  de_high_vs_low_cnv.csv (2055.5 KB)
  dosage_sensitive_genes.csv (43.9 KB)
  embedding_distance_distribution.png (80.2 KB)
  escape_genes.csv (3.7 KB)
  gene_dosage_sensitivity.csv (3851.4 KB)
  subcluster_composition.csv (2.1 KB)
  subcluster_composition.png (112.9 KB)
  top_concordance_genes.png (21.2 KB)
  volcano_cancer_vs_normal.png (168.7 KB)
  volcano_concordance.png (62.9 KB)
  volcano_high_vs_low_cnv.png (168.5 KB)


In [3]:
# Load all result files
cell_concordance = pd.read_csv(f'{RESULTS_DIR}/cell_concordance.csv', index_col=0)
de_results = pd.read_csv(f'{RESULTS_DIR}/de_concordant_vs_discordant.csv')
escape_genes = pd.read_csv(f'{RESULTS_DIR}/escape_genes.csv')
compensation_genes = pd.read_csv(f'{RESULTS_DIR}/compensation_genes.csv')
subcluster_comp = pd.read_csv(f'{RESULTS_DIR}/subcluster_composition.csv')

print(f"Cells analyzed: {len(cell_concordance):,}")
print(f"Genes tested: {len(de_results):,}")
print(f"Escape genes: {len(escape_genes)}")
print(f"Compensation genes: {len(compensation_genes)}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/de_results/P0006/compensation_genes.csv'

## 2. Cell Concordance Overview

In [None]:
# Concordance distribution
print("Concordance classification:")
print(cell_concordance['cnv_concordance'].value_counts())

# Filter to cancer cells only
if 'cancer_vs_normal' in cell_concordance.columns:
    cancer_cells = cell_concordance[cell_concordance['cancer_vs_normal'] == 'Cancer']
    print(f"\nCancer cells only: {len(cancer_cells):,}")
    print(cancer_cells['cnv_concordance'].value_counts())
else:
    cancer_cells = cell_concordance[cell_concordance['cnv_concordance'] != 'Normal']

In [None]:
# Embedding distance distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram
axes[0].hist(cancer_cells['embedding_distance'], bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(cancer_cells['embedding_distance'].quantile(0.25), color='green', 
                linestyle='--', label='Q25 (Concordant)')
axes[0].axvline(cancer_cells['embedding_distance'].quantile(0.75), color='red', 
                linestyle='--', label='Q75 (Discordant)')
axes[0].set_xlabel('Embedding Distance')
axes[0].set_ylabel('Number of Cancer Cells')
axes[0].set_title('Expression-CNV Embedding Distance Distribution')
axes[0].legend()

# By concordance class
colors = {'Concordant': 'green', 'Intermediate': 'gray', 'Discordant': 'red'}
for cat in ['Concordant', 'Intermediate', 'Discordant']:
    mask = cancer_cells['cnv_concordance'] == cat
    if mask.sum() > 0:
        axes[1].hist(cancer_cells.loc[mask, 'embedding_distance'], bins=30, 
                     alpha=0.5, label=cat, color=colors[cat])
axes[1].set_xlabel('Embedding Distance')
axes[1].set_ylabel('Number of Cells')
axes[1].set_title('Distance by Concordance Class')
axes[1].legend()

plt.tight_layout()
plt.show()

## 3. Differential Expression Results

In [None]:
# Summary of DE results
n_sig = de_results['significant'].sum()
n_up = ((de_results['significant']) & (de_results['logfoldchanges'] > 0)).sum()
n_down = ((de_results['significant']) & (de_results['logfoldchanges'] < 0)).sum()

print(f"Total genes tested: {len(de_results):,}")
print(f"Significant genes (padj < 0.05, |logFC| > 0.5): {n_sig:,}")
print(f"  Escape genes (up in discordant): {n_up:,}")
print(f"  Compensation genes (down in discordant): {n_down:,}")

In [None]:
# Volcano plot
fig, ax = plt.subplots(figsize=(10, 8))

de_results['-log10_pval'] = -np.log10(de_results['pvals_adj'] + 1e-300)

# Color by significance
colors = []
for _, row in de_results.iterrows():
    if row['significant']:
        colors.append('red' if row['logfoldchanges'] > 0 else 'blue')
    else:
        colors.append('gray')

ax.scatter(de_results['logfoldchanges'], de_results['-log10_pval'], 
           c=colors, alpha=0.5, s=10)

# Threshold lines
ax.axhline(-np.log10(0.05), color='gray', linestyle='--', alpha=0.5)
ax.axvline(0.5, color='gray', linestyle='--', alpha=0.5)
ax.axvline(-0.5, color='gray', linestyle='--', alpha=0.5)

# Label top genes
top_genes = de_results.nlargest(15, '-log10_pval')
for _, row in top_genes.iterrows():
    ax.annotate(row['names'], (row['logfoldchanges'], row['-log10_pval']), 
                fontsize=8, alpha=0.8)

ax.set_xlabel('Log2 Fold Change (Discordant vs Concordant)')
ax.set_ylabel('-Log10 Adjusted P-value')
ax.set_title(f'{PATIENT}: Concordance DE Analysis\n(Red = Escape, Blue = Compensation)')

plt.tight_layout()
plt.show()

## 4. Top Escape Genes

These genes are expressed **higher** than expected given the CNV state in discordant cells.
They may represent:
- Transcriptional compensation mechanisms
- Alternative regulatory pathways
- Genes escaping CNV-driven silencing

In [None]:
# Top escape genes
print("Top 20 Escape Genes (up in discordant cancer cells):")
print("="*60)
escape_genes.head(20)[['gene', 'logFC', 'pval_adj', 'chromosome']].to_string()

In [None]:
# Visualize top escape genes
fig, ax = plt.subplots(figsize=(10, 8))

top_escape = escape_genes.head(20)
ax.barh(range(len(top_escape)), top_escape['logFC'].values, color='red', alpha=0.7)
ax.set_yticks(range(len(top_escape)))
ax.set_yticklabels(top_escape['gene'].values)
ax.set_xlabel('Log2 Fold Change')
ax.set_title('Top 20 Escape Genes\n(Higher in Discordant = Regulatory Escape)')
ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 5. Top Compensation Genes

These genes are expressed **lower** than expected given the CNV state in discordant cells.
They may represent:
- Dosage compensation mechanisms
- Epigenetic silencing
- Buffering mechanisms

In [None]:
# Top compensation genes
print("Top 20 Compensation Genes (down in discordant cancer cells):")
print("="*60)
compensation_genes.head(20)[['gene', 'logFC', 'pval_adj', 'chromosome']].to_string()

In [None]:
# Visualize top compensation genes
fig, ax = plt.subplots(figsize=(10, 8))

top_comp = compensation_genes.head(20)
ax.barh(range(len(top_comp)), top_comp['logFC'].values, color='blue', alpha=0.7)
ax.set_yticks(range(len(top_comp)))
ax.set_yticklabels(top_comp['gene'].values)
ax.set_xlabel('Log2 Fold Change')
ax.set_title('Top 20 Compensation Genes\n(Lower in Discordant = Dosage Compensation)')
ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 6. Subcluster Composition Analysis

Which CNV subclusters have the highest rates of discordance?

In [None]:
# Subcluster composition
subcluster_comp.sort_values('pct_discordant', ascending=False)

In [None]:
# Visualize subcluster composition
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

df = subcluster_comp.sort_values('pct_discordant', ascending=True)

# Stacked bar chart
x = range(len(df))
axes[0].barh(x, df['pct_concordant'], label='Concordant', color='green', alpha=0.7)
axes[0].barh(x, 100 - df['pct_concordant'] - df['pct_discordant'], 
             left=df['pct_concordant'], label='Intermediate', color='gray', alpha=0.5)
axes[0].barh(x, df['pct_discordant'], 
             left=df['pct_concordant'] + (100 - df['pct_concordant'] - df['pct_discordant']),
             label='Discordant', color='red', alpha=0.7)
axes[0].set_yticks(x)
axes[0].set_yticklabels([f"Cluster {c}" for c in df['subcluster']])
axes[0].set_xlabel('Percentage of Cancer Cells')
axes[0].set_title('Subcluster Composition by Concordance')
axes[0].legend(loc='lower right')

# CNV score vs discordance
scatter = axes[1].scatter(df['mean_cnv_score'], df['pct_discordant'], 
                          s=df['n_cells']/5, alpha=0.6, c='coral')
axes[1].set_xlabel('Mean CNV Score')
axes[1].set_ylabel('% Discordant Cells')
axes[1].set_title('CNV Burden vs Regulatory Discordance\n(bubble size = # cells)')

for _, row in df.iterrows():
    axes[1].annotate(f"{row['subcluster']}", 
                     (row['mean_cnv_score'], row['pct_discordant']),
                     fontsize=9)

plt.tight_layout()
plt.show()

## 7. Chromosome Distribution of DE Genes

In [None]:
# Check if chromosome info is available
if 'chromosome' in escape_genes.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Escape genes by chromosome
    escape_chroms = escape_genes['chromosome'].value_counts().sort_index()
    axes[0].bar(range(len(escape_chroms)), escape_chroms.values, color='red', alpha=0.7)
    axes[0].set_xticks(range(len(escape_chroms)))
    axes[0].set_xticklabels(escape_chroms.index, rotation=45)
    axes[0].set_xlabel('Chromosome')
    axes[0].set_ylabel('Number of Genes')
    axes[0].set_title('Escape Genes by Chromosome')
    
    # Compensation genes by chromosome
    comp_chroms = compensation_genes['chromosome'].value_counts().sort_index()
    axes[1].bar(range(len(comp_chroms)), comp_chroms.values, color='blue', alpha=0.7)
    axes[1].set_xticks(range(len(comp_chroms)))
    axes[1].set_xticklabels(comp_chroms.index, rotation=45)
    axes[1].set_xlabel('Chromosome')
    axes[1].set_ylabel('Number of Genes')
    axes[1].set_title('Compensation Genes by Chromosome')
    
    plt.tight_layout()
    plt.show()
else:
    print("Chromosome information not available in results")

## 8. Load Original Data for Deeper Analysis

In [None]:
# Import scanpy here (takes a few seconds)
import scanpy as sc

# Load the original AnnData to explore gene expression
adata = sc.read_h5ad(f'data/cnv_output/{PATIENT}/{PATIENT}_cnv.h5ad')
print(f"Loaded: {adata.n_obs:,} cells, {adata.n_vars:,} genes")

# Add concordance info
adata.obs['embedding_distance'] = cell_concordance['embedding_distance']
adata.obs['cnv_concordance'] = cell_concordance['cnv_concordance']

In [None]:
# Visualize top escape gene expression
if len(escape_genes) > 0:
    top_gene = escape_genes.iloc[0]['gene']
    if top_gene in adata.var_names:
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        
        # Filter to cancer cells
        adata_cancer = adata[adata.obs['cancer_vs_normal'] == 'Cancer'].copy()
        
        # Expression by concordance
        sc.pl.violin(adata_cancer, top_gene, groupby='cnv_concordance', ax=axes[0], show=False)
        axes[0].set_title(f'{top_gene} Expression by Concordance')
        
        # UMAP colored by gene expression
        if 'X_umap' in adata.obsm:
            sc.pl.umap(adata_cancer, color=top_gene, ax=axes[1], show=False)
            axes[1].set_title(f'{top_gene} Expression (UMAP)')
        
        plt.tight_layout()
        plt.show()

In [None]:
# Heatmap of top escape and compensation genes
top_escape_names = escape_genes.head(10)['gene'].tolist()
top_comp_names = compensation_genes.head(10)['gene'].tolist()
all_top_genes = [g for g in top_escape_names + top_comp_names if g in adata.var_names]

if len(all_top_genes) > 0:
    # Filter to cancer cells with concordant/discordant status
    adata_subset = adata[
        (adata.obs['cancer_vs_normal'] == 'Cancer') & 
        (adata.obs['cnv_concordance'].isin(['Concordant', 'Discordant']))
    ].copy()
    
    sc.pl.heatmap(adata_subset, all_top_genes, groupby='cnv_concordance', 
                  figsize=(12, 8), show_gene_labels=True,
                  cmap='RdBu_r', vmin=-2, vmax=2)

## 9. Summary Statistics

In [None]:
print("="*60)
print(f"CONCORDANCE ANALYSIS SUMMARY: {PATIENT}")
print("="*60)

# Cell counts
print(f"\nCancer cells analyzed: {len(cancer_cells):,}")
for cat in ['Concordant', 'Intermediate', 'Discordant']:
    n = (cancer_cells['cnv_concordance'] == cat).sum()
    pct = 100 * n / len(cancer_cells)
    print(f"  {cat}: {n:,} ({pct:.1f}%)")

# DE summary
print(f"\nDifferential Expression:")
print(f"  Total significant genes: {n_sig:,}")
print(f"  Escape genes (up in discordant): {n_up:,}")
print(f"  Compensation genes (down in discordant): {n_down:,}")

# Top genes
print(f"\nTop 5 Escape Genes:")
for _, row in escape_genes.head(5).iterrows():
    print(f"  {row['gene']}: logFC={row['logFC']:.2f}, p={row['pval_adj']:.2e}")

print(f"\nTop 5 Compensation Genes:")
for _, row in compensation_genes.head(5).iterrows():
    print(f"  {row['gene']}: logFC={row['logFC']:.2f}, p={row['pval_adj']:.2e}")

# Subcluster with highest discordance
top_disc = subcluster_comp.nlargest(1, 'pct_discordant').iloc[0]
print(f"\nSubcluster with highest discordance:")
print(f"  Cluster {top_disc['subcluster']}: {top_disc['pct_discordant']:.1f}% discordant")
print(f"  Mean CNV score: {top_disc['mean_cnv_score']:.3f}")

## 10. Export Gene Lists for Pathway Analysis

In [None]:
# Export gene lists for external pathway analysis (e.g., Enrichr, GSEA)
output_dir = f'{RESULTS_DIR}/pathway_analysis'
os.makedirs(output_dir, exist_ok=True)

# Escape genes (for upregulated pathway analysis)
escape_gene_list = escape_genes['gene'].tolist()
with open(f'{output_dir}/escape_genes.txt', 'w') as f:
    f.write('\n'.join(escape_gene_list))
print(f"Saved {len(escape_gene_list)} escape genes to {output_dir}/escape_genes.txt")

# Compensation genes (for downregulated pathway analysis)
comp_gene_list = compensation_genes['gene'].tolist()
with open(f'{output_dir}/compensation_genes.txt', 'w') as f:
    f.write('\n'.join(comp_gene_list))
print(f"Saved {len(comp_gene_list)} compensation genes to {output_dir}/compensation_genes.txt")

# All significant genes with direction
sig_genes = de_results[de_results['significant']][['names', 'logfoldchanges', 'pvals_adj']].copy()
sig_genes.columns = ['gene', 'logFC', 'pval_adj']
sig_genes['direction'] = np.where(sig_genes['logFC'] > 0, 'escape', 'compensation')
sig_genes.to_csv(f'{output_dir}/significant_genes_ranked.csv', index=False)
print(f"Saved {len(sig_genes)} significant genes to {output_dir}/significant_genes_ranked.csv")

print(f"\nGene lists ready for pathway analysis tools like:")
print("  - Enrichr (https://maayanlab.cloud/Enrichr/)")
print("  - GSEA (https://www.gsea-msigdb.org/)")
print("  - DAVID (https://david.ncifcrf.gov/)")