# In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import issparse

def _compute_qc_metrics(X, mito_mask):
    """Compute per-cell QC metrics from a cells-by-genes count matrix.

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse matrix
        Matrix with one row per cell and one column per gene.
    mito_mask : array-like of bool
        Boolean mask over the columns of ``X`` selecting mitochondrial genes.

    Returns
    -------
    tuple of numpy.ndarray
        ``(n_counts, n_genes, pct_counts_mt)``; each is a flat 1-D array with
        one entry per row of ``X``. ``pct_counts_mt`` is a percentage (0-100)
        and is 0 for rows whose total count is 0.
    """
    mito_mask = np.asarray(mito_mask, dtype=bool)

    # ``np.asarray(...).ravel()`` flattens both the ``np.matrix`` of shape
    # (n, 1) returned by sparse sums and the 1-D array returned by dense sums,
    # so a single code path serves both storage formats.
    n_counts = np.asarray(X.sum(axis=1)).ravel()
    n_genes = np.asarray((X > 0).sum(axis=1)).ravel()

    pct_counts_mt = np.zeros(n_counts.shape[0], dtype=float)
    if mito_mask.any():
        mito_counts = np.asarray(X[:, mito_mask].sum(axis=1)).ravel()
        # ``where`` skips rows with no counts, leaving their value at 0
        # instead of producing NaN/inf from a division by zero.
        np.divide(mito_counts, n_counts, out=pct_counts_mt, where=n_counts > 0)
        pct_counts_mt *= 100.0

    return n_counts, n_genes, pct_counts_mt


def perform_eda(adata, *, mito_prefix='MT-', output_path='processed_data.h5ad'):
    """Run an exploratory single-cell analysis pipeline on ``adata`` in place.

    Steps, in order: print a dataset overview, compute and plot per-cell QC
    metrics, filter cells and genes, log-normalize, select highly variable
    genes, run PCA, build the neighbor graph, compute UMAP and Louvain
    clusters, plot the embedding, and write the result to disk.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix. It is modified in place: ``obs`` gains
        ``n_counts``, ``n_genes`` and ``pct_counts_mt``, and the scanpy
        calls below filter, normalize and annotate it.
    mito_prefix : str, optional
        Prefix of ``adata.var_names`` entries treated as mitochondrial genes.
        Defaults to ``'MT-'``.
    output_path : str or path-like or None, optional
        Destination of the processed ``.h5ad`` file. Pass ``None`` to skip
        writing. Defaults to ``'processed_data.h5ad'``.

    Notes
    -----
    ``pct_counts_mt`` is stored as a percentage (0-100), matching the scale
    of the same-named column from ``scanpy.pp.calculate_qc_metrics``.
    """
    # Basic dataset overview
    print(f'Number of cells: {adata.n_obs}')
    print(f'Number of genes: {adata.n_vars}')
    print(f'First few variable names: {adata.var_names[:5]}')

    # Cell quality control metrics (computed on the unfiltered, raw matrix)
    mito_mask = np.asarray(adata.var_names.str.startswith(mito_prefix), dtype=bool)
    n_counts, n_genes, pct_counts_mt = _compute_qc_metrics(adata.X, mito_mask)
    adata.obs['n_counts'] = n_counts
    adata.obs['n_genes'] = n_genes
    adata.obs['pct_counts_mt'] = pct_counts_mt

    # Plot QC metrics
    sc.pl.violin(adata, ['n_counts', 'n_genes', 'pct_counts_mt'], jitter=0.4, multi_panel=True)
    sc.pl.scatter(adata, x='n_counts', y='pct_counts_mt')
    sc.pl.scatter(adata, x='n_counts', y='n_genes')

    # Filter cells and genes
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    # Log-normalize the data
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Highly variable genes
    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
    sc.pl.highly_variable_genes(adata)

    # Dimensionality reduction with PCA
    sc.tl.pca(adata, svd_solver='arpack')
    sc.pl.pca_variance_ratio(adata, log=True)

    # Clustering and UMAP visualization
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)
    sc.tl.louvain(adata, resolution=0.5)
    sc.pl.umap(adata, color=['louvain', 'n_counts', 'n_genes', 'pct_counts_mt'])

    # Save the processed data
    if output_path is not None:
        adata.write(output_path)

    plt.show()

# Usage: runs the pipeline only when this file is executed directly (as a
# script or in a notebook cell), so importing the module does not trigger
# file reads, plotting, or writes.
if __name__ == '__main__':
    # Placeholder path: replace with the location of the input .h5ad file.
    adata = sc.read_h5ad('path_to_your_file.h5ad')
    perform_eda(adata)
