In [None]:
# Import necessary libraries for directory creation
import os

# Folders that later cells read from or write to, relative to the working directory
dirs = ['data_3k', 'results_3k', 'figures']

# Walk the list once: report folders that are already present, create the rest
for directory in dirs:
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        continue
    os.makedirs(directory)
    print(f"Directory '{directory}' created.")


In [None]:
import tarfile

# Path to the tar file
tar_file_path = "data_3k/pbmc3k_filtered_gene_bc_matrices.tar.gz"

# Extract next to the archive, into 'data_3k'. The loading cell reads from
# "data_3k/filtered_gene_bc_matrices/hg19/", so extracting into a separate
# 'data' folder (as before) would leave that path missing.
# NOTE(review): assumes the archive's top-level folder is
# 'filtered_gene_bc_matrices/' - confirm against the downloaded file.
extract_dir = "data_3k"

with tarfile.open(tar_file_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available on this interpreter: use the
        # "data" filter so archive members cannot write outside extract_dir.
        tar.extractall(path=extract_dir, filter="data")
    else:
        # Older Python without extraction filters: plain extraction.
        tar.extractall(path=extract_dir)
    print("Data unzipped successfully.")


In [None]:
import scanpy as sc

# Where the extracted 10X files live, and where the raw checkpoint is written
data_path = "data_3k/filtered_gene_bc_matrices/hg19/"
raw_h5ad_path = "results_3k/adata_raw.h5ad"

# Build an AnnData object from the 10X directory, naming variables by gene symbol
adata = sc.read_10x_mtx(
    data_path,
    var_names='gene_symbols',
    cache=True,
)
print("Data loaded successfully.")

# Checkpoint the untouched object so later cells can restart from disk
adata.write(raw_h5ad_path)
print("AnnData object saved as 'adata_raw.h5ad'.")


In [None]:
# Checkpoint files: input written by the loading cell, output of this cell
raw_checkpoint = "results_3k/adata_raw.h5ad"
preprocessed_checkpoint = "results_3k/adata_preprocessed.h5ad"

# Start again from the raw checkpoint on disk
adata = sc.read(raw_checkpoint)

# Basic filtering: keep cells with at least 200 detected genes,
# and genes detected in at least 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Flag genes whose name starts with "MT-" and compute QC metrics,
# including the metrics for that flagged gene set
mt_mask = adata.var_names.str.startswith('MT-')
adata.var['mt'] = mt_mask
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# Normalize every cell to a total of 1e4, then apply log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Checkpoint the preprocessed object
adata.write(preprocessed_checkpoint)
print("Preprocessed data saved as 'adata_preprocessed.h5ad'.")


In [None]:
# Load the preprocessed data
adata = sc.read("results_3k/adata_preprocessed.h5ad")

# Identify highly variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

# Freeze the log-normalized, all-gene matrix in `.raw` before subsetting and
# scaling. scanpy's plotting and rank_genes_groups use `.raw` by default when
# it is set, so marker genes stay available even if they are not highly
# variable, and differential expression is not computed on z-scored,
# clipped values.
adata.raw = adata.copy()

# Keep only the highly variable genes. `.copy()` turns the sliced view into a
# real AnnData object, so the in-place scaling below does not have to make an
# implicit copy of a view.
adata = adata[:, adata.var.highly_variable].copy()

# After subsetting every remaining gene is flagged, so this is the HVG count
hvg_3k = int(adata.var['highly_variable'].sum())
print(f"Number of highly variable genes in the 3k dataset: {hvg_3k}")

# Scale the data for PCA
# Scale each gene to mean 0 and variance 1, clipping values above 10
sc.pp.scale(adata, zero_center=True, max_value=10)

# Perform PCA
sc.tl.pca(adata, svd_solver='arpack')
# scanpy appends the `save` string to its default file name for this plot,
# so a suffix of "_3k.png" is enough; passing "pca_3k.png" duplicated the
# "pca" prefix in the written file name.
sc.pl.pca(adata, save="_3k.png")
print("PCA complete: Saved PCA plot in the 'figures' folder.")

# Save the PCA-transformed data
adata.write("results_3k/adata_pca.h5ad")
print("PCA completed and saved.")


In [None]:

!pip install leidenalg
!pip install igraph

import leidenalg
import igraph

# Load the PCA data
adata = sc.read("results_3k/adata_pca.h5ad")

# Compute the neighborhood graph
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

# Run UMAP
sc.tl.umap(adata)

# Perform clustering using the Leiden algorithm
sc.tl.leiden(adata)

# Save the clustered data
adata.write("results_3k/adata_clustered.h5ad")
print("Clustering completed and saved.")

In [None]:

# Restart from the clustering checkpoint on disk
adata = sc.read("results_3k/adata_clustered.h5ad")

# Show the first ten variable names as a quick sanity check
first_var_names = adata.var_names[:10]
print(first_var_names)

# UMAP colored by the Leiden cluster label
cluster_keys = ['leiden']
sc.pl.umap(adata, color=cluster_keys, save='_clusters_3k.png')

# UMAP colored by the expression of each selected marker gene
marker_genes = ['CST3', 'NKG7', 'PPBP', 'MS4A1']
sc.pl.umap(adata, color=marker_genes, save='_markers_3k.png')

print("Plots saved in the 'figures' directory.")


In [None]:
# Import necessary libraries
import scanpy as sc
import pandas as pd

# Differential expression: rank genes for every 'leiden' cluster with a t-test
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')

# Save the ranked-genes plot. scanpy appends the `save` string to its own
# default file name for this plot, so only a suffix is passed; the previous
# full name ('ranked_genes_leiden.png') was glued onto that default prefix.
# The "_3k" suffix matches the UMAP figures saved earlier in the notebook.
sc.pl.rank_genes_groups(adata, save='_3k.png')

# 'names' is a structured array with one field per cluster, so a single
# DataFrame call yields one column of ranked gene names per cluster.
# The former per-cluster dict comprehension rebuilt this same DataFrame
# once for every cluster.
ranked_genes_df = pd.DataFrame(adata.uns['rank_genes_groups']['names'])

# Save the ranked genes as CSV in the 'results_3k' folder
ranked_genes_df.to_csv('results_3k/ranked_genes_leiden.csv', index=False)
