# In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import sparse
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the count matrix.
# Assumes counts.csv has genes as rows and cells as columns, with the gene
# identifiers in the first column -- TODO confirm against the data export.
counts = pd.read_csv("counts.csv", index_col=0)

# Transpose so that rows are cells and columns are genes
counts = counts.transpose()

# Drop cells with no counts at all. Their library size is 0, so CPM
# normalization would compute 0/0 = NaN for every gene, and those NaNs would
# only surface later as an error inside PCA.
library_sizes = counts.sum(axis=1)
empty_cells = library_sizes == 0
if empty_cells.any():
    print(f"Dropping {int(empty_cells.sum())} cell(s) with zero total counts")
    counts = counts.loc[~empty_cells]
    library_sizes = library_sizes.loc[~empty_cells]

# Normalization (CPM): rescale every cell to one million total counts
counts = counts.div(library_sizes, axis=0).mul(10**6)

# Logarithmize the data (log1p maps zero counts to zero)
counts = np.log1p(counts)

# Scale each gene (column) to zero mean and unit variance, and put the
# cell/gene labels back on the resulting array
scaler = StandardScaler()
counts_scaled = pd.DataFrame(
    scaler.fit_transform(counts),
    columns=counts.columns,
    index=counts.index,
)

# Run PCA.
# Ask for up to 50 components, but never more than the data can support:
# PCA raises when n_components exceeds min(n_cells, n_genes).
n_components = min(50, *counts_scaled.shape)
# A fixed seed keeps the embedding reproducible when scikit-learn selects a
# randomized SVD solver.
pca = PCA(n_components=n_components, random_state=0)
pca_result = pca.fit_transform(counts_scaled)

# Plot cumulative explained variance against the number of components kept.
# The x values start at 1 so that each point reads as "variance explained by
# the first k components" rather than being shifted down by one.
component_counts = np.arange(1, pca.n_components_ + 1)
plt.plot(component_counts, np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

# Run k-means clustering on the PCA embedding.
# KMeans needs at least as many samples as clusters, so cap the cluster count
# at the number of cells instead of failing on small inputs.
n_clusters = min(10, pca_result.shape[0])
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(pca_result)

# Attach each cell's cluster label to the expression dataframe.
# NOTE: by this point `counts` holds the log-normalized values, not the raw
# counts, and the label is stored as an extra 'cluster' column next to the genes.
counts['cluster'] = kmeans.labels_
