In [1]:
def GetAccuracy(truth, label):
    """Print and return the pairwise agreement between two labelings.

    Every ordered pair of samples ``(i, j)`` (including ``i == j``) counts as
    an agreement when both labelings put the two samples in the same group, or
    both put them in different groups.  The score is the fraction of agreeing
    pairs, so it does not depend on how the group ids are named.

    Parameters
    ----------
    truth : sequence
        Reference label of each sample.
    label : sequence
        Predicted label of each sample; must have the same length as ``truth``.

    Returns
    -------
    numpy.float64
        Fraction of agreeing pairs, in ``[0, 1]`` (``nan`` for empty input,
        which is what ``np.mean`` yields for an empty array).  The value is
        also printed, as before.

    Raises
    ------
    ValueError
        If ``truth`` and ``label`` have different lengths.
    """
    truth = np.asarray(truth)
    label = np.asarray(label)
    if len(truth) != len(label):
        raise ValueError(
            "truth and label must have the same length, got %d and %d"
            % (len(truth), len(label))
        )

    # Boolean "same group?" matrices built by broadcasting instead of nested
    # Python loops over every (i, j) pair.
    same_in_truth = truth[:, None] == truth[None, :]
    same_in_label = label[:, None] == label[None, :]

    # A pair agrees exactly when both matrices hold the same boolean; this is
    # the same quantity as mean(|+-0.5 +- 0.5|) over the two relation matrices.
    accuracy = np.mean(same_in_truth == same_in_label)
    print(accuracy)
    return accuracy

In [4]:
# import some necessary packages
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from plot import getTsne
from data_helper import scanpy

def CreateDir_PCA(filepath, num, clusters, mingene, mincell):
    """Make sure the PCA output folders for this run exist.

    Two folders are created when missing, one under ``./data`` and one under
    ``./process_images``; both end in
    ``pbmc_gene=<mingene>_cell=<mincell>/PCA_branch_<num>D``.
    ``filepath`` and ``clusters`` are accepted but not used here.
    """
    branch = "/pbmc_gene=" + str(mingene) + "_cell=" + str(mincell) + "/PCA_branch_" + str(num) + "D"
    # data folder first, then the image folder
    for root in ("./data", "./process_images"):
        target = root + branch
        if os.path.exists(target):
            continue
        os.makedirs(target)

def CreateDir_TSNE(filepath, num, clusters, mingene, mincell):
    """Make sure the t-SNE output folders for this run exist.

    Two folders are created when missing, one under ``./data`` and one under
    ``./process_images``; both end in
    ``pbmc_gene=<mingene>_cell=<mincell>/TSNE_branch_<num>D``.
    ``filepath`` and ``clusters`` are accepted but not used here.
    """
    branch = "/pbmc_gene=" + str(mingene) + "_cell=" + str(mincell) + "/TSNE_branch_" + str(num) + "D"
    # data folder first, then the image folder
    for root in ("./data", "./process_images"):
        target = root + branch
        if os.path.exists(target):
            continue
        os.makedirs(target)

        
def GetPCA(filepath, num, clusters, mingene, mincell):
    """Reduce a table with PCA, cluster it with K-means and save the results.

    Steps:
      1. Read the tab-separated table at ``filepath``; its first column is
         used as the row index and treated as the ground-truth label per row.
      2. Project the values onto ``num`` principal components and write the
         coordinates in three layouts (bare TSV, headed CSV, headed TSV
         without index).
      3. Cluster the projected rows into ``clusters`` groups with KMeans.
      4. Call ``getTsne`` (imported from ``plot``) with the headed CSV, an
         image path and the K-means labels; its two return values are used as
         the x / y coordinates of every row.
         NOTE(review): presumably a 2-D t-SNE embedding - confirm in plot.py.
      5. Write those coordinates and the K-means labels to disk, save a
         scatter plot coloured by ground truth, and print the pairwise
         agreement via ``GetAccuracy``.

    All files go under
    ``./data/pbmc_gene=<mingene>_cell=<mincell>/PCA_branch_<num>D`` and the
    matching ``./process_images/...`` folder, which must already exist
    (``CreateDir_PCA`` creates them).

    Parameters
    ----------
    filepath : str
        Tab-separated input table with a header row and labels in column 0.
    num : int
        Number of principal components to keep.
    clusters : int
        Number of K-means clusters.
    mingene, mincell
        Only used to build the output folder name.
    """
    # Build the two output folders once instead of re-concatenating per file.
    branch = "/pbmc_gene="+str(mingene)+"_cell="+str(mincell)+"/PCA_branch_"+str(num)+"D"
    data_dir = "./data" + branch
    image_dir = "./process_images" + branch

    df = pd.read_csv(filepath,header='infer',index_col=0,sep='\t')
    truth = df.index.values.tolist()
    X = df.values

    # First get pca coordinates.
    # fit_transform() fits the model itself, so a separate .fit(X) beforehand
    # only repeated the decomposition.
    X_ipca = PCA(n_components=num).fit_transform(X)
    processed_PCA = pd.DataFrame(X_ipca,index=df.index.values,columns=[i+1 for i in range(num)])
    processed_PCA.to_csv(data_dir+"/PCA_coordinate.csv",
                         sep="\t",header=False,index=False)
    processed_PCA.to_csv(data_dir+"/PCA_coordinate_headed.csv", sep=",")
    processed_PCA.to_csv(data_dir+"/PCA_coordinate_headed_noindex.tsv", sep="\t",index=False)

    # Then get label from K-means
    label = KMeans(n_clusters=clusters).fit_predict(X_ipca)
    # generate graph with K-means label
    (lisXtsne, lisYtsne) = getTsne(data_dir+"/PCA_coordinate_headed.csv",
                                   image_dir+"/tsne.jpg",
                                   label)

    # Generate tsne coordinate file: one row per sample, indexed by its cluster
    tsne_x = np.array([lisXtsne])
    tsne_y = np.array([lisYtsne])
    tsne_coor = np.concatenate((tsne_x.T, tsne_y.T),axis = 1)
    tsne_file = pd.DataFrame(tsne_coor, index=label, columns=["tsne_x","tsne_y"])
    tsne_file.to_csv(data_dir+"/tsne_coordinate.csv")

    # Generate label file (single column, written under the header "cluster")
    label_file = pd.DataFrame(label, columns = ["label"])
    label_file.to_csv(data_dir+"/label.tsv",sep="\t", header = ["cluster"], index =False)

    # generate graph with ground truth label
    colors = ['black', 'purple', 'blue', 'yellow', 'pink', 'grey', 'green', 'bisque', 'blanchedalmond', 'red',
                  'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate']
    # Classes get colours in order of first appearance, which is reproducible
    # between runs (set iteration order is not, for string labels).  The
    # modulo wraps the palette instead of raising IndexError past 16 classes.
    color_of = {}
    for position, cls in enumerate(dict.fromkeys(truth)):
        color_of[cls] = colors[position % len(colors)]
    point_colors = [color_of[truth[i]] for i in range(len(lisXtsne))]

    fig = plt.figure(figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
    # One scatter call for all points rather than one call per point.
    plt.scatter(lisXtsne, lisYtsne, c=point_colors)
    plt.title("ground truth")
    plt.xlabel('x')
    plt.ylabel('y')
    fig.savefig(image_dir+"/ground_truth.jpg")

    # calculate accuracy
    GetAccuracy(np.array(truth), label)
    
    
def GetTSNE(filepath, num, clusters, mingene, mincell):
    """Embed a table with t-SNE, cluster it with K-means and save the results.

    Steps:
      1. Read the tab-separated table at ``filepath``; its first column is
         used as the row index and treated as the ground-truth label per row.
      2. Embed the values into ``num`` dimensions with t-SNE
         (``early_exaggeration=50``, ``perplexity=300``) and write the
         coordinates in three layouts (bare TSV, headed CSV, headed TSV
         without index).
      3. Cluster the embedded rows into ``clusters`` groups with KMeans.
      4. Call ``getTsne`` (imported from ``plot``) with the headed CSV, an
         image path and the K-means labels; its two return values are used as
         the x / y coordinates of every row.
         NOTE(review): presumably a 2-D t-SNE embedding - confirm in plot.py.
      5. Write those coordinates and the K-means labels to disk, save a
         scatter plot coloured by ground truth, and print the pairwise
         agreement via ``GetAccuracy``.

    All files go under
    ``./data/pbmc_gene=<mingene>_cell=<mincell>/TSNE_branch_<num>D`` and the
    matching ``./process_images/...`` folder, which must already exist
    (``CreateDir_TSNE`` creates them).

    Parameters
    ----------
    filepath : str
        Tab-separated input table with a header row and labels in column 0.
    num : int
        Number of t-SNE output dimensions.
    clusters : int
        Number of K-means clusters.
    mingene, mincell
        Only used to build the output folder name.
    """
    # Build the two output folders once instead of re-concatenating per file.
    branch = "/pbmc_gene="+str(mingene)+"_cell="+str(mincell)+"/TSNE_branch_"+str(num)+"D"
    data_dir = "./data" + branch
    image_dir = "./process_images" + branch

    df = pd.read_csv(filepath,header='infer',index_col=0,sep='\t')
    truth = df.index.values.tolist()
    X = df.values

    # First get tSNE coordinates.
    # fit_transform() runs the optimisation itself; calling .fit(X) first ran
    # the whole (expensive) t-SNE a second time for nothing.
    X_tSNE = TSNE(n_components=num, early_exaggeration=50, perplexity=300).fit_transform(X)
    processed_tSNE = pd.DataFrame(X_tSNE,index=df.index.values,columns=[i+1 for i in range(num)])
    processed_tSNE.to_csv(data_dir+"/TSNE_coordinate.csv",
                          sep="\t",header=False,index=False)
    processed_tSNE.to_csv(data_dir+"/TSNE_coordinate_headed.csv", sep=",")
    processed_tSNE.to_csv(data_dir+"/TSNE_coordinate_headed_noindex.tsv", sep="\t",index=False)

    # Then get label from K-means
    label = KMeans(n_clusters=clusters).fit_predict(X_tSNE)
    # generate graph with K-means label
    (lisXtsne, lisYtsne) = getTsne(data_dir+"/TSNE_coordinate_headed.csv",
                                   image_dir+"/tsne.jpg",
                                   label)

    # Generate tsne coordinate file: one row per sample, indexed by its cluster
    tsne_x = np.array([lisXtsne])
    tsne_y = np.array([lisYtsne])
    tsne_coor = np.concatenate((tsne_x.T, tsne_y.T),axis = 1)
    tsne_file = pd.DataFrame(tsne_coor, index=label, columns=["tsne_x","tsne_y"])
    tsne_file.to_csv(data_dir+"/post_tsne_coordinate.csv")

    # Generate label file (single column, written under the header "cluster")
    label_file = pd.DataFrame(label, columns = ["label"])
    label_file.to_csv(data_dir+"/label.tsv",sep="\t", header = ["cluster"], index =False)

    # generate graph with ground truth label
    colors = ['black', 'purple', 'blue', 'yellow', 'pink', 'grey', 'green', 'bisque', 'blanchedalmond', 'red',
                  'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate']
    # Classes get colours in order of first appearance, which is reproducible
    # between runs (set iteration order is not, for string labels).  The
    # modulo wraps the palette instead of raising IndexError past 16 classes.
    color_of = {}
    for position, cls in enumerate(dict.fromkeys(truth)):
        color_of[cls] = colors[position % len(colors)]
    point_colors = [color_of[truth[i]] for i in range(len(lisXtsne))]

    fig = plt.figure(figsize=(20, 10), dpi=80, facecolor='w', edgecolor='k')
    # One scatter call for all points rather than one call per point.
    plt.scatter(lisXtsne, lisYtsne, c=point_colors)
    plt.title("ground truth")
    plt.xlabel('x')
    plt.ylabel('y')
    fig.savefig(image_dir+"/ground_truth.jpg")

    # calculate accuracy
    GetAccuracy(np.array(truth), label)

In [5]:
# Driver cell: prepare output folders, filter the raw table, attach row/column
# names to the filtered matrix, then run the t-SNE branch on it.

# Run configuration. mingene / mincell are passed to scanpy() and also appear
# in every output folder name.
mingene = 200
mincell = 3
PCAdim = 10
TSNEdim = 3
clusters = 8
filepath = "./data/data_indexed_with_label_transposed.csv"


# Create the output folders for both branches. Only the t-SNE branch is
# actually run further down; GetPCA is not called in this cell.
CreateDir_PCA(filepath, PCAdim, clusters, mingene, mincell)
CreateDir_TSNE(filepath, TSNEdim, clusters, mingene, mincell)


# scanpy here is the helper imported from data_helper, not the scanpy package.
# NOTE(review): presumably it filters by the mingene / mincell thresholds and
# getScanpy() writes the filtered matrix to the given path - confirm in
# data_helper; the read_csv below relies on that file existing.
data = scanpy(filepath, mingene, mincell)
data.getScanpy("./data/pbmc_gene="+str(mingene)+"_cell="+str(mincell)+"/filtered.tsv")


# Re-read the filtered file as a bare matrix (no header row, no index column)
# and attach the filtered cell list as row labels and the filtered gene list
# as column headers when writing it back out.
head = data.getFilteredGeneList().tolist()
index = data.getFilteredCellList().tolist()
headed = pd.read_csv("./data/pbmc_gene="+str(mingene)+"_cell="+str(mincell)+"/filtered.tsv",sep="\t",
                     header=None, index_col=None)
headed.index = index
headed.to_csv("./data/pbmc_gene="+str(mingene)+"_cell="+str(mincell)+"/filtered_headed.tsv",sep="\t",
             header=head)


# Run the t-SNE branch on the labelled, filtered table.
# NOTE(review): whether `coordinate` receives anything useful depends on
# GetTSNE returning a value - check its definition before relying on it.
coordinate = GetTSNE("./data/pbmc_gene="+str(mingene)+"_cell="+str(mincell)+"/filtered_headed.tsv",
                 TSNEdim, clusters, mingene, mincell)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
(2638, 1204)


KeyboardInterrupt: 