In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
def get_count_data():
    """
    Read in the raw read count data, and cell type identities as available on NCBI GEO
    """
    
    # read in count data
    fname = 'Matlab/GSE99888_gene_expression.tab'
    df = pd.read_csv(fname, sep='\t', header=0, index_col=0)
    
    # read in cell type labels
    fname = 'References/Harris References/Cell Identities.tsv'
    df_labels = pd.read_csv(fname, sep='\t', header=0, index_col=0)
    df_labels = df_labels.loc[df.columns,:]
    
    # add multi indexing
    arrays = [df_labels.Accession, df_labels.CellType]
    names = ('Cell', 'CellType')
    df.columns = pd.MultiIndex.from_arrays(arrays, names=names)
    
    return df

def get_matrix_data():
    """
    Read in the raw count data, from Harris et al's nbt-SNE data
    """
    fname = 'Matlab/Matlab_Counts.tsv'
    params = {'sep':'\t', 'header':[0,1], 'index_col':0}
    df = pd.read_csv(fname, **params)
    
    return df

def get_data():
    """
    Read in all raw count data
    """
    
    df_count = get_count_data()
    df_matrix = get_matrix_data()
    
    return df_count, df_matrix

def get_pca_reduce(df_count, df_matrix):
    """
    Reduce input matrices via PCA transformation
    """
    datalist_count = df_count.values.T
    datalist_matrix = df_matrix.values.T
    pca = PCA(n_components=100)
    pca.fit(datalist_count)
    
    datalist_count = pca.transform(datalist_count)
    datalist_matrix = pca.transform(datalist_matrix)
    
    df_count = pd.DataFrame(datalist_count, index=df_count.columns)
    df_matrix = pd.DataFrame(datalist_matrix, index=df_matrix.columns)
    
    return df_count, df_matrix

def get_dist_matrix(df_count, df_matrix):
    """
    Produce a distance matrix between cells of df_count and df_matrix
    """
    df_dist = pd.DataFrame(np.NaN, index=df_count.columns, columns=df_matrix.columns)
    df_count, df_matrix = get_pca_reduce(df_count, df_matrix)
    
    for index, data_index in df_count.iterrows():
        for column, data_column in df_matrix.iterrows():
            df_dist.loc[index, column] = np.sum(np.square(data_index.values - data_column.values))
    
    return df_dist

def create_dist_matrix():
    """
    Produce a distance matrix between cells of df_count and df_matrix
    """
    df_count, df_matrix = get_data()
    df_dist = get_dist_matrix(df_count, df_matrix)
    
    df_dist.to_csv('Matlab/Harris_Compare.tsv', sep='\t')
    
    return

def create_matching_matrix():
    """
    Read in distance matrix, to create a pandas dataframe that matches NCBI cell labels
    to nbt-SNE cell labels. Save it as Harris_Matches.tsv
    """
    # read in distance matrix
    fname = 'Matlab/Harris_Compare.tsv'
    params = {'sep':'\t', 'header':[0,1], 'index_col':[0,1]}
    df_dist = pd.read_csv(fname, **params)
    
    # for each column find nearest match
    df_match = df_comp.idxmin(axis=0).apply(pd.Series)
    df_match.columns = ['Cell', 'CellType']
    df_match['Name'] = df_match.index.get_level_values(0)
    df_match['gSetType'] = df_match.index.get_level_values(1)
    
    # save data
    df_match.to_csv('Matlab/Harris_Matches.tsv', sep='\t', index=False)
    
    return

def read_harris_nbtsne_data():
    """
    Get Harris et al's nbt-SNE reduction data
    """
    fname = 'Matlab/Harris_nbtsne.tsv'
    params = {'sep':'\t', 'header':0, 'index_col':[0,1], 'usecols':[0,1,2,3]}
    df = pd.read_csv(fname, **params)
    df.columns = ['Plot_X', 'Plot_Y']
    
    return df

def read_match_data():
    """
    Read in the Harris_Matches.tsv file produced by create_matching_matrix
    """
    fname = 'Matlab/Harris_Matches.tsv'
    params = {'sep':'\t', 'header':0, 'index_col':2}
    df = pd.read_csv(fname, **params)
    
    return df

def create_nbtsne_data():
    """
    Reproduce Harris et al's nbt-SNE data, with cell labels fixed
    """
    # read in data
    df_nbtsne = read_harris_nbtsne_data()
    df_match = read_match_data()
    df_match = df_match.loc[df_nbtsne.index.get_level_values(0),:]
    
    # adjust index labels
    arrays = [df_match.Cell, df_match.gSetType]
    names = ('Cell', 'CellType')
    df_nbtsne.index = pd.MultiIndex.from_arrays(arrays, names=names)
    
    df_nbtsne.to_csv('Mapping/Embeddings/Harris_nbtSNE.tsv', sep='\t')
    
    return df_nbtsne

In [3]:
%%time

create_dist_matrix()

CPU times: user 44 s, sys: 1.54 s, total: 45.5 s
Wall time: 46.1 s


In [4]:
%%time

create_matching_matrix()

CPU times: user 2h 35min 57s, sys: 2.1 s, total: 2h 35min 59s
Wall time: 2h 35min 20s


In [None]:
%%time

create_nbtsne_data()