In [1]:
import sys
import numpy as np
import pandas as pd

from Modules import rnaseqTools
from Modules import map_to_embeddings

In [2]:
def plot_embedding_data(df, ax, colors, alpha=1, s=8, annotate=False):
    """Scatter an embedding on ``ax`` and optionally label per-type medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Plot_X`` and ``Plot_Y`` columns. When ``annotate`` is
        true, its index must also carry a ``CellType`` level of strings.
    ax : axes-like object
        Its ``scatter`` method is called; ``text`` is called when annotating.
    colors, alpha, s
        Forwarded to ``ax.scatter`` as ``color``, ``alpha`` and ``s``.
    annotate : bool, default False
        If true, circle and label the median position of every cell type,
        where a type is the first two dot-separated parts of ``CellType``.

    Returns
    -------
    None
    """
    # do scatter plot
    ax.scatter(df.Plot_X, df.Plot_Y, color=colors, s=s, alpha=alpha)

    if not annotate:
        return

    # collapse each label to its first two dot-separated components
    celltypes = np.asarray(
        ['.'.join(cell.split('.')[:2]) for cell in df.index.get_level_values('CellType')],
        dtype=object,
    )

    # Median position per collapsed type. Only the two plotted columns are
    # aggregated, so unrelated non-numeric columns cannot break the median,
    # and the deprecated ``axis`` keyword of ``groupby`` is not used.
    df_cell = df[['Plot_X', 'Plot_Y']].groupby(celltypes).median()

    scatter_params = {'facecolor':'none', 'edgecolor':'black', 's':128, 'linewidths':1}
    text_params = {'ha':'left', 'va':'center', 'fontsize':8, 'weight':'bold'}
    ax.scatter(df_cell.Plot_X, df_cell.Plot_Y, **scatter_params)
    texts = [ax.text(row.Plot_X, row.Plot_Y, cell, **text_params) for cell, row in df_cell.iterrows()]
    # adjust_text is expected to be in scope from elsewhere in the notebook
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'), ax=ax)

    return

def get_positions(df_corr, df_base, knn=(5,25)):
    """Place each row of ``df_corr`` at a robust median of its best-matching base points.

    For every row, the ``knn_high`` columns with the largest values are taken
    as neighbours. For each ``k`` in ``knn_low..knn_high`` the coordinate-wise
    median of the ``k`` best neighbours is computed, and the final position is
    the median of those per-``k`` medians.

    Parameters
    ----------
    df_corr : pandas.DataFrame
        One row per item to place; column ``j`` is matched positionally to
        row ``j`` of ``df_base``.
    df_base : pandas.DataFrame
        Coordinates of the base points; any number of coordinate columns.
    knn : tuple of (int, int), default (5, 25)
        Inclusive ``(knn_low, knn_high)`` range of neighbour counts.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df_corr`` with the columns of ``df_base``.

    Raises
    ------
    ValueError
        If ``knn_low < 1`` or ``knn_high < knn_low``.
    """
    knn_low, knn_high = knn
    if knn_low < 1 or knn_high < knn_low:
        # knn_low == 0 would silently select *all* neighbours via values[-0:]
        raise ValueError('knn must satisfy 1 <= knn_low <= knn_high, got %r' % (knn,))

    knn_counts = range(knn_low, knn_high + 1)
    base_coords = df_base.values  # read once instead of once per row
    # argsort is ascending, so the last knn_high entries are the best matches
    idx = df_corr.values.argsort(axis=1)[:, -knn_high:]
    points = np.zeros((df_corr.shape[0], len(knn_counts), base_coords.shape[1]), dtype=float)

    for row, ind in enumerate(idx):
        values = base_coords[ind, :]
        for col, knn_count in enumerate(knn_counts):
            # the tail of `values` holds the knn_count strongest neighbours
            points[row, col, :] = np.median(values[-knn_count:, :], axis=0)

    positions = np.median(points, axis=1)
    df_position = pd.DataFrame(positions, index=df_corr.index, columns=df_base.columns)
    return df_position
    
def plot_mapping_data(df_corr, df_base, ax, colors, s=16, knn=(5,25)):
    """Draw the mapped items on ``ax`` at the positions from ``get_positions``.

    ``df_corr``, ``df_base`` and ``knn`` are passed straight to
    ``get_positions``; the resulting ``Plot_X``/``Plot_Y`` columns are
    scattered with black edges, ``colors`` as face colour and marker size ``s``.
    Returns ``None``.
    """
    mapped = get_positions(df_corr, df_base, knn=knn)
    marker_style = dict(facecolor=colors, s=s, zorder=1, edgecolor='black')
    ax.scatter(mapped.Plot_X, mapped.Plot_Y, **marker_style)

def plot_mapping(df_corr, df_base, color_dict, title='', knn=(5,25)):
    """Build one figure showing the base embedding with the mapped items on top.

    Colours for both layers are looked up in ``color_dict`` from the
    ``CellType`` index level of the respective frame. The base embedding is
    drawn small and half-transparent; the mapped items are drawn on top via
    ``plot_mapping_data``. Returns the figure produced by ``create_figure``.
    """
    def colors_for(frame):
        # one colour per row, keyed by the row's CellType label
        return frame.index.get_level_values('CellType').map(color_dict)

    background_colors = colors_for(df_base)
    foreground_colors = colors_for(df_corr)

    fig_map, ax = create_figure(title=title)
    plot_embedding_data(df_base, ax, background_colors, alpha=.5, s=1, annotate=False)
    plot_mapping_data(df_corr, df_base, ax, foreground_colors, s=16, knn=knn)
    return fig_map

def plot_embeddings_figure(dfs, titles, colors):
    """Create an 8.5 x 11 figure with up to four embeddings in a 2 x 2 grid.

    Four tick-less axes are always created. Each ``(df, title)`` pair is drawn
    into the next axes via ``plot_embedding_data`` with the shared ``colors``;
    pairing stops at the shortest of ``dfs``, ``titles`` and the four axes.
    Returns the figure.
    """
    fig = plt.figure(figsize=(8.5,11))
    fig.suptitle('Embeddings', fontsize=14)

    panels = []
    for panel_num in range(4):
        row, col = divmod(panel_num, 2)
        panel = fig.add_axes([.15+.4 * col, .62 - .30*row, .33, .255])
        panel.set_xticks([])
        panel.set_yticks([])
        panels.append(panel)

    for df, title, panel in zip(dfs, titles, panels):
        panel.set_title(title, fontsize=12)
        plot_embedding_data(df, panel, colors, alpha=.5, s=1, annotate=False)

    return fig

def plot_all_embeddings(df_corr, df_nbtsne, df_tsne, df_umap, df_fit, color_dict,
                        knn=(5,25), pdfname='Compare_Mappings'
                       ):
    """Write a multi-page PDF comparing the mapping onto four embeddings.

    Page 1 is the overview from ``plot_embeddings_figure``; each following
    page is one ``plot_mapping`` figure, in the order nbt-SNE, t-SNE, UMAP,
    FIt-SNE. The file is written to ``Mapping/Plots/<pdfname>.pdf``.

    Parameters
    ----------
    df_corr : pandas.DataFrame
        Passed to ``plot_mapping``; its columns must carry a ``CellType``
        level, which colours the overview page.
    df_nbtsne, df_tsne, df_umap, df_fit : pandas.DataFrame
        The four base embeddings.
    color_dict : mapping
        Cell type -> colour.
    knn : tuple, default (5, 25)
        Forwarded to ``plot_mapping``.
    pdfname : str, default 'Compare_Mappings'
        File stem of the output PDF.

    Returns
    -------
    None
    """
    titles = ['nbt-SNE', 't-SNE', 'UMAP', 'FIt-SNE']
    dfs = (df_nbtsne, df_tsne, df_umap, df_fit)

    # The context manager closes the PDF even if a plotting call raises.
    with PdfPages('Mapping/Plots/%s.pdf' % pdfname) as pp:
        # NOTE(review): overview colours come from df_corr's *columns*; this
        # assumes they list the base cells in the same order as the embeddings.
        base_colors = df_corr.columns.get_level_values('CellType').map(color_dict)
        fig = plot_embeddings_figure(dfs, titles, base_colors)
        pp.savefig(fig)
        # close the figure that was saved, not whichever one is current
        plt.close(fig)

        for title, df_base in zip(titles, dfs):
            fig_map = plot_mapping(df_corr, df_base, color_dict, title=title, knn=knn)
            pp.savefig(fig_map)
            plt.close(fig_map)

    return

def get_mapped_types(df_map, df_base, gene_title, knn=25, continent=None):
    """Return, per mapped item, the labels of its ``knn`` best-correlated base columns.

    Parameters
    ----------
    df_map, df_base
        Passed unchanged to ``map_to_embeddings.get_df_corr``.
    gene_title : str
        Passed to ``get_df_corr`` as ``filename``.
    knn : int, default 25
        Number of neighbours to keep per row.
    continent : list, optional
        Passed to ``get_df_corr`` as ``continent``; ``None`` (the default) is
        sent as an empty list.

    Returns
    -------
    pandas.DataFrame
        Indexed like the correlation matrix, with columns ``1..knn`` named
        ``NearestNeighbor``. Column 1 is the best match and later columns are
        progressively weaker, so ``iloc[:, :k]`` selects the ``k`` nearest
        neighbours. Values are the ``Continent`` column-level labels of the
        correlation matrix, as strings.
    """
    # avoid sharing one mutable default list across calls
    if continent is None:
        continent = []

    # get correlation matrix
    df_corr = map_to_embeddings.get_df_corr(df_map, df_base, filename=gene_title, continent=continent)

    # argsort is ascending: keep the knn largest, then flip so that the
    # strongest correlation comes first and matches the 1..knn column labels
    idx = df_corr.values.argsort(axis=1)[:, -knn:][:, ::-1]

    # convert to a dataframe of corresponding cell types
    labels = np.asarray(df_corr.columns.get_level_values('Continent').astype(str), dtype=object)
    neighbor_types = labels[idx]
    df_celltypes = pd.DataFrame(neighbor_types, index=df_corr.index)
    df_celltypes.columns = np.arange(df_celltypes.shape[1], dtype=int) + 1
    df_celltypes.columns.name = 'NearestNeighbor'

    return df_celltypes

def get_mapped_celltype(data):
    """Pick a consensus label from dot-separated hierarchical labels by majority vote.

    If one label accounts for more than half of ``data`` it is returned.
    Otherwise every vote is shortened to a coarser level of the hierarchy
    (trailing dot-separated components are dropped) and the vote is repeated.
    Shortening acts on all votes, not on the set of distinct labels, so each
    neighbour keeps its weight when labels are merged into a parent.

    Parameters
    ----------
    data : iterable of str
        Labels such as ``'A.B.1'``; must be non-empty.

    Returns
    -------
    str
        The winning label. An empty string means no level reached a majority.

    Raises
    ------
    ValueError
        If ``data`` is empty (raised by ``argmax`` on an empty array).
    """
    votes = np.asarray(data)

    while True:
        celltypes, counts = np.unique(votes, return_counts=True)
        idx = counts.argmax()
        if counts[idx] / counts.sum() > 0.5:
            return celltypes[idx]

        # Keep at most `longest` components, which strips the last component
        # of the deepest labels. The depth shrinks every pass, and at depth 0
        # all votes become '' (a trivial majority), so the loop terminates.
        longest = max(celltype.count('.') for celltype in votes)
        votes = np.array(['.'.join(celltype.split('.')[:longest]) for celltype in votes])

def get_genes_mapping(df_map, df_base, gene_title, knn=25):
    """Return one consensus label per row of ``df_map`` for a single gene set.

    The neighbour table from ``get_mapped_types`` is reduced row by row with
    ``get_mapped_celltype``; the result is a Series indexed by ``df_map.index``.
    """
    neighbor_table = get_mapped_types(df_map, df_base, gene_title, knn=knn)

    consensus = []
    for _, neighbors in neighbor_table.iterrows():
        consensus.append(get_mapped_celltype(neighbors))

    return pd.Series(consensus, index=df_map.index)

def get_mappings(df_map, df_base, gene_titles, methods, knn=25):
    """Collect the consensus labels of several gene sets into one table.

    ``gene_titles`` and ``methods`` are paired positionally; each pair fills
    the column named after the method with the result of ``get_genes_mapping``.
    Columns without a matching gene title keep their initial empty strings.
    """
    # start from an all-empty table: rows follow df_map, one column per method
    table = pd.DataFrame('', index=df_map.index, columns=methods)

    for title, column in zip(gene_titles, methods):
        table[column] = get_genes_mapping(df_map, df_base, title, knn=knn)

    return table

def get_mapping_distributions(df_mappings):
    """Count how often each label occurs in every column of ``df_mappings``.

    Returns a DataFrame whose index is the sorted set of all labels found
    anywhere in the table and whose columns match ``df_mappings``; labels
    absent from a column are counted as 0.
    """
    labels = np.unique(df_mappings.values)

    tallies = {}
    for method in df_mappings.columns:
        found, freq = np.unique(df_mappings[method], return_counts=True)
        # align this column's counts to the full label set, zero-filling gaps
        tallies[method] = pd.Series(freq, index=found).reindex(labels, fill_value=0)

    return pd.DataFrame(tallies, index=labels, columns=df_mappings.columns)

def get_mapping_neighbors(nearest_neighbors, methods, knn=15):
    """Reduce each method's neighbour table to one consensus label per row.

    Parameters
    ----------
    nearest_neighbors : dict
        Method name -> DataFrame of neighbour labels, one row per item.
    methods : sequence of str
        Column order of the result; the row index is taken from
        ``nearest_neighbors[methods[0]]``.
    knn : int, default 15
        Only the first ``knn`` columns of every neighbour table are used.
        NOTE(review): this assumes the tables are ordered nearest-first;
        confirm against whatever produces ``nearest_neighbors``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed like the first method's table, one column per method,
        filled with the result of ``get_mapped_celltype`` for each row.
    """
    df_map = pd.DataFrame('', index=nearest_neighbors[methods[0]].index, columns=methods)

    for method, df_knn in nearest_neighbors.items():
        df_top = df_knn.iloc[:, :knn]
        labels = [get_mapped_celltype(row) for _, row in df_top.iterrows()]
        # one column-wise write instead of one .loc write per cell
        df_map.loc[df_top.index, method] = labels

    return df_map

def get_multi_mappings(nearest_neighbors, methods, knns=[15]):
    """Run ``get_mapping_neighbors`` once per neighbourhood size.

    Returns a dict keyed by each value of ``knns`` whose values are the
    corresponding consensus tables.
    """
    tables_by_knn = {}
    for knn in knns:
        tables_by_knn[knn] = get_mapping_neighbors(nearest_neighbors, methods, knn=knn)
    return tables_by_knn

In [3]:
%%time

# Load the three input frames for the 'Morph-PV-types' label. Further down,
# df_lin is passed as the data to map and df_harris as the base data set.
# The recorded run of this cell took about 40 s.
df_harris, df_lin, df_nbtsne = map_to_embeddings.get_input_data(label='Morph-PV-types')

CPU times: user 39.3 s, sys: 1.11 s, total: 40.4 s
Wall time: 40.4 s


In [4]:
%%time

# One row per mapping run: (method label, gene-set title, continent filter).
# Keeping the three values side by side prevents the lists drifting out of
# step; the table is transposed into the three lists used further down.
methods, gene_titles, continents = (list(column) for column in zip(*[
    ('Chi_Squared',        'Chi2',               []),
    ('Correlation',        'Correlation',        []),
    ('F_Classif',          'F_Classif',          []),
    ('Kobak_Method',       'Kobak',              []),
    ('Mutual_Info',        'Mutual_Information', []),
    ('Harris_Genes',       'Harris_Genes',       []),
    ('proMMT_Genes',       'proMMT_Genes',       []),
    ('Pvalb_Kobak',        'Pvalb_Kobak',        []),
    ('Pvalb_Kobak_Lin',    'Pvalb_Kobak_Lin',    []),
    # the same four gene sets again, restricted to continents 2 and 3
    ('PV_Harris_Genes',    'Harris_Genes',       [2,3]),
    ('PV_proMMT_Genes',    'proMMT_Genes',       [2,3]),
    ('PV_Pvalb_Kobak',     'Pvalb_Kobak',        [2,3]),
    ('PV_Pvalb_Kobak_Lin', 'Pvalb_Kobak_Lin',    [2,3]),
]))

# neighbourhood sizes evaluated later; neighbours are fetched once at the largest
knns = [5, 10, 15, 20, 25]

# method label -> table of neighbour labels for every row of df_lin
nearest_neighbors = {
    label: get_mapped_types(df_lin, df_harris, title, knn=max(knns), continent=selection)
    for label, title, selection in zip(methods, gene_titles, continents)
}

CPU times: user 5.11 s, sys: 4.98 s, total: 10.1 s
Wall time: 1.86 s


In [5]:
%%time

# consensus label per item and method, one table per neighbourhood size
df_mappings = get_multi_mappings(nearest_neighbors, methods, knns)
# how often each label was assigned, per method, for every neighbourhood size
df_dists = {knn: get_mapping_distributions(df_mappings[knn]) for knn in df_mappings}

CPU times: user 4.33 s, sys: 360 ms, total: 4.69 s
Wall time: 4.11 s


In [6]:
%%time

# Export everything to one workbook: the neighbour tables first, then the
# consensus mappings for every knn, then the label counts for every knn.
with pd.ExcelWriter('Mapping/Excel/Map_to_Cell_Types.xlsx') as writer:
    for method in methods:
        nearest_neighbors[method].to_excel(writer, sheet_name='Neighbors %s' % method)

    per_knn_sheets = (
        ('Cell Mappings (knn = %d)', df_mappings),
        ('Mapping Counts (knn = %d)', df_dists),
    )
    for sheet_template, frames in per_knn_sheets:
        for knn in knns:
            frames[knn].to_excel(writer, sheet_name=sheet_template % knn)

CPU times: user 964 ms, sys: 16.1 ms, total: 980 ms
Wall time: 1.03 s
