In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from adjustText import adjust_text
from matplotlib.backends.backend_pdf import PdfPages

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from Modules import file_navigation
from Modules import rnaseqTools
from Modules import map_to_embeddings
from Modules import read_data

# Global matplotlib settings for every figure in this notebook.
# (plt.rcParams and mpl.rcParams are the same object, so one update suffices.)
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'Arial',
    'axes.linewidth': 0.5,
})

# Colour lookup supplied by the project helper; used below to map CellType
# labels to colours.
color_dict = file_navigation.get_color_dict()

In [2]:
def get_color_dataframe():
    """Load the marker reference table.

    Reads the tab-separated file 'References/marker_ref.txt', taking the first
    row as the header and the first column as the index.

    Returns:
        pandas.DataFrame: the parsed table.
    """
    return pd.read_csv('References/marker_ref.txt', sep='\t', header=0, index_col=0)

def get_dataset(dataset):
    """Read one dataset and return it with cells as rows.

    Args:
        dataset: dataset name forwarded to read_data.read_tpm_data (called
            with log=True).

    Returns:
        pandas.DataFrame: the transposed data frame, whose index is a
        MultiIndex with levels ('Cell', 'CellType') built from the label
        frame's index (cast to str) and its CellType column.
    """
    expression, labels = read_data.read_tpm_data(dataset, log=True)

    # Relabel the columns so each cell carries its cell type alongside its name.
    expression.columns = pd.MultiIndex.from_arrays(
        [labels.index.astype(str), labels.CellType],
        names=('Cell', 'CellType'),
    )

    return expression.T

def get_input_data():
    """Load every data frame used by the mapping cells.

    Returns:
        tuple: (df_harris, df_pvalb, df_lin, df_nbtsne, df_gauwens,
        df_gauwens_pvalb, df_gauwens_umap), in that order.
    """
    def log2_plus_one(frame):
        # log2(1 + x) transform applied to the sub-dataset tables.
        return np.log2(1 + frame)

    # Full datasets (cells as rows).
    df_harris = get_dataset('GSE99888')
    df_gauwens = get_dataset('Gauwens')
    df_lin = get_dataset('Lab_Pvalb')

    # Sub-datasets, transposed and log2(1+x)-transformed here.
    df_pvalb = log2_plus_one(read_data.read_sub_data('Harris_Pvalb', 'GSE99888').T)
    df_gauwens_pvalb = log2_plus_one(read_data.read_sub_data('Gauwens_Pvalb', 'Gauwens').T)

    # Embedding coordinates: element 0 of the default result, element 2 of the
    # Gauwens result.
    df_nbtsne = map_to_embeddings.read_embedding_data()[0]
    df_gauwens_umap = map_to_embeddings.read_embedding_data(dataset='Gauwens')[2]

    # add_continents' return value is ignored, so it presumably annotates the
    # frames in place (TODO confirm against Modules/read_data).
    for frame in (df_harris, df_pvalb, df_nbtsne):
        read_data.add_continents(frame)
    for frame in (df_gauwens, df_gauwens_pvalb, df_gauwens_umap):
        read_data.add_continents(frame, refname='Gauwens_Continents.txt')

    return df_harris, df_pvalb, df_lin, df_nbtsne, df_gauwens, df_gauwens_pvalb, df_gauwens_umap

def create_figure(title=''):
    """Create a letter-sized (8.5 x 11 in) figure with one tick-less axes.

    Args:
        title: figure super-title; only applied when it is a non-empty str.

    Returns:
        tuple: (fig, ax).
    """
    fig = plt.figure(figsize=(8.5, 11))

    wants_title = type(title) is str and title != ''
    if wants_title:
        fig.suptitle(title, fontsize=14)

    # Single axes placed in figure-fraction coordinates, with no tick marks.
    ax = fig.add_axes([.1, .28, .8, .62])
    for set_ticks in (ax.set_xticks, ax.set_yticks):
        set_ticks([])

    return fig, ax

def plot_embedding_data(df, ax, colors, alpha=1, s=8, annotate=False):
    """Scatter an embedding and optionally label each cell-type group.

    Args:
        df: data frame with 'Plot_X' and 'Plot_Y' columns; when annotate is
            True its index must also have a 'CellType' level of dot-separated
            strings.
        ax: matplotlib axes to draw on.
        colors: passed to ax.scatter as ``color``.
        alpha: marker transparency.
        s: marker size.
        annotate: when True, write a bold label at the median position of each
            group, where a group is the first two dot-separated fields of the
            CellType label, and pass the labels to adjust_text.

    Returns:
        None
    """
    # do scatter plot
    ax.scatter(df.Plot_X, df.Plot_Y, color=colors, s=s, alpha=alpha)

    if not annotate:
        return

    # Group label = first two dot-separated fields of each CellType.
    celltypes = np.asarray(
        ['.'.join(cell.split('.')[:2]) for cell in df.index.get_level_values('CellType')],
        dtype=object,
    )

    # Median position per group. Only the two plotted columns are aggregated
    # so that extra (possibly non-numeric) columns cannot break the median,
    # and the deprecated ``axis`` keyword of groupby is not used (rows are the
    # default grouping axis).
    df_cell = df[['Plot_X', 'Plot_Y']].groupby(celltypes).median()

    # Invisible markers at the label anchors (no face, no edge, zero line
    # width) -- presumably so the axes limits account for them.
    scatter_params = {'facecolor':'none', 'edgecolor':'none', 's':1, 'linewidths':0}
    text_params = {'ha':'left', 'va':'center', 'fontsize':8, 'weight':'bold'}
    ax.scatter(df_cell.Plot_X, df_cell.Plot_Y, **scatter_params)
    texts = [ax.text(row.Plot_X, row.Plot_Y, cell, **text_params) for cell, row in df_cell.iterrows()]
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'), ax=ax)

    return

def get_positions(df_corr, df_base, knn=(5,25)):
    """Place each row of df_corr at a median of its nearest neighbours' positions.

    For every k in knn[0]..knn[1] (inclusive) the per-column median of the
    df_base rows belonging to the k highest-valued columns of df_corr is
    taken; the final position is the median of those per-k medians.

    Args:
        df_corr: similarity matrix (rows = items to place, columns = reference
            items, positionally aligned with the rows of df_base).
        df_base: two-column table of reference positions.
        knn: (lowest k, highest k).

    Returns:
        pandas.DataFrame: indexed like df_corr, columns like df_base.
    """
    knn_low, knn_high = knn
    k_values = range(knn_low, knn_high + 1)

    # Column positions of the knn_high largest similarities, ascending per row.
    nearest = np.argsort(df_corr.values, axis=1)[:, -knn_high:]
    coords = df_base.values

    per_k = np.zeros((df_corr.shape[0], knn_high - knn_low + 1, 2), dtype=float)
    for step, k in enumerate(k_values):
        # coords[...] has shape (rows, k, 2); collapse the neighbour axis.
        per_k[:, step, :] = np.median(coords[nearest[:, -k:]], axis=1)

    return pd.DataFrame(
        np.median(per_k, axis=1),
        index=df_corr.index,
        columns=df_base.columns,
    )
    
def plot_mapping_data(df_corr, df_base, ax, df_color, s=16, knn=(5,25)):
    """Scatter the mapped positions, one marker style per cell type.

    Args:
        df_corr: similarity matrix; its index must carry a 'CellType' level
            and its columns must be labels present in df_base's index.
        df_base: reference positions with 'Plot_X' / 'Plot_Y' columns.
        ax: matplotlib axes to draw on.
        df_color: table indexed by cell type with 'Face', 'Edge' and 'Marker'
            columns.
        s: marker size.
        knn: (lowest k, highest k) forwarded to get_positions.

    Returns:
        None
    """
    # Reorder the reference rows to line up with df_corr's columns.
    reference = df_base.loc[df_corr.columns, :]
    mapped = get_positions(df_corr, reference, knn=knn)

    # Cell types are drawn in reverse alphabetical order.
    ordered_types = sorted(set(mapped.index.get_level_values('CellType')), reverse=True)
    for celltype in ordered_types:
        face, edge, marker = df_color.loc[celltype, ['Face', 'Edge', 'Marker']]
        subset = mapped.xs(celltype, level='CellType')
        ax.scatter(
            subset.Plot_X,
            subset.Plot_Y,
            facecolor=face,
            s=s,
            zorder=1,
            edgecolor=edge,
            marker=marker,
        )

def plot_mapping(df_corr, df_base, color_dict, df_color, title='', knn=(5,25)):
    """Draw the reference embedding with the mapped cells on top.

    Args:
        df_corr: similarity matrix forwarded to plot_mapping_data.
        df_base: reference embedding; its index must carry a 'CellType' level.
        color_dict: mapping from CellType label to colour for the reference.
        df_color: marker style table forwarded to plot_mapping_data.
        title: figure title forwarded to create_figure.
        knn: (lowest k, highest k) forwarded to plot_mapping_data.

    Returns:
        The figure created by create_figure.
    """
    fig_map, ax = create_figure(title=title)

    # Faint, small, annotated background layer: the reference embedding.
    reference_colors = df_base.index.get_level_values('CellType').map(color_dict)
    plot_embedding_data(df_base, ax, reference_colors, alpha=.25, s=1, annotate=True)

    # Foreground layer: the mapped cells.
    plot_mapping_data(df_corr, df_base, ax, df_color, s=16, knn=knn)

    return fig_map

def plot_embeddings_figure(dfs, titles, colors):
    """Draw up to four embeddings on a 2 x 2 grid of one letter-sized figure.

    Args:
        dfs: embedding data frames, one per panel.
        titles: panel titles, paired with dfs by position.
        colors: point colours shared by all panels.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure(figsize=(8.5,11))
    fig.suptitle('Embeddings', fontsize=14)

    # Four tick-less panels laid out row-major (left to right, top to bottom).
    axes = []
    for panel in range(4):
        row, col = divmod(panel, 2)
        ax = fig.add_axes([.15+.4 * col, .62 - .30*row, .33, .255])
        ax.set_xticks([])
        ax.set_yticks([])
        axes.append(ax)

    # zip stops at the shortest input; unused panels remain empty.
    for panel_df, panel_title, panel_ax in zip(dfs, titles, axes):
        panel_ax.set_title(panel_title, fontsize=12)
        plot_embedding_data(panel_df, panel_ax, colors, alpha=.5, s=1, annotate=True)

    return fig

def plot_all_embeddings(df_corr, df_nbtsne, df_tsne, df_umap, df_fit, color_dict, df_color,
                        knn=(5,25), pdfname='Compare_Mappings'
                       ):
    """Write a multi-page PDF comparing the mapping on four embeddings.

    Page 1 shows the four embeddings side by side; each following page shows
    the mapped cells drawn over one embedding.

    Args:
        df_corr: similarity matrix forwarded to plot_mapping.
        df_nbtsne, df_tsne, df_umap, df_fit: the four embedding data frames.
            df_nbtsne's 'CellType' index level supplies the colours used on
            the overview page.
        color_dict: mapping from CellType label to colour.
        df_color: marker style table forwarded to plot_mapping.
        knn: (lowest k, highest k) forwarded to plot_mapping.
        pdfname: file stem; output goes to 'Mapping/Plots/<pdfname>.pdf'.

    Returns:
        None
    """
    titles = ['nbt-SNE', 't-SNE', 'UMAP', 'FIt-SNE']
    dfs = (df_nbtsne, df_tsne, df_umap, df_fit)

    base_colors = df_nbtsne.index.get_level_values('CellType').map(color_dict)

    # The context manager guarantees the PDF is finalised and the file handle
    # released even if one of the plotting calls raises.
    with PdfPages('Mapping/Plots/%s.pdf' % pdfname) as pp:
        fig = plot_embeddings_figure(dfs, titles, base_colors)
        pp.savefig(fig)
        # Close the specific figure rather than whatever is "current".
        plt.close(fig)

        for title, df_base in zip(titles, dfs):
            fig_map = plot_mapping(df_corr, df_base, color_dict, df_color, title=title, knn=knn)
            pp.savefig(fig_map)
            plt.close(fig_map)

    return

def get_mapped_types(df_corr, knn=25):
    """Look up the 'Continent' label of each row's highest-valued columns.

    Args:
        df_corr: similarity matrix whose columns carry a 'Continent' level.
        knn: number of top columns to keep per row.

    Returns:
        pandas.DataFrame: indexed like df_corr, with columns 1..k named
        'NearestNeighbor'. Within a row the labels follow ascending
        similarity, so the last column is the best match.
    """
    # Column positions of the knn largest values per row, in ascending order.
    top_columns = np.argsort(df_corr.values, axis=1)[:, -knn:]

    continents = df_corr.columns.get_level_values('Continent').astype(str).values
    neighbor_labels = continents[top_columns]

    neighbor_numbers = pd.Index(
        np.arange(neighbor_labels.shape[1], dtype=int) + 1,
        name='NearestNeighbor',
    )
    return pd.DataFrame(neighbor_labels, index=df_corr.index, columns=neighbor_numbers)

def get_mapped_categories(df_knn):
    """Assign each row the label held by a strict majority of its neighbours.

    Args:
        df_knn: one row per cell, one column per nearest neighbour, values are
            the neighbours' labels.

    Returns:
        pandas.Series: indexed like df_knn. Each entry is the most frequent
        label of that row when it makes up more than half of the row's
        entries, otherwise the empty string.
    """
    categories = pd.Series('', index=df_knn.index)

    # Rows are addressed by position, not by label: label-based assignment
    # would overwrite every row sharing a duplicated index label and would
    # treat integer labels ambiguously.
    for position, mapping in enumerate(df_knn.to_numpy()):
        labels, counts = np.unique(mapping, return_counts=True)
        if counts.size == 0:
            # No neighbours for this row: leave it unassigned.
            continue
        best = counts.argmax()
        if counts[best] / counts.sum() > .5:
            categories.iloc[position] = labels[best]

    return categories

In [3]:
%%time

df_color = get_color_dataframe()
df_harris, df_pvalb, df_lin, df_nbtsne, df_gauwens, df_gauwens_pvalb, df_gauwens_umap = get_input_data()

CPU times: user 1min 24s, sys: 1.44 s, total: 1min 25s
Wall time: 1min 25s


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [4]:
%%time

methods = ['Correlation', 
           'F_Classif',
           'Chi2',
           'Mutual_Information',
           'Kobak'
          ]

starts = ['',
          'PV_',
          'CAM_',
          'CAM_PV_'
         ]
df_bases = [df_harris,
            df_pvalb,
            df_harris,
            df_pvalb
           ]

df_celltypes = pd.DataFrame('', index=df_lin.index, columns=[])

with pd.ExcelWriter('Mapping/Excel/Nearest_Neighbors.xlsx') as writer:
    for method in methods:
        for start, df_base in zip(starts, df_bases):
            gene_title = '%s%s' % (start, method)
            df_corr = map_to_embeddings.get_df_corr(df_lin, df_base, filename=gene_title)
            df_knn = get_mapped_types(df_corr, knn=25)
            df_knn.to_excel(writer, sheet_name=gene_title)

            df_map = map_to_embeddings.get_positions(df_corr, df_nbtsne, knn=(5,25))
            df_map.to_csv('Mapping/MappingPositions/%s.tsv' % gene_title, sep='\t')
            
            df_celltypes[gene_title] = get_mapped_categories(df_knn)
    
    starts = ['',
              'PV_'
             ]
    df_bases = [df_harris,
                df_pvalb,
                df_harris,
                df_pvalb
               ]
    
    for method in ['Harris_Genes', 'proMMT_Genes']:
        for start, df_base in zip(starts, df_bases):
            gene_title = '%s%s' % (start, method)
            df_corr = map_to_embeddings.get_df_corr(df_lin, df_base, filename=gene_title)
            df_knn = get_mapped_types(df_corr, knn=25)
            df_knn.to_excel(writer, sheet_name=gene_title)

            df_map = map_to_embeddings.get_positions(df_corr, df_nbtsne, knn=(5,25))
            df_map.to_csv('Mapping/MappingPositions/%s.tsv' % gene_title, sep='\t')
            
            df_celltypes[gene_title] = get_mapped_categories(df_knn)
            
    df_celltypes.to_excel(writer, sheet_name='Mapped Types')
    df_distribution = df_celltypes.apply(pd.Series.value_counts).fillna(0).astype(int).T
    df_distribution.to_excel(writer, sheet_name='Mapping Distribution')

CPU times: user 16.6 s, sys: 12.6 s, total: 29.3 s
Wall time: 8.81 s


In [5]:
%%time

methods = ['Correlation', 
           'F_Classif',
           'Chi2',
           'Mutual_Information',
           'Kobak'
          ]

starts = ['',
          'PV_',
          'CAM_',
          'CAM_PV_'
         ]
df_bases = [df_gauwens,
            df_gauwens_pvalb,
            df_gauwens,
            df_gauwens_pvalb
           ]

df_celltypes = pd.DataFrame('', index=df_lin.index, columns=[])

with pd.ExcelWriter('Mapping/Excel/Nearest_Neighbors_Gauwens.xlsx') as writer:
    for method in methods:
        for start, df_base in zip(starts, df_bases):
            gene_title = '%s%s' % (start, method)
            df_corr = map_to_embeddings.get_df_corr(df_lin, df_base, filename=gene_title)
            df_knn = get_mapped_types(df_corr, knn=25)
            df_knn.to_excel(writer, sheet_name=gene_title)

            df_map = map_to_embeddings.get_positions(df_corr, df_gauwens_umap, knn=(5,25))
            df_map.to_csv('Mapping/MappingPositions/%s_Gauwens.tsv' % gene_title, sep='\t')
            
            df_celltypes[gene_title] = get_mapped_categories(df_knn)
    
    starts = ['',
              'PV_'
             ]
    df_bases = [df_gauwens,
                df_gauwens_pvalb,
                df_gauwens,
                df_gauwens_pvalb
               ]
    
    for method in ['Harris_Genes', 'proMMT_Genes']:
        for start, df_base in zip(starts, df_bases):
            gene_title = '%s%s' % (start, method)
            df_corr = map_to_embeddings.get_df_corr(df_lin, df_base, filename=gene_title)
            df_knn = get_mapped_types(df_corr, knn=25)
            df_knn.to_excel(writer, sheet_name=gene_title)

            df_map = map_to_embeddings.get_positions(df_corr, df_gauwens_umap, knn=(5,25))
            df_map.to_csv('Mapping/MappingPositions/%s_Gauwens.tsv' % gene_title, sep='\t')
            
            df_celltypes[gene_title] = get_mapped_categories(df_knn)
            
    df_celltypes.to_excel(writer, sheet_name='Mapped Types')
    df_distribution = df_celltypes.apply(pd.Series.value_counts).fillna(0).astype(int).T
    df_distribution.to_excel(writer, sheet_name='Mapping Distribution')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_nested_tuple(tup)


CPU times: user 17.6 s, sys: 13.4 s, total: 31 s
Wall time: 8.88 s
