In [1]:
import umap
import glob
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import Modules.rnaseqTools as rnaT
from sklearn.decomposition import PCA
from matplotlib.backends.backend_pdf import PdfPages
from scipy.cluster.hierarchy import dendrogram, linkage

# Figure export settings: fonttype 42 for PDF/PS output, Arial as the font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'Arial',
})
# Thin axes spines and short, thin major ticks.
plt.rcParams.update({
    'axes.linewidth': 0.5,
    'xtick.major.size': 2,
    'ytick.major.size': 2,
    'xtick.major.width': .5,
    'ytick.major.width': .5,
})

In [2]:
def get_pvalb(tpm_fname='Datasets/Lab_Pvalb-tpm.tsv',
              labels_fname='Datasets/Lab_Pvalb-transcriptional_labels.tsv',
              min_age=20):
    """Load the Pvalb expression table and its per-cell label table.

    Both files are read as tab-separated tables whose first row is the header
    and whose first column is the index.

    Parameters
    ----------
    tpm_fname : str
        Path of the expression table. Its columns must include every index
        entry of the label table that survives the age filter.
    labels_fname : str
        Path of the label table; it must contain an ``Age`` column.
    min_age : number
        Only rows of the label table with ``Age`` strictly greater than this
        value are kept.

    Returns
    -------
    df : pandas.DataFrame
        ``log2(1 + value)`` of the expression table, restricted to the kept
        cells (columns, in label-table order) and to rows with at least one
        value above zero among those cells.
    df_labels : pandas.DataFrame
        The age-filtered label table.
    """
    read_kwargs = {'sep': '\t', 'header': 0, 'index_col': 0}
    df = pd.read_csv(tpm_fname, **read_kwargs)
    df_labels = pd.read_csv(labels_fname, **read_kwargs)

    # Keep only the cells that pass the age cutoff, in label-table order.
    df_labels = df_labels.loc[df_labels.Age > min_age]
    df = df.loc[:, df_labels.index]

    # Drop rows that are zero in every remaining cell, then log-transform.
    df = df.loc[(df.values > 0).sum(axis=1) > 0, :]
    df = np.log2(1 + df)

    return df, df_labels

def trim_data(df, n=500):
    """Keep well-expressed rows of ``df`` and return the result transposed.

    A row is kept when more than 4 of its values exceed ``log2(6)``. The
    returned frame has the original columns as rows and the surviving rows
    as columns.

    ``n`` is accepted for call compatibility but is not used.
    """
    expression_cutoff = np.log2(6)
    cells_above_cutoff = (df.values > expression_cutoff).sum(axis=1)
    keep_row = cells_above_cutoff > 4
    kept = df.loc[keep_row, :]
    return kept.T

def get_fitted_data(df, number, random_state=42):
    """Select ``number`` features by RFE and embed the samples in 2-D.

    A linear SVC is wrapped in recursive feature elimination (step of 10) to
    pick the columns of ``df`` that best separate the classes found in the
    ``'Label'`` level of ``df.index``. The selected columns, with a small
    jitter added, are then embedded with PCA and with UMAP.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples in rows, features in columns; the index must have a level
        named ``'Label'``.
    number : int
        Number of features RFE is asked to select.
    random_state : int or None
        Seed used for the jitter, PCA and UMAP so that repeated runs give the
        same embeddings. Pass ``None`` for unseeded jitter and PCA.

    Returns
    -------
    genes : list
        Column names of ``df`` ranked 1 by RFE.
    pcalist : numpy.ndarray
        Two-component PCA embedding of the jittered selected columns.
    umaplist : numpy.ndarray
        Two-component UMAP embedding of the same matrix.
    """
    X = df.values
    Y = df.index.get_level_values('Label').values
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=number, step=10)
    rfe = rfe.fit(X, Y)
    genes = df.columns[rfe.ranking_ == 1].values.tolist()

    # Jitter from a dedicated seeded generator rather than the global
    # np.random state, so the embeddings are reproducible. The addition makes
    # a new array, so X is never modified.
    rng = np.random.RandomState(random_state)
    selected = rfe.transform(X)
    plotlist = selected + .01 * rng.rand(*selected.shape)

    # PCA may pick a randomized solver for larger inputs, so it is seeded too.
    pcalist = PCA(n_components=2, random_state=random_state).fit_transform(plotlist)
    reducer = umap.UMAP(n_components=2, n_neighbors=7, min_dist=.1,
                        random_state=random_state, metric='euclidean')
    umaplist = reducer.fit_transform(plotlist)

    return genes, pcalist, umaplist

def generate_gene_data(df, numbers=(2, 5, 10, 20, 50, 100, 200, 500, 1000)):
    """Run ``get_fitted_data`` for several feature counts and tabulate results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed unchanged to ``get_fitted_data``; its index is reused as
        the index of the two embedding tables.
    numbers : sequence of int
        Feature counts to evaluate. Must be non-empty.

    Returns
    -------
    df_gene : pandas.DataFrame
        One column per entry of ``numbers``; the leading rows of each column
        hold the selected gene names and the remaining rows are ``''``.
        The index is ``0 .. max(numbers) - 1`` and is named 'Gene Number'.
    df_pca : pandas.DataFrame
        Columns ``PCA_<number>_X`` / ``PCA_<number>_Y`` for every number.
    df_umap : pandas.DataFrame
        Columns ``UMAP_<number>_X`` / ``UMAP_<number>_Y`` for every number.
    """
    numbers = list(numbers)

    pca_columns = ['PCA_%d_%s' % (number, ending) for number in numbers for ending in ('X', 'Y')]
    umap_columns = ['UMAP_%d_%s' % (number, ending) for number in numbers for ending in ('X', 'Y')]
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_pca = pd.DataFrame(np.nan, index=df.index, columns=pca_columns)
    df_umap = pd.DataFrame(np.nan, index=df.index, columns=umap_columns)
    df_gene = pd.DataFrame('', index=np.arange(max(numbers)), columns=numbers)
    df_gene.index.name = 'Gene Number'

    for number in numbers:
        genes, pcalist, umaplist = get_fitted_data(df, number)
        genes = list(genes)
        # Fewer than `number` genes can come back when df has fewer columns
        # than requested, so size the target rows by what was returned.
        if genes:
            df_gene.loc[np.arange(len(genes)), number] = genes
        df_pca.loc[:, ['PCA_%d_X' % number, 'PCA_%d_Y' % number]] = pcalist
        df_umap.loc[:, ['UMAP_%d_X' % number, 'UMAP_%d_Y' % number]] = umaplist

    return df_gene, df_pca, df_umap

def generate_label_data(df, df_labels, label, title):
    """Compute key genes and embeddings for one label column and save them.

    The column ``label`` of ``df_labels`` supplies the class of every cell.
    ``df`` is trimmed with ``trim_data``, re-indexed by (Cell, Label), passed
    to ``generate_gene_data``, and the three resulting tables are written as
    sheets of ``Mapping/Excel/<title>.xlsx``. Returns ``None``.
    """
    labels = df_labels[label]
    print(label, set(labels))

    # Work on a copy so that replacing the index cannot touch the caller's data.
    trimmed = trim_data(df, n=5000).copy()
    trimmed.index = pd.MultiIndex.from_arrays(
        [labels.index, labels.values],
        names=('Cell', 'Label'),
    )

    gene_table, pca_table, umap_table = generate_gene_data(trimmed)

    sheets = (
        ('Key Genes', gene_table),
        ('PCA Embedding', pca_table),
        ('UMAP Embedding', umap_table),
    )
    with pd.ExcelWriter('Mapping/Excel/%s.xlsx' % title) as writer:
        for sheet_name, table in sheets:
            table.to_excel(writer, sheet_name=sheet_name)

def add_legend(fig, labels, df_marker):
    """Add a tick-less axes to ``fig`` that carries one legend entry per label.

    Each distinct value in ``labels`` (taken in sorted order) gets an empty
    scatter whose style comes from the 'Edge', 'Face' and 'Marker' columns of
    the matching row of ``df_marker``. The legend is drawn without a frame,
    centred, in up to 7 columns. Returns ``None``.
    """
    legend_ax = fig.add_axes([.2, .25, .6, .02])
    legend_ax.set_xticks([])
    legend_ax.set_yticks([])

    style_columns = ['Edge', 'Face', 'Marker']
    for name in sorted(set(labels)):
        edge_color, face_color, marker_shape = df_marker.loc[name, style_columns]
        # No data points: the scatter exists only to create a legend handle.
        legend_ax.scatter(
            [], [],
            facecolor=face_color,
            edgecolor=edge_color,
            marker=marker_shape,
            s=16,
            label=name,
            linewidth=.5,
        )

    legend_ax.legend(
        scatterpoints=1,
        frameon=False,
        labelspacing=0.2,
        ncol=7,
        fontsize=7,
        loc='center',
    )

def plot_generated_mapping(title, df_marker, pp, use_labels=None):
    """Plot the saved PCA and UMAP embeddings of one workbook, one page each.

    Reads the 'PCA Embedding' and 'UMAP Embedding' sheets of
    ``Mapping/Excel/<title>.xlsx``. For each sheet a letter-size figure is
    built with one small panel per consecutive (X, Y) column pair, laid out
    three panels per row, and saved through ``pp.savefig``.

    Parameters
    ----------
    title : str
        Workbook stem and figure title.
    df_marker : pandas.DataFrame
        Indexed by label, with 'Edge', 'Face' and 'Marker' columns giving the
        scatter style of each label. Other columns are ignored.
    pp : object
        Anything with a ``savefig(fig)`` method.
    use_labels : pandas.Series, optional
        If given and non-empty, the label of each cell is looked up here by
        the sheet's 'Cell' index level instead of using its 'Label' level.
        ``None`` or an empty sequence keeps the labels stored in the sheet.
    """
    marker_columns = ['Edge', 'Face', 'Marker']
    has_override = use_labels is not None and len(use_labels) > 0

    fname = 'Mapping/Excel/%s.xlsx' % title
    for sheet_name in ('PCA Embedding', 'UMAP Embedding'):
        df = pd.read_excel(fname, sheet_name=sheet_name, index_col=[0, 1], header=0)
        if has_override:
            labels = use_labels.loc[df.index.get_level_values('Cell')].values
        else:
            labels = df.index.get_level_values('Label')
        fig = plt.figure(figsize=(8.5, 11))

        for ind in range(0, df.shape[1], 2):
            # Columns come in (X, Y) pairs; three panels per row.
            row, col = divmod(ind // 2, 3)
            ax = fig.add_axes([.1 + .315 * col, .7 - .2 * row, .22, .17])
            # Strip the trailing '_X' to get the panel title.
            col_title = df.columns[ind][:-2]
            ax.set_title(col_title, fontsize=10)
            ax.set_xticks([]), ax.set_yticks([])
            plotlist = df.iloc[:, [ind, ind + 1]].values

            # Sorted order matches add_legend and keeps the draw order of
            # overlapping points stable between runs.
            for label in sorted(set(labels)):
                # Select the style columns by name, as add_legend does, so
                # column order or extra columns in df_marker cannot matter.
                edge, face, marker = df_marker.loc[label, marker_columns]
                inds = labels == label
                ax.scatter(plotlist[inds, 0],
                           plotlist[inds, 1],
                           edgecolor=edge,
                           facecolor=face,
                           marker=marker,
                           s=16,
                           linewidth=.5
                          )

        fig.suptitle(title, fontsize=14, y=.93)
        add_legend(fig, labels, df_marker)
        pp.savefig(fig)
        # Close this specific figure rather than whichever one is current.
        plt.close(fig)

    return

In [3]:
%%time

# Load the log2(1 + value) expression table and the age-filtered label table.
df, df_labels = get_pvalb()

CPU times: user 363 ms, sys: 39 ms, total: 402 ms
Wall time: 404 ms


In [4]:
%%time

# Each label column of df_labels is paired with the workbook title it is saved under.
labels, titles = map(list, zip(
    ('Morph-PV-types', 'Morph'),
    ('MorphMarker-PV-types', 'MorphMarker'),
    ('MorphDirectional-PV-types', 'MorphDirectional'),
))
# One workbook (key genes + PCA + UMAP sheets) is written per pair.
for label, title in zip(labels, titles):
    generate_label_data(df, df_labels, label, title)

Morph-PV-types {'vBIC', 'vAAC', 'hBIC', 'vBC', 'hBC'}


Compilation is falling back to object mode WITH looplifting enabled because Function "fuzzy_simplicial_set" failed type inference due to: Untyped global name 'nearest_neighbors': cannot determine Numba type of <class 'function'>

File "../../../../../../anaconda3/envs/analyze/lib/python3.6/site-packages/umap/umap_.py", line 467:
def fuzzy_simplicial_set(
    <source elided>
    if knn_indices is None or knn_dists is None:
        knn_indices, knn_dists, _ = nearest_neighbors(
        ^

  @numba.jit()

File "../../../../../../anaconda3/envs/analyze/lib/python3.6/site-packages/umap/umap_.py", line 350:
@numba.jit()
def fuzzy_simplicial_set(
^

  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "../../../../../../anaconda3/envs

MorphMarker-PV-types {'AAC', 'BC', 'BIC'}
MorphDirectional-PV-types {'Horizontal', 'Vertical'}
CPU times: user 2h 45min 35s, sys: 3h 18min 48s, total: 6h 4min 24s
Wall time: 34min 17s


In [6]:
# Collect the 'Key Genes' sheet of every per-title workbook into a single
# workbook, one sheet per title.
titles = ['Morph', 'MorphMarker', 'MorphDirectional']
base_name = 'Mapping/Excel/%s.xlsx'

with pd.ExcelWriter('Mapping/Excel/Separating Genes.xlsx') as writer:
    for title in titles:
        # First column of the sheet is the index, first row is the header.
        df_excel = pd.read_excel(base_name % title, sheet_name = 'Key Genes', index_col=0, header=0)
        df_excel.to_excel(writer, sheet_name=title)