In [1]:
import sys
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap
from fast_tsne import fast_tsne

from Modules import rnaseqTools
from Modules import file_navigation
from Modules import read_data

In [2]:
def read_tpm_data(dataset):
    """Load the TPM table stored at ``Datasets/<dataset>-tpm.tsv``.

    The file is parsed as tab-separated text. Its first row provides the
    column labels and its first column provides the row index.

    Parameters
    ----------
    dataset : str
        Dataset name inserted into the file name.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    path = 'Datasets/%s-tpm.tsv' % dataset
    return pd.read_csv(path, sep='\t', header=0, index_col=0)

def get_lin_data():
    """Return the Lab_Pvalb TPM table restricted to a labelled subset of cells.

    Labels are read from ``Datasets/Lab_Pvalb-labels.tsv``. A label row is
    kept when its ``Age`` is greater than 20 and its ``CellType`` is one of
    vAAC, vBC, vBIC, hBC or hBIC. The TPM columns named by the surviving
    label index are selected and the result is transposed, so the returned
    frame has one row per selected label.

    Returns
    -------
    pandas.DataFrame
        Transposed TPM sub-table.
    """
    kept_types = ('vAAC', 'vBC', 'vBIC', 'hBC', 'hBIC')

    tpm = read_tpm_data('Lab_Pvalb')
    labels = pd.read_csv('Datasets/Lab_Pvalb-labels.tsv', sep='\t', header=0, index_col=0)

    # Both criteria must hold; rows with a missing Age compare False and drop out.
    keep = (labels.Age > 20) & labels.CellType.isin(kept_types)
    selected = labels.loc[keep]

    return tpm.loc[:, selected.index].T

def get_olm_data():
    """Return the Lab_OLM TPM table restricted to ``Sst-OLM`` cells.

    Labels are read from ``Datasets/Lab_OLM-labels.tsv``; only rows whose
    ``CellType`` equals ``'Sst-OLM'`` are kept. The TPM columns named by the
    surviving label index are selected and the result is transposed.

    Returns
    -------
    pandas.DataFrame
        Transposed TPM sub-table, one row per selected label.
    """
    tpm = read_tpm_data('Lab_OLM')

    labels = pd.read_csv('Datasets/Lab_OLM-labels.tsv', sep='\t', header=0, index_col=0)
    is_olm = labels.CellType == 'Sst-OLM'
    olm_cells = labels.loc[is_olm].index

    return tpm.loc[:, olm_cells].T

def get_lab_data():
    """Load the Pvalb-lineage table and a combined Pvalb + OLM table.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_lin, df_lab)`` where ``df_lin`` is the result of
        ``get_lin_data()`` and ``df_lab`` is that same table with the rows of
        ``get_olm_data()`` appended beneath it.
    """
    lin = get_lin_data()
    olm = get_olm_data()

    # Row-wise concatenation: OLM rows are stacked after the lineage rows.
    combined = pd.concat((lin, olm), axis=0)
    return lin, combined

def get_color_dataframe():
    """Read ``References/marker_ref.txt`` as a tab-separated table.

    The first row is used as the header and the first column as the index.

    Returns
    -------
    pandas.DataFrame
        The parsed reference table.
    """
    return pd.read_csv('References/marker_ref.txt', sep='\t', header=0, index_col=0)

def read_nbtsne_data():
    """Load the precomputed embedding in ``Matlab/Lab_Pvalb_144_144_nbtsne.tsv``.

    The first line of the file is skipped and the next two lines are read as
    a two-level column header. The table is then transposed, so each original
    column becomes a row, and the two resulting columns are named ``Plot_X``
    and ``Plot_Y``. Only the first header level is kept as the row index,
    which is named ``Cell``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``Cell`` with columns ``Plot_X`` and ``Plot_Y``.
    """
    raw = pd.read_csv('Matlab/Lab_Pvalb_144_144_nbtsne.tsv',
                      sep='\t', header=[0, 1], index_col=None, skiprows=1)

    coords = raw.T
    coords.columns = ['Plot_X', 'Plot_Y']

    # Drop the second header level and label what remains.
    first_level = coords.index.get_level_values(0)
    coords.index = first_level.rename('Cell')

    return coords

def get_data():
    """Collect the inputs needed to build the embeddings.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_lin, df_lab, df_nbtsne)``: the two tables returned by
        ``get_lab_data()`` followed by the table returned by
        ``read_nbtsne_data()``.
    """
    # Only these two loaders contribute to the return value; no other file is
    # read here, so a missing auxiliary label file cannot break the pipeline.
    df_lin, df_lab = get_lab_data()
    df_nbtsne = read_nbtsne_data()

    return df_lin, df_lab, df_nbtsne

def do_feature_selection(df, filename='Lin_Kobak'):
    """Keep only the columns of ``df`` listed in a marker-gene file.

    The gene list is read from ``Mapping/MarkerGenes/<filename>.txt``. The
    first whitespace-delimited token of each line is taken as a gene name;
    blank or whitespace-only lines are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose column labels are compared against the gene list.
    filename : str, optional
        Stem of the gene-list file inside ``Mapping/MarkerGenes``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` restricted to the listed columns, in ``df``'s
        original column order.
    """
    fname = 'Mapping/MarkerGenes/%s.txt' % filename
    with open(fname) as f:
        # str.split() yields [] for an empty line; skipping those avoids an
        # IndexError on trailing or separating blank lines.
        importantGenes = {fields[0] for fields in map(str.split, f) if fields}

    return df.loc[:, df.columns.isin(importantGenes)].copy()

def do_preliminary_reduction(df, normalize=False, feature_selection=True, filename='Lin_Kobak', n_components=50):
    """Log-transform ``df``, optionally scale and filter it, then reduce with PCA.

    Steps, in order:

    1. ``log2(1 + x)`` is applied to every value.
    2. If ``normalize`` is true, the values are passed through
       ``StandardScaler``.
    3. If ``feature_selection`` is true, columns are filtered with
       ``do_feature_selection(df, filename=filename)``.
    4. PCA projects the rows onto ``n_components`` components.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; rows are the samples passed to PCA.
    normalize : bool, optional
        Whether to standardise after the log transform.
    feature_selection : bool, optional
        Whether to filter columns using a marker-gene file.
    filename : str, optional
        Marker-gene file stem forwarded to ``do_feature_selection``.
    n_components : int, optional
        Number of principal components to keep (default 50, the previously
        hard-coded value).

    Returns
    -------
    pandas.DataFrame
        PCA scores indexed like ``df``, with integer column labels.
    """
    df = np.log2(1+df)
    if normalize:
        scaled = StandardScaler().fit_transform(df.values)
        df = pd.DataFrame(scaled, index=df.index, columns=df.columns)
    if feature_selection:
        df = do_feature_selection(df, filename=filename)

    # Seeded like the other reducers in this notebook so that a randomized
    # SVD solver, if scikit-learn selects one, gives repeatable scores.
    pca = PCA(n_components=n_components, random_state=42)
    scores = pca.fit_transform(df.values)
    df_pca = pd.DataFrame(scores, index=df.index)

    return df_pca

def perform_embeddings(df, perplexity=10, normalize=False, trim=True, feature_selection=True, filename='Lin_Kobak'):
    """Compute 2-D PCA, t-SNE, FIt-SNE and UMAP embeddings of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; each row receives one coordinate pair per method.
    perplexity : int, optional
        Perplexity for t-SNE and FIt-SNE; also used as ``n_neighbors`` for
        UMAP.
    normalize : bool, optional
        Whether to standardise the data. With ``trim=True`` this is forwarded
        to ``do_preliminary_reduction``; otherwise ``StandardScaler`` is
        applied here.
    trim : bool, optional
        If true, ``df`` is first passed through ``do_preliminary_reduction``;
        if false, ``df`` is embedded as given (after optional scaling).
    feature_selection : bool, optional
        Forwarded to ``do_preliminary_reduction`` when ``trim`` is true.
    filename : str, optional
        Forwarded to ``do_preliminary_reduction`` when ``trim`` is true.

    Returns
    -------
    pandas.DataFrame
        Indexed like the reduced input, with columns ``PCA_X``, ``PCA_Y``,
        ``t-SNE_X``, ``t-SNE_Y``, ``FIt-SNE_X``, ``FIt-SNE_Y``, ``UMAP_X``,
        ``UMAP_Y``.
    """
    # One seed shared by every stochastic reducer below.
    seed = 42

    if trim:
        df_pca = do_preliminary_reduction(df,
                                          normalize=normalize,
                                          feature_selection=feature_selection,
                                          filename=filename
                                         )
    else:
        df_pca = df
        if normalize:
            scaled = StandardScaler().fit_transform(df_pca.values)
            df_pca = pd.DataFrame(scaled, index=df_pca.index, columns=df_pca.columns)

    columns = ['PCA_X', 'PCA_Y',
               't-SNE_X', 't-SNE_Y',
               'FIt-SNE_X', 'FIt-SNE_Y',
               'UMAP_X', 'UMAP_Y']
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_embedding = pd.DataFrame(np.nan, index=df_pca.index, columns=columns)

    # PCA is seeded too, so all four embeddings are reproducible.
    pca = PCA(n_components=2, random_state=seed)
    df_embedding[['PCA_X', 'PCA_Y']] = pca.fit_transform(df_pca)

    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=seed)
    df_embedding[['t-SNE_X', 't-SNE_Y']] = tsne.fit_transform(df_pca)

    reducer = umap.UMAP(n_components=2, n_neighbors=perplexity, min_dist=.01, random_state=seed, metric='euclidean')
    df_embedding[['UMAP_X', 'UMAP_Y']] = reducer.fit_transform(df_pca)

    # The FIt-SNE learning rate is set to (number of rows) / 12.
    fitsne_coords = fast_tsne(df_pca.values, perplexity = perplexity,
                              learning_rate = df_pca.shape[0]/12, knn_algo='vp-tree', seed=seed)
    df_embedding[['FIt-SNE_X', 'FIt-SNE_Y']] = fitsne_coords

    return df_embedding

def get_kobak_genes(df, n=100, threshold=32):
    """Restrict ``df`` to the columns chosen by ``rnaseqTools.geneSelection``.

    The raw values of ``df`` are handed to ``rnaseqTools.geneSelection``
    together with ``n`` and ``threshold`` (plotting disabled), and the
    selector it returns is used to pick columns of ``df``.

    NOTE(review): the exact form of the selector (boolean mask vs. labels) is
    defined by the project helper and is not visible here -- confirm against
    ``Modules/rnaseqTools``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are candidates for selection.
    n : int, optional
        Forwarded to ``geneSelection`` as ``n``.
    threshold : int, optional
        Forwarded to ``geneSelection`` as ``threshold``.

    Returns
    -------
    pandas.DataFrame
        ``df`` limited to the selected columns.
    """
    selector = rnaseqTools.geneSelection(df.values, n=n, threshold=threshold, plot=False)
    return df.loc[:, selector]

def get_cam_data(df_tpm, fname='/home/soma/Documents/Newest/analysisfiles/CAMmouse.txt'):
    """Restrict ``df_tpm`` to genes listed in a gene file, then to 100 selected genes.

    Parameters
    ----------
    df_tpm : pandas.DataFrame
        Table whose column labels are compared against the gene list.
    fname : str, optional
        Path of the gene-list file. The first whitespace-delimited token of
        each line is used as a gene name; blank lines are ignored. The
        default is the original author's absolute path and will not exist on
        other machines -- pass an explicit path there.

    Returns
    -------
    pandas.DataFrame
        Result of ``get_kobak_genes`` (with ``n=100``) applied to the columns
        of ``df_tpm`` that appear in the gene list.
    """
    with open(fname) as f:
        # Skip lines with no tokens so a blank line cannot raise IndexError.
        genes = {fields[0] for fields in map(str.split, f) if fields}

    df_cam = df_tpm.loc[:, df_tpm.columns.isin(genes)]

    return get_kobak_genes(df_cam, n=100)

def generate_embeddings_data(perplexity_trans=10, perplexity_ephys=10, perplexity_cam=10):
    """Compute three sets of embeddings and write each to a TSV file.

    Outputs, written in this order:

    * ``Mapping/Embeddings/Lab_Pvalb-tpm.tsv`` -- embeddings of the lineage
      table (marker file ``Lin_Kobak``) plus ``nbt-SNE_X`` / ``nbt-SNE_Y``
      columns copied from the table returned by ``read_nbtsne_data`` via
      ``get_data``.
    * ``Mapping/Embeddings/Lab_Data-tpm.tsv`` -- embeddings of the combined
      table (marker file ``Lab_Kobak``).
    * ``Mapping/Embeddings/Lab_Pvalb-cam.tsv`` -- embeddings of the table
      returned by ``get_cam_data``, without marker-file feature selection.

    Parameters
    ----------
    perplexity_trans : int, optional
        Perplexity for the first two embeddings.
    perplexity_ephys : int, optional
        Accepted but not used anywhere in this function.
    perplexity_cam : int, optional
        Perplexity for the third embedding.

    Returns
    -------
    None
    """
    lin_tpm, lab_tpm, nbtsne = get_data()
    cam_tpm = get_cam_data(lin_tpm)

    # Lineage cells: standard embeddings plus the precomputed nbt-SNE
    # coordinates, which pandas aligns on the row index during assignment.
    lin_embedding = perform_embeddings(lin_tpm, perplexity=perplexity_trans, filename='Lin_Kobak')
    lin_embedding['nbt-SNE_X'] = nbtsne.Plot_X
    lin_embedding['nbt-SNE_Y'] = nbtsne.Plot_Y
    lin_embedding.to_csv('Mapping/Embeddings/Lab_Pvalb-tpm.tsv', sep='\t')

    # Combined lineage + OLM table.
    lab_embedding = perform_embeddings(lab_tpm, perplexity=perplexity_trans, filename='Lab_Kobak')
    lab_embedding.to_csv('Mapping/Embeddings/Lab_Data-tpm.tsv', sep='\t')

    # Gene-list-restricted table; its columns were already chosen upstream.
    cam_embedding = perform_embeddings(cam_tpm, perplexity=perplexity_cam, feature_selection=False)
    cam_embedding.to_csv('Mapping/Embeddings/Lab_Pvalb-cam.tsv', sep='\t')

In [3]:
# Run the full pipeline; it writes three TSV files under Mapping/Embeddings/.
# Note: perplexity_ephys is accepted by generate_embeddings_data but is not
# referenced in its body, so the value 6 passed here has no effect.
generate_embeddings_data(perplexity_trans=10, perplexity_ephys=6, perplexity_cam=6)

Compilation is falling back to object mode WITH looplifting enabled because Function "fuzzy_simplicial_set" failed type inference due to: Untyped global name 'nearest_neighbors': cannot determine Numba type of <class 'function'>

File "../../../../../../anaconda3/envs/analyze/lib/python3.6/site-packages/umap/umap_.py", line 467:
def fuzzy_simplicial_set(
    <source elided>
    if knn_indices is None or knn_dists is None:
        knn_indices, knn_dists, _ = nearest_neighbors(
        ^

  @numba.jit()

File "../../../../../../anaconda3/envs/analyze/lib/python3.6/site-packages/umap/umap_.py", line 350:
@numba.jit()
def fuzzy_simplicial_set(
^

  state.func_ir.loc))
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit

File "../../../../../../anaconda3/envs