In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest

from Modules import rnaseqTools

In [2]:
def read_tpm_data(dataset, age_cutoff = None):
    """Load a TPM expression table and label its columns with (Cell, CellType).

    Reads ``Datasets/<dataset>-tpm.tsv`` and ``Datasets/<dataset>-labels.tsv``
    (tab-separated, first row is the header, first column is the index).

    Parameters
    ----------
    dataset : str
        Prefix of the two files inside ``Datasets/``.
    age_cutoff : number, optional
        When given, only cells whose ``Age`` label is strictly greater than
        this value are kept.

    Returns
    -------
    pandas.DataFrame
        Expression table whose columns are a two-level MultiIndex named
        ``('Cell', 'CellType')``, ordered as the rows of the labels file.

    Raises
    ------
    KeyError
        If a cell listed in the labels file has no column in the TPM table.
    """
    params = {'sep': '\t', 'header': 0, 'index_col': 0}
    df = pd.read_csv('Datasets/%s-tpm.tsv' % dataset, **params)
    df_labels = pd.read_csv('Datasets/%s-labels.tsv' % dataset, **params)

    if age_cutoff is not None:
        df_labels = df_labels.loc[df_labels.Age > age_cutoff, :]

    # Always pick the expression columns by cell name. Previously this only
    # happened when age_cutoff was set; otherwise the labels were attached
    # positionally, which mislabels cells if the two files are ordered
    # differently.
    df = df.loc[:, df_labels.index]

    arrays = [df_labels.index, df_labels.CellType]
    df.columns = pd.MultiIndex.from_arrays(arrays, names=('Cell', 'CellType'))

    return df

def get_marker_genes(fname='Mapping/MarkerGenes/Harris_Genes.txt'):
    """Read a marker-gene list, taking the first whitespace-separated token per line.

    Parameters
    ----------
    fname : str, optional
        Path of the text file to read. Defaults to the Harris marker list.

    Returns
    -------
    list of str
        One gene name per non-blank line, in file order.
    """
    genes = []
    with open(fname) as f:
        for line in f:
            fields = line.split()
            # A blank or whitespace-only line has no first token; skip it
            # instead of failing with IndexError.
            if fields:
                genes.append(fields[0])

    return genes

def get_lab_data(age_cutoff=20):
    """Load the lab Pvalb cells and the combined Pvalb + SST-OLM lab table.

    Parameters
    ----------
    age_cutoff : number, optional
        Only ``Lab_Pvalb`` cells whose ``Age`` label is strictly greater than
        this value are kept (default 20, the value previously hard-coded).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df_lin``: the age-filtered ``Lab_Pvalb`` table.
        ``df_lab``: ``df_lin`` with the ``Lab_OLM`` columns whose ``CellType``
        is ``'SST-OLM'`` appended column-wise.
    """
    # read_tpm_data already implements the Age filter, so the labels file does
    # not need to be read and filtered a second time here. Columns now follow
    # the order of the labels file.
    df_lin = read_tpm_data('Lab_Pvalb', age_cutoff=age_cutoff)
    df_olm = read_tpm_data('Lab_OLM')

    is_olm = df_olm.columns.get_level_values('CellType') == 'SST-OLM'
    df_olm = df_olm.loc[:, is_olm]

    df_lab = pd.concat((df_lin, df_olm), axis=1)

    return df_lin, df_lab

def read_in_data():
    """Load and log-transform every expression table used by the notebook.

    Returns
    -------
    tuple of five pandas.DataFrame, each transformed as ``log2(1 + x)``:
        df_marker : marker genes only, Pvalb*/Sst.Erbb4* cells (no rate filter)
        df_tpm    : all GSE99888 cells, genes filtered by detection rate
        df_sub    : Pvalb*/Sst.Erbb4* cells, same gene filter as ``df_tpm``
        df_lin    : first table returned by ``get_lab_data``
        df_lab    : second table returned by ``get_lab_data``
    """
    df_tpm = read_tpm_data('GSE99888')

    # Keep only cells whose type label starts with 'Pvalb' or 'Sst.Erbb4'.
    type_labels = df_tpm.columns.get_level_values('CellType')
    wanted_cells = np.logical_or(
        type_labels.str.startswith('Pvalb'),
        type_labels.str.startswith('Sst.Erbb4'),
    )
    df_sub = df_tpm.loc[:, wanted_cells]

    # Marker rows are taken before the detection-rate filter is applied.
    marker_names = get_marker_genes()
    df_marker = df_sub.loc[df_tpm.index.isin(marker_names), :]

    # Detection rate is measured on the cell subset, then the same gene mask
    # is applied to both the full and the subset table.
    detection_rate = (df_sub.values > 0).mean(axis=1)
    gene_mask = np.logical_and(detection_rate > .04, detection_rate < .96)
    df_tpm = df_tpm.loc[gene_mask, :]
    df_sub = df_sub.loc[gene_mask, :]

    df_lin, df_lab = get_lab_data()

    tables = (df_marker, df_tpm, df_sub, df_lin, df_lab)
    return tuple(np.log2(1 + table) for table in tables)

# Computing the matrix of correlations
def corr2(A,B):
    """Pearson correlation between every row of ``A`` and every row of ``B``.

    Parameters
    ----------
    A : numpy.ndarray, shape (m, k)
    B : numpy.ndarray, shape (n, k)

    Returns
    -------
    numpy.ndarray, shape (m, n)
        Entry ``[i, j]`` is the correlation of ``A[i]`` with ``B[j]``.
    """
    # Centre each row on its own mean.
    A_centered = A - A.mean(axis=1, keepdims=True)
    B_centered = B - B.mean(axis=1, keepdims=True)

    # Per-row sums of squares, kept as column vectors.
    A_sumsq = (A_centered**2).sum(axis=1, keepdims=True)
    B_sumsq = (B_centered**2).sum(axis=1, keepdims=True)

    cross_products = np.dot(A_centered, B_centered.T)
    normaliser = np.sqrt(np.dot(A_sumsq, B_sumsq.T))
    return cross_products / normaliser

def get_correlation_matrix():
    """Correlate every marker gene with every gene of the cell-subset table.

    Returns
    -------
    pandas.DataFrame
        Rows are marker genes (index of ``df_marker``), columns are the genes
        of ``df_sub``; values come from ``corr2``.
    """
    # read_in_data returns five tables; unpacking into two names raised
    # ValueError. df_sub is the table to correlate against because df_marker
    # is sliced from it and therefore shares its cells (columns); the full
    # df_tpm has a different number of cells.
    df_marker, _, df_sub, _, _ = read_in_data()
    data_corr = corr2(df_marker.values, df_sub.values)
    df_corr = pd.DataFrame(data_corr, index=df_marker.index, columns=df_sub.index)

    return df_corr

def save_correlation_genes(df_marker, df_tpm, n=1000):
    """Write the ``n`` genes most correlated with any marker gene to disk.

    Each gene of ``df_tpm`` is scored by its largest absolute correlation
    (via ``corr2``) with any row of ``df_marker``. The top ``n`` gene names
    are written, one per line, to
    ``Mapping/MarkerGenes/Pvalb_Correlation.txt``.

    Parameters
    ----------
    df_marker : pandas.DataFrame
        Marker genes in rows.
    df_tpm : pandas.DataFrame
        Candidate genes in rows; must have the same columns as ``df_marker``.
    n : int, optional
        Number of genes to keep.
    """
    # marker-by-gene correlation table
    corr_values = corr2(df_marker.values, df_tpm.values)
    corr_table = pd.DataFrame(corr_values, index=df_marker.index, columns=df_tpm.index)

    # best absolute correlation per candidate gene, over all markers
    best_score = corr_table.abs().max(axis=0)
    top_genes = best_score.nlargest(n).index.tolist()

    out_path = 'Mapping/MarkerGenes/Pvalb_Correlation.txt'
    with open(out_path, 'w') as handle:
        handle.write('\n'.join(top_genes))

def get_feature_selection_genes(df, method, title, n=1000):
    """Pick the ``n`` best-scoring genes with ``SelectKBest`` and write them to disk.

    Genes are first restricted to those detected (value > 0) in more than 20%
    and fewer than 80% of the columns of ``df``. The classification target is
    whether a column's ``CellType`` label starts with ``'Pvalb.C1ql1'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Genes in rows; the columns must carry a ``CellType`` level.
    method : callable
        Score function handed to ``SelectKBest`` (e.g. ``chi2``).
    title : str
        Suffix of the output file ``Mapping/MarkerGenes/Pvalb_<title>.txt``.
    n : int, optional
        Number of genes to keep (the ``k`` of ``SelectKBest``).

    Returns
    -------
    None
        The selected gene names are written one per line.
    """
    # keep genes with an intermediate detection rate
    rates = (df.values>0).mean(axis=1)
    df = df.loc[np.logical_and(rates>.20, rates<.80)]
    # NOTE(review): the call site in this notebook passes score *functions* as
    # `method`. A function never equals the string 'Chi2', so this condition is
    # always true there and every method (chi2 included) receives the binarized
    # detected / not-detected matrix. The else branch is only reached when the
    # literal string 'Chi2' is passed. Confirm whether chi2 was meant to get
    # the raw values (e.g. by testing `title` instead) before changing this,
    # since that would alter the generated gene lists.
    if method != 'Chi2':
        X = (df.values>0).astype(int).T
    else:
        X = df.values.T
    # binary target: True for columns whose CellType starts with 'Pvalb.C1ql1'
    celltypes = df.columns.get_level_values('CellType')
    Y = celltypes.str.startswith('Pvalb.C1ql1')
        
    # score the genes and keep the names of the n selected ones
    select = SelectKBest(method, k=n)
    select.fit(X, Y)
    genes = df.index[select.get_support()].tolist()
    
    with open('Mapping/MarkerGenes/Pvalb_%s.txt' % title, 'w') as w:
        w.write('\n'.join(genes))
        
    return

def get_kobak_genes(df_tpm, n=3000, threshold=32, title='Pvalb_Kobak', atleast=10):
    """Select genes with ``rnaseqTools.geneSelection`` and write their names to disk.

    Parameters
    ----------
    df_tpm : pandas.DataFrame
        Genes in rows. Presumably log2(1 + TPM) values, since they are mapped
        through ``2**x - 1`` before selection -- verify against the caller.
    n, threshold, atleast
        Forwarded unchanged to ``rnaseqTools.geneSelection``.
    title : str, optional
        Base name of the output file ``Mapping/MarkerGenes/<title>.txt``.

    Returns
    -------
    None
        The selected gene names are written one per line.
    """
    # 2**x - 1 is the inverse of log2(1 + x); the matrix is transposed so the
    # genes end up on the second axis.
    linear_values = np.power(2, df_tpm.values.T) - 1
    selection = rnaseqTools.geneSelection(
        linear_values,
        n=n,
        threshold=threshold,
        plot=False,
        atleast=atleast,
    )

    # translate the selector returned above into gene names
    gene_names = df_tpm.loc[selection, :].index.tolist()

    out_path = 'Mapping/MarkerGenes/%s.txt' % title
    with open(out_path, 'w') as handle:
        handle.write('\n'.join(gene_names))

def generate_reduced_data(df_tpm, title, random_state=0):
    """Project the cells of ``df_tpm`` onto 50 principal components of a gene list.

    The gene list is read from ``Mapping/MarkerGenes/<title>.txt`` (first
    whitespace-separated token of each non-blank line). The PCA scores are
    written, tab-separated, to ``Mapping/Datasets/Harris_Subset_<title>.txt``.

    Parameters
    ----------
    df_tpm : pandas.DataFrame
        Genes in rows, cells in columns.
    title : str
        Name of the gene list; also the suffix of the output file.
    random_state : int or None, optional
        Seed handed to ``PCA`` so repeated runs give the same file. PCA may
        pick a randomized solver for large inputs, which is otherwise not
        reproducible.

    Returns
    -------
    None
    """
    with open('Mapping/MarkerGenes/%s.txt' % title) as f:
        # Skip blank / whitespace-only lines instead of raising IndexError.
        genes = [fields[0] for fields in (line.split() for line in f) if fields]

    # cells in rows, selected genes in columns
    df_reduced = df_tpm.loc[df_tpm.index.isin(genes),:].T

    pca = PCA(n_components=50, random_state=random_state)
    datalist = pca.fit_transform(df_reduced.values)

    # NOTE(review): component columns are labelled starting at 50 rather than
    # 0 or 1. Kept as in the original because consumers of the file may rely
    # on these labels -- confirm whether the offset is intentional.
    component_labels = np.arange(datalist.shape[1])+50
    df_pca = pd.DataFrame(datalist, index=df_reduced.index, columns=component_labels)

    df_pca.to_csv('Mapping/Datasets/Harris_Subset_%s.txt' % title, sep='\t')

    return

In [3]:
%%time 
# Load all five log2(1 + x) expression tables once; the cells below reuse
# these variables instead of re-reading the files.
df_marker, df_tpm, df_sub, df_lin, df_lab = read_in_data()

CPU times: user 30 s, sys: 1.04 s, total: 31.1 s
Wall time: 31.1 s


In [4]:
%%time

# number: how many genes each selection method keeps.
# methods / titles: parallel sequences -- each score function is paired with
# the name used in its output file.
number = 150
methods = f_classif, chi2, mutual_info_classif
titles = ['F_Classif', 'Chi2', 'Mutual_Information']

# For each gene list: write the list (selected on the cell subset df_sub),
# then write the 50-component PCA of the full df_tpm restricted to that list.
save_correlation_genes(df_marker, df_sub, n=number)
generate_reduced_data(df_tpm, 'Pvalb_Correlation')
for method, title in zip(methods, titles):
    get_feature_selection_genes(df_sub, method, title, n=number)
    generate_reduced_data(df_tpm, 'Pvalb_%s' % title)

CPU times: user 32.5 s, sys: 3.54 s, total: 36.1 s
Wall time: 29.7 s


In [5]:
%%time

# Gene list selected on the cell subset (written as Pvalb_Kobak.txt, the
# default title), followed by the PCA of df_tpm restricted to that list.
get_kobak_genes(df_sub, n=150)
generate_reduced_data(df_tpm, 'Pvalb_Kobak')

CPU times: user 1.3 s, sys: 843 ms, total: 2.15 s
Wall time: 769 ms


In [6]:
%%time

# Same selection, run on the two lab tables returned by read_in_data.
get_kobak_genes(df_lin, n=150, title='Pvalb_Kobak_Lin', atleast=10)
get_kobak_genes(df_lab, n=150, title='Pvalb_Kobak_Lab', atleast=10)
# NOTE(review): only the 'Pvalb_Kobak_Lin' list is turned into a reduced
# data set here; the 'Pvalb_Kobak_Lab' list is written above but not reduced
# in this cell -- confirm whether that is intentional.
generate_reduced_data(df_tpm, 'Pvalb_Kobak_Lin')

CPU times: user 1.21 s, sys: 738 ms, total: 1.95 s
Wall time: 606 ms
