In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest

from Modules import rnaseqTools
from Modules import map_to_embeddings
from Modules import read_data

In [2]:
def write_defined_marker_genes():
    """
    Write marker genes that were generated by external sources to the relevant text files.

    Reads the header row of 'Matlab/Lab_Pvalb_150_nbtsne.tsv' and writes its tab-separated
    fields, one per line, to both 'Mapping/MarkerGenes/proMMT_Genes.txt' and
    'Mapping/MarkerGenes/PV_proMMT_Genes.txt'. Then reads
    'Mapping/MarkerGenes/Harris_Genes.txt' and writes the first whitespace-separated
    token of every non-blank line to 'Mapping/MarkerGenes/PV_Harris_Genes.txt'.

    All paths are relative to the current working directory. Returns None.
    """

    # write genes that were generated by proMMT to file
    fname = 'Matlab/Lab_Pvalb_150_nbtsne.tsv'
    with open(fname) as f:
        genes = f.readline().strip().split('\t')

    # the same gene list is stored under both the general and the PV-specific title
    for target in ('Mapping/MarkerGenes/proMMT_Genes.txt',
                   'Mapping/MarkerGenes/PV_proMMT_Genes.txt'):
        with open(target, 'w') as w:
            w.write('\n'.join(genes))

    # write genes from Harris et al., 2018 to file
    fname = 'Mapping/MarkerGenes/Harris_Genes.txt'
    with open(fname) as f:
        # blank / whitespace-only lines have no first token; skip them instead of
        # failing with an IndexError
        genes = [line.split()[0] for line in f if line.strip()]

    with open('Mapping/MarkerGenes/PV_Harris_Genes.txt', 'w') as w:
        w.write('\n'.join(genes))

def get_marker_genes(title='PV_Harris_Genes'):
    """
    get genes that are used as markers for a dataset
    Inputs:
        title - title of file to use; the file read is
                'Mapping/MarkerGenes/<title>.txt'. Default: 'PV_Harris_Genes'
    Outputs:
        genes - list of marker genes (first whitespace-separated token of each
                non-blank line, in file order)
    """

    with open('Mapping/MarkerGenes/%s.txt' % title) as f:
        # skip blank / whitespace-only lines (e.g. a trailing empty line), which
        # would otherwise raise IndexError on the [0] lookup
        genes = [line.split()[0] for line in f if line.strip()]

    return genes

def trim_to_continents(df, continents=(1,)):
    """
    A function used to be able to zoom in on a part of a plot.
    Inputs:
        df - pandas dataframe of data; must have an index level named 'Continent'
             and columns 'Plot_X' and 'Plot_Y'
        continents - list (or other list-like) of continents to keep. Default: (1,)
    Outputs:
        df - pandas dataframe (a copy) of only data in the relevant region
    """

    def _padded_range(values):
        # to remove outliers, get range of 2nd to 98th percentiles, and then
        # expand the range by 5% on each side
        low, high = values.quantile(.02), values.quantile(.98)
        pad = .05 * (high - low)
        return low - pad, high + pad

    # trim to data on continents
    on_continent = df.index.get_level_values('Continent').isin(continents)
    df = df.loc[on_continent, :]

    # take the bounds from the named columns rather than unpacking df.quantile(),
    # which only works when the frame has exactly two columns in X, Y order
    X_min, X_max = _padded_range(df.Plot_X)
    Y_min, Y_max = _padded_range(df.Plot_Y)

    # keep all cells that are in the given range (bounds inclusive)
    X_keep = df.Plot_X.between(X_min, X_max)
    Y_keep = df.Plot_Y.between(Y_min, Y_max)

    return df.loc[X_keep & Y_keep, :].copy()

def read_in_data():
    """
    Load the expression datasets used in this notebook and filter their gene columns.

    Outputs:
        df_harris - 'GSE99888' labeled data, columns kept when the fraction of rows
                    with a value > 0 is strictly between .04 and .96
        df_pvalb - 'Harris_Pvalb' sub data of 'GSE99888', filtered the same way
        df_lin - 'Lab_Pvalb' labeled data (age_cutoff 20), columns kept when more
                 than 2 rows are > 0 and at least one row is exactly 0
        df_lab - 'Lab_Pvalb' and 'Lab_OLM' stacked, without rows whose 'Cell' index
                 level is 'Htr3a-OLM', filtered like df_lin
        genes_harris - marker genes from get_marker_genes() with its default title
    """

    def _detected_in_some_not_all(frame):
        # columns with more than 2 positive rows, then those with at least one zero row
        frame = frame.loc[:, (frame.values > 0).sum(axis=0) > 2]
        return frame.loc[:, (frame.values == 0).sum(axis=0) > 0].copy()

    def _mid_detection_rate(frame):
        # columns whose fraction of positive rows lies strictly inside (.04, .96)
        rate = (frame.values > 0).mean(axis=0)
        return frame.loc[:, np.logical_and(rate > .04, rate < .96)]

    # every reader result is transposed before use
    harris_tpm = read_data.read_labeled_tpm_data('GSE99888').T
    lin_tpm = read_data.read_labeled_tpm_data('Lab_Pvalb', tpm_args={'age_cutoff':20}).T
    olm_tpm = read_data.read_labeled_tpm_data('Lab_OLM').T
    pvalb_tpm = read_data.read_sub_data('Harris_Pvalb', 'GSE99888').T

    # the combined lab frame is built from the *unfiltered* Lab_Pvalb data
    lab_tpm = pd.concat((lin_tpm, olm_tpm), axis=0)
    lab_tpm = lab_tpm.loc[lab_tpm.index.get_level_values('Cell') != 'Htr3a-OLM', :]
    df_lab = _detected_in_some_not_all(lab_tpm)

    df_harris = _mid_detection_rate(harris_tpm)
    df_pvalb = _mid_detection_rate(pvalb_tpm)
    df_lin = _detected_in_some_not_all(lin_tpm)

    genes_harris = get_marker_genes()

    # return values are not used; presumably these annotate the frames in place
    # with the 'Continent' index level read elsewhere in this file - TODO confirm
    read_data.add_continents(df_harris)
    read_data.add_continents(df_pvalb)

    return df_harris, df_pvalb, df_lin, df_lab, genes_harris

def get_cam_subsets(*dfs, fname='/home/soma/Documents/Newest/analysisfiles/CAMmouse.txt'):
    """
    Restrict each dataframe to the columns named in the CAM gene file.
    Inputs:
        *dfs - any number of pandas dataframes whose columns are gene names
        fname - keyword-only path of the gene list file; the first
                whitespace-separated token of each non-blank line is used as a
                gene name. Default: the original hard-coded location, so existing
                calls behave as before
    Outputs:
        dfs - list with one column-filtered dataframe per input, in input order
    """

    with open(fname) as f:
        # skip blank / whitespace-only lines, which have no first token
        cams = [line.split()[0] for line in f if line.strip()]

    return [df.loc[:, df.columns.isin(cams)] for df in dfs]

# Row-by-row correlation matrix between two 2-D arrays
def corr2(A,B):
    """
    Correlate every row of A with every row of B.
    Inputs:
        A - 2-D numpy array, one variable per row
        B - 2-D numpy array with the same number of columns as A
    Outputs:
        C - array of shape (rows of A, rows of B); C[i, j] is the correlation
            between A[i] and B[j]
    """
    # centre each row on its own mean
    centered_a = A - A.mean(axis=1, keepdims=True)
    centered_b = B - B.mean(axis=1, keepdims=True)

    # per-row sums of squares, kept as column vectors
    sumsq_a = np.sum(centered_a**2, axis=1, keepdims=True)
    sumsq_b = np.sum(centered_b**2, axis=1, keepdims=True)

    covariance = centered_a.dot(centered_b.T)
    scale = np.sqrt(sumsq_a.dot(sumsq_b.T))
    return covariance / scale

def get_correlation_matrix():
    """
    Build a dataframe of row-by-row correlations between two frames from read_in_data().

    NOTE(review): this unpacks exactly two values from read_in_data(), but
    read_in_data() as defined in this file returns five
    (df_harris, df_pvalb, df_lin, df_lab, genes_harris), so calling this function
    raises ValueError at the first line. It is not called in the visible cells.
    TODO confirm which two frames were intended, or remove the function.

    Outputs:
        df_corr - dataframe of corr2 values, indexed by df_marker's index with
                  df_tpm's index as columns
    """
    df_marker, df_tpm = read_in_data()
    # corr2 correlates rows of the first array with rows of the second
    data_corr = corr2(df_marker.values, df_tpm.values)
    df_corr = pd.DataFrame(data_corr, index=df_marker.index, columns = df_tpm.index)
    
    return df_corr

def save_correlating(title, df_main, markers, n=1000):
    """
    Save the non-marker genes that correlate most strongly with any marker gene.
    Inputs:
        title - title of the output file, written to 'Mapping/MarkerGenes/<title>.txt'
        df_main - pandas dataframe of expression values with genes as columns
        markers - list of marker gene names
        n - number of genes to keep. Default: 1000
    Outputs:
        None; the selected gene names are written one per line
    """

    # work on log2(1 + x) transformed values
    log_expr = np.log2(1+df_main)

    # correlation of every marker gene (rows) with every gene (columns)
    is_marker = log_expr.columns.isin(markers)
    marker_expr = log_expr.loc[:, is_marker]
    corr_values = corr2(marker_expr.values.T, log_expr.values.T)
    corr_table = pd.DataFrame(corr_values, index=marker_expr.columns, columns=log_expr.columns)

    # score each gene by its largest absolute correlation with any marker
    best_abs_corr = corr_table.abs().max(axis=0)

    # rank the non-marker genes and keep the n highest-scoring ones
    candidates = best_abs_corr.loc[~is_marker]
    top_genes = candidates.nlargest(n).index.tolist()

    out_path = 'Mapping/MarkerGenes/%s.txt' % title
    with open(out_path, 'w') as out:
        out.write('\n'.join(top_genes))

def get_feature_selection(df_tpm, method, title, n=1000):
    """
    Select the n best-scoring genes with a scikit-learn scoring function and save them.
    Inputs:
        df_tpm - pandas dataframe with genes as columns and an index level named
                 'Continent' that is used as the class label
        method - scoring function passed to SelectKBest (the notebook passes
                 f_classif, chi2 and mutual_info_classif)
        title - title of the output file, written to 'Mapping/MarkerGenes/<title>.txt'
        n - number of genes to select. Default: 1000
    Outputs:
        None; the selected gene names are written one per line
    """
    # initialize variables
    # chi2 is scored on the expression values themselves; every other method is
    # scored on a binary detected / not-detected matrix. The previous check compared
    # the scoring function with the string 'Chi2', which is never equal, so the
    # chi2 branch could not be reached.
    if method is chi2:
        X = df_tpm.values
    else:
        X = (df_tpm.values>0).astype(int)
    Y = df_tpm.index.get_level_values('Continent').values

    # get scores
    select = SelectKBest(method, k=n)
    select.fit(X, Y)

    # get top genes
    genes = df_tpm.columns[select.get_support()].tolist()
    with open('Mapping/MarkerGenes/%s.txt' % title, 'w') as w:
        w.write('\n'.join(genes))

def get_kobak_genes(df_tpm, title, n=1000, threshold=32):
    """
    Select genes with rnaseqTools.geneSelection and save their names.
    Inputs:
        df_tpm - pandas dataframe with genes as columns
        title - title of the output file, written to 'Mapping/MarkerGenes/<title>.txt'
        n - forwarded to geneSelection as n. Default: 1000
        threshold - forwarded to geneSelection as threshold. Default: 32
    Outputs:
        None; the selected gene names are written one per line
    """
    # geneSelection's result is used as a column selector for .loc
    # (presumably a boolean mask over the columns - TODO confirm)
    selected = rnaseqTools.geneSelection(df_tpm.values, n=n, threshold=threshold, plot=False)

    gene_names = df_tpm.loc[:, selected].columns.tolist()

    out_path = 'Mapping/MarkerGenes/%s.txt' % title
    with open(out_path, 'w') as out:
        out.write('\n'.join(gene_names))

## Read in Raw Data

In [3]:
%%time

# number of genes to write per marker list: full gene sets vs. CAM-restricted sets
number = 150
cam_number = 100

# load the filtered expression frames and the Harris marker genes
df_harris, df_pvalb, df_lin, df_lab, genes_harris = read_in_data()

# the same frames restricted to the genes listed in the CAM file
df_harris_cam, df_pvalb_cam, df_lin_cam = get_cam_subsets(df_harris, df_pvalb, df_lin)

CPU times: user 32.4 s, sys: 1.02 s, total: 33.5 s
Wall time: 33.5 s


## Get best correlating genes + proMMT markers

In [4]:
%%time

# externally defined marker lists first
write_defined_marker_genes()

# one correlation-based list per dataset: (output title, frame, number of genes)
for corr_title, corr_frame, corr_n in (
    ('Correlation', df_harris, number),
    ('PV_Correlation', df_pvalb, number),
    ('CAM_Correlation', df_harris_cam, cam_number),
    ('CAM_PV_Correlation', df_pvalb_cam, cam_number),
):
    save_correlating(corr_title, corr_frame, genes_harris, n=corr_n)

CPU times: user 11.7 s, sys: 2.24 s, total: 13.9 s
Wall time: 8.91 s


## Get best genes for each feature selection method

In [5]:
%%time

# scoring functions and the file-title stem used for each
methods = f_classif, chi2, mutual_info_classif
titles = ['F_Classif', 'Chi2', 'Mutual_Information']

# datasets to score: (frame, title prefix, number of genes)
fs_datasets = (
    (df_harris, '', number),
    (df_pvalb, 'PV_', number),
    (df_harris_cam, 'CAM_', cam_number),
    (df_pvalb_cam, 'CAM_PV_', cam_number),
)

# generate key genes: every method on every dataset
for method, title in zip(methods, titles):
    for fs_frame, fs_prefix, fs_n in fs_datasets:
        get_feature_selection(fs_frame, method, fs_prefix + title, n=fs_n)

CPU times: user 7min 57s, sys: 618 ms, total: 7min 58s
Wall time: 7min 58s


## Get best genes using the Kobak et al. feature selection method

In [6]:
%%time

# one Kobak-style gene list per dataset: (frame, output title, number of genes)
for kobak_frame, kobak_title, kobak_n in (
    (df_harris, 'Kobak', number),
    (df_pvalb, 'PV_Kobak', number),
    (df_lin, 'Lin_Kobak', number),
    (df_lab, 'Lab_Kobak', number),
    (df_harris_cam, 'CAM_Kobak', cam_number),
    (df_pvalb_cam, 'CAM_PV_Kobak', cam_number),
):
    get_kobak_genes(kobak_frame, kobak_title, threshold=32, n=kobak_n)

CPU times: user 1.7 s, sys: 108 ms, total: 1.81 s
Wall time: 1.81 s
