In [1]:
import numpy as np
import pandas as pd
from Modules import rnaseqTools

In [2]:
def get_lin_data():
    """Load the Pvalb TPM table for cells with Age > 20, ordered by ascending Age.

    Reads two tab-separated files (header in the first row, first column used
    as the index):

    * 'Datasets/Lab_Pvalb-transcriptional_labels.tsv' -- per-cell labels; the
      literal value 'Other' is parsed as missing.
    * 'Datasets/Lab_Pvalb-tpm.tsv' -- expression table whose columns are
      selected by the index of the retained label rows.

    Returns
    -------
    pandas.DataFrame
        The TPM table restricted to (and ordered like) the retained label
        rows, with a two-level column index named ('Cell', 'CellType').
        'CellType' is taken from the 'Morph-PV-types' label column and may be
        missing for cells whose label was 'Other'.

    Raises
    ------
    KeyError
        If a retained label row has no matching column in the TPM table.
    """
    params = {'sep': '\t', 'header': 0, 'index_col': 0}

    fname = 'Datasets/Lab_Pvalb-transcriptional_labels.tsv'
    df_labels = pd.read_csv(fname, na_values='Other', **params)
    # Filter and sort in one chained expression so a fresh frame is produced;
    # sorting a `.loc` slice in place is a chained-assignment hazard.
    # Rows with a missing Age fail the comparison and are dropped here.
    df_labels = df_labels.loc[df_labels.Age > 20, :].sort_values('Age')

    fname = 'Datasets/Lab_Pvalb-tpm.tsv'
    df_tpm = pd.read_csv(fname, **params)
    # Column selection by label index both filters and reorders the cells.
    df_tpm = df_tpm.loc[:, df_labels.index].copy()

    arrays = [df_labels.index, df_labels['Morph-PV-types']]
    names = ('Cell', 'CellType')
    df_tpm.columns = pd.MultiIndex.from_arrays(arrays, names=names)

    return df_tpm

def get_olm_data():
    """Load the OLM TPM table restricted to cells labelled 'SST-OLM'.

    Reads 'Datasets/Lab_OLM-labels.tsv' and 'Datasets/Lab_OLM-tpm.tsv' (both
    tab-separated, header row, first column as index). TPM columns are kept
    where the label table's ``CellType`` column equals 'SST-OLM'.

    Returns
    -------
    pandas.DataFrame
        The selected TPM columns with a two-level column index named
        ('Cell', 'CellType'); every 'CellType' entry is 'SST-OLM'.
    """
    tsv_options = dict(sep='\t', header=0, index_col=0)

    cell_labels = pd.read_csv('Datasets/Lab_OLM-labels.tsv', **tsv_options)
    tpm = pd.read_csv('Datasets/Lab_OLM-tpm.tsv', **tsv_options)

    # Boolean Series keyed by cell; .loc applies it to the TPM columns.
    is_olm = cell_labels.CellType == 'SST-OLM'
    tpm = tpm.loc[:, is_olm]

    cell_types = ['SST-OLM' for _ in range(tpm.shape[1])]
    tpm.columns = pd.MultiIndex.from_arrays(
        [tpm.columns, cell_types],
        names=('Cell', 'CellType'),
    )

    return tpm

def get_cams(fname='/home/soma/Documents/Newest/analysisfiles/CAMmouse.txt'):
    """Return the gene names listed in a whitespace-delimited text file.

    The first whitespace-separated token of every non-blank line is taken as
    a gene name; 'Pcdhgc5' and 'Nectin2' are always appended at the end.

    Parameters
    ----------
    fname : str, optional
        Path of the list file. Defaults to the original hard-coded location;
        pass a different path to run on another machine.

    Returns
    -------
    list of str
        Gene names in file order followed by 'Pcdhgc5' and 'Nectin2'.
        Duplicates are not removed.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(fname) as f:
        # str.split() yields an empty list for blank/whitespace-only lines;
        # skip those instead of failing on fields[0].
        cams = [fields[0] for fields in map(str.split, f) if fields]

    return cams + ['Pcdhgc5', 'Nectin2']

def get_labels(df_cam):
    """Build the per-cell label table matching the rows of ``df_cam``.

    Parameters
    ----------
    df_cam : pandas.DataFrame
        Rows indexed by a MultiIndex with levels 'Cell' and 'CellType'; it
        must contain at least one row whose 'CellType' is 'SST-OLM'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df_cam`` restricted to cells present in the label table and
        re-indexed by 'Cell' only, and the label table with the columns
        'Morph-PV-types', 'MorphMarker-PV-types' and
        'MorphDirectional-PV-types'. SST-OLM cells receive 'SST-OLM' in all
        three columns.

    Notes
    -----
    The label table keeps the row order of the labels file (with SST-OLM
    cells added afterwards), while the returned expression table keeps the
    row order of ``df_cam``; the two are matched by index, not by position.
    """
    label_columns = ['Morph-PV-types', 'MorphMarker-PV-types', 'MorphDirectional-PV-types',]

    df_labels = pd.read_csv(
        'Datasets/Lab_Pvalb-transcriptional_labels.tsv',
        na_values='Other',
        sep='\t',
        header=0,
        index_col=0,
    )

    # keep only labelled cells that actually occur in the expression table
    in_expression = df_labels.index.isin(df_cam.index.get_level_values('Cell'))
    df_labels = df_labels.loc[in_expression, label_columns].copy()

    # every SST-OLM cell gets the same label in all three columns
    for olm_cell in df_cam.xs('SST-OLM', level='CellType', axis=0).index:
        df_labels.loc[olm_cell, :] = 'SST-OLM'

    # drop expression rows without a label row, then flatten the row index
    has_label = df_cam.index.get_level_values('Cell').isin(df_labels.index)
    df_cam = df_cam.loc[has_label, :].copy()
    df_cam.index = df_cam.index.get_level_values('Cell')

    return df_cam, df_labels

def read_data():
    """Assemble the log-transformed expression table of listed genes and its labels.

    Loads the two TPM tables via ``get_lin_data`` and ``get_olm_data``, keeps
    only the rows (genes) named by ``get_cams``, joins the tables column-wise,
    transposes so that cells are rows, and applies ``log2(1 + x)``. The result
    is passed to ``get_labels``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Whatever ``get_labels`` returns for the assembled table: the
        expression table and the per-cell label table.
    """
    df_lin = get_lin_data()
    df_olm = get_olm_data()

    cams = get_cams()

    # Filter each table by its OWN gene index. A mask computed from df_lin and
    # applied positionally to df_olm is only right when both tables list the
    # same genes in the same order; otherwise it picks the wrong rows or fails.
    df_lin = df_lin.loc[df_lin.index.isin(cams), :]
    df_olm = df_olm.loc[df_olm.index.isin(cams), :]

    # concat aligns the two tables on gene name before the transpose
    df = np.log2(1 + pd.concat((df_lin, df_olm), axis=1).T)

    df_cam, df_labels = get_labels(df)

    return df_cam, df_labels

def order_similarity_matrix(df_sim):
    """Reorder rows and columns of a type-by-type matrix into a fixed order.

    The ordering is chosen from the column labels: the presence of 'BIC'
    selects the four-type order, otherwise the presence of 'hBIC' selects the
    six-type order, otherwise the 'Vertical'/'Horizontal' order is used.

    Parameters
    ----------
    df_sim : pandas.DataFrame
        Square table whose row and column labels are cell-type names.

    Returns
    -------
    pandas.DataFrame
        ``df_sim`` with rows and columns selected in the chosen order.

    Raises
    ------
    KeyError
        If a type of the chosen order is absent from the rows or columns.
    """
    # (marker label, ordering used when the marker is among the columns),
    # checked in sequence; first hit wins
    candidates = (
        ('BIC', ['AAC', 'BIC', 'BC', 'SST-OLM']),
        ('hBIC', ['vAAC', 'vBIC', 'hBIC', 'vBC', 'hBC', 'SST-OLM']),
    )

    chosen = ['Vertical', 'Horizontal', 'SST-OLM']
    for marker, ordering in candidates:
        if marker in df_sim.columns:
            chosen = ordering
            break

    return df_sim.loc[chosen, chosen]

def get_similarity_matrix(df, labels):
    """Average a cell-by-cell similarity matrix into a type-by-type matrix.

    ``rnaseqTools.corr2`` is called on the rows of ``df`` against themselves
    (presumably a pairwise row correlation -- confirm against rnaseqTools).
    The diagonal (each cell against itself) is masked out, then entries are
    averaged first over rows and then over columns sharing the same label.
    Cells with a missing label are left out of the averages. The result is
    reordered by ``order_similarity_matrix``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell.
    labels : pandas.Series or sequence
        One label per row of ``df``. A Series whose index holds exactly the
        row labels of ``df`` (each once, in any order) is aligned to ``df``
        by index; anything else is matched to the rows by position.

    Returns
    -------
    pandas.DataFrame
        Square table of mean similarities between label groups.
    """
    # Match labels to cells by name when possible: attaching a Series
    # positionally mislabels cells whenever its row order differs from df's.
    if isinstance(labels, pd.Series) and not labels.index.equals(df.index):
        same_cells = (
            labels.index.is_unique
            and df.index.is_unique
            and len(labels) == len(df.index)
            and bool(labels.index.isin(df.index).all())
        )
        if same_cells:
            labels = labels.reindex(df.index)

    datalist = df.values
    # Own float copy, so masking the diagonal never touches the helper's output.
    corr = np.array(rnaseqTools.corr2(datalist, datalist), dtype=float)
    # Mask on the array itself; writing through DataFrame.values is not a
    # reliable way to modify a frame. np.nan is used because the np.NaN alias
    # no longer exists in NumPy 2.
    np.fill_diagonal(corr, np.nan)

    df_sim = pd.DataFrame(corr, index=labels, columns=labels)

    # Group rows, then columns (via transpose; groupby(axis=1) is deprecated).
    df_sim = df_sim.groupby(level=0).mean()
    df_sim = df_sim.T.groupby(level=0).mean().T
    df_sim = order_similarity_matrix(df_sim)

    return df_sim

def create_heat_dataframe(df, df_labels):
    """Lay out a gene-by-cell table for the heatmap, with blank spacers.

    Parameters
    ----------
    df : pandas.DataFrame
        Cells as rows, genes as columns; must contain every gene listed below.
    df_labels : pandas.DataFrame
        Indexed by cell, with a 'Morph-PV-types' column.

    Returns
    -------
    pandas.DataFrame
        Rows are the hard-coded genes, each gene set reversed and separated
        from the next by an all-missing row labelled ''. Columns are the
        non-'SST-OLM' cells grouped in the order vAAC, vBIC, hBIC, vBC, hBC,
        followed by one all-missing spacer column labelled ' '. The column
        index has levels ('Cell', 'CellType'); the row index is named 'Gene'.

    Raises
    ------
    KeyError
        If a listed gene or a selected cell is absent from ``df``.
    """
    # get target labels (copied so the spacer entry is added to a private Series)
    labels = df_labels['Morph-PV-types']
    labels = labels[labels != 'SST-OLM'].copy()
    labels[' '] = ' '

    # introduce empty column and genes
    # (np.nan rather than np.NaN: the capitalised alias is gone in NumPy 2)
    df = df.T.copy()
    df[' '] = np.nan
    df.loc['', :] = np.nan

    # order dataframe
    celltype_order = ['vAAC', 'vBIC', 'hBIC', 'vBC', 'hBC', ' ']
    cells = [
        cell
        for celltype in celltype_order
        for cell in labels[labels == celltype].index.tolist()
    ]
    gene_set = [['Bsg', 'Ptprn', 'Cntn1', 'Chl1', 'Ppfia2', 'Nrcam', 'Ptprs', 'App', 'Nrxn1', 'Nptn'],
            ['Vstm2a', 'Ntrk2', 'Nrxn3', 'Nlgn2', 'Negr1', 'Lrrc4', 'Igsf8', 'Fstl5', 'Cntnap4', 'Clstn3'],
            ['Sdk2', 'Lrfn2', 'Flrt1', 'Ephb2', 'Cbln2'],
            ['Nectin2', 'Ptprn2', 'Ptpn5', 'Nxph1', 'Igdcc4', 'Fam19a2', 'Epha10', 'Clstn2', 'Cd164'],
            ['Sema3e', 'Pcdhgc5', 'Epha4']
           ]
    # reverse each set and follow it with a spacer; the trailing spacer after
    # the last set is dropped by the [:-1]
    gene_set = [genes[::-1] + [''] for genes in gene_set]
    genes = [gene for genes in gene_set for gene in genes][:-1]
    df = df.loc[genes, cells].copy()

    # adjust column labels
    arrays = [cells, labels[cells]]
    df.columns = pd.MultiIndex.from_arrays(arrays, names=('Cell', 'CellType'))
    df.index.name = 'Gene'

    return df

def write_data(df_cam, df_labels):
    """Write the heatmap table and three similarity matrices as TSV files.

    Files written, in this order, all under 'CAM Data/':

    * 'PNAS heatmap.tsv' -- output of ``create_heat_dataframe``.
    * 'Morphology.tsv', 'Axomorphic.tsv', 'Dendromorphic.tsv' -- output of
      ``get_similarity_matrix`` for the 'Morph-PV-types',
      'MorphMarker-PV-types' and 'MorphDirectional-PV-types' label columns,
      with both axes named after the respective classification.

    Parameters
    ----------
    df_cam : pandas.DataFrame
        Expression table, one row per cell.
    df_labels : pandas.DataFrame
        Per-cell label table containing the three label columns above.

    Returns
    -------
    None
    """
    heat = create_heat_dataframe(df_cam, df_labels)
    heat.to_csv('CAM Data/PNAS heatmap.tsv', sep='\t')

    # (label column, axis title, output path)
    similarity_outputs = (
        ('Morph-PV-types', 'Morphological types', 'CAM Data/Morphology.tsv'),
        ('MorphMarker-PV-types', 'Axo-morphological types', 'CAM Data/Axomorphic.tsv'),
        ('MorphDirectional-PV-types', 'Dendro-morphological types', 'CAM Data/Dendromorphic.tsv'),
    )
    for label_column, axis_title, path in similarity_outputs:
        sim = get_similarity_matrix(df_cam, df_labels[label_column])
        sim.index.name = axis_title
        sim.columns.name = axis_title
        sim.to_csv(path, sep='\t')

In [3]:
# Build the log2(1 + TPM) table of the listed genes and the per-cell label table.
df_cam, df_labels = read_data()

In [4]:
# Write the heatmap table and the three similarity matrices under 'CAM Data/'.
write_data(df_cam, df_labels)