In [1]:
import numpy as np
import pandas as pd

In [3]:
def read_dataset(dataset, celltypes=(), age=0, trim_type=0, drop=(), keep=(), trim_by='.', basedir=None):
    """Load a dataset's value table and attach a cell-type label to every column.

    Two tab-separated files are read from ``basedir``:
    ``<dataset>-tpm.tsv`` (first column used as the row index, remaining
    columns are cells) and ``<dataset>-labels.tsv`` (indexed by cell, with a
    ``CellType`` column and optionally an ``Age`` column).

    Parameters
    ----------
    dataset : str
        Dataset name; used to build both file names.
    celltypes : sequence of str, optional
        If non-empty, keep only cells whose ``CellType`` is exactly one of
        these values. Applied before trimming and renaming.
    age : number, optional
        If greater than 0 and the labels have an ``Age`` column, cells with
        ``Age < age`` get the suffix ``' (<P{age})'`` and all remaining cells
        get ``' (>P{age})'``. Otherwise a fixed per-dataset suffix is used
        for the datasets that have one.
    trim_type : int, optional
        If greater than 0, keep only the first ``trim_type`` components of
        each label after splitting on ``trim_by``. Labels with fewer
        components are left whole (no trailing separator is appended).
    drop : sequence of str, optional
        Label prefixes; cells whose ``CellType`` starts with any are removed.
    keep : sequence of str, optional
        Label prefixes; if non-empty, only cells whose ``CellType`` starts
        with at least one of them are kept.
    trim_by : str, optional
        Separator used by ``trim_type``.
    basedir : str, optional
        Directory holding the two files. When ``None`` the built-in location
        for ``dataset`` is used.

    Returns
    -------
    pandas.DataFrame
        The value table restricted to the retained cells, with columns as a
        two-level ``MultiIndex`` named ``('Cell', 'CellType')``. Only cell
        types represented by more than two cells are retained.
    """
    if basedir is None:
        if dataset != 'GSE115746':
            basedir = 'Datasets'
        else:
            # NOTE(review): machine-specific absolute path; pass `basedir`
            # to read this dataset from another location.
            basedir = '/media/soma/DavidWork/backup_4/Newest/Newest/Soma Paper/'\
                      'Produce Figures/Data Analysis/Tasic Analysis/Datasets'

    values = pd.read_csv('%s/%s-tpm.tsv' % (basedir, dataset),
                         sep='\t', header=0, index_col=0)
    labels = pd.read_csv('%s/%s-labels.tsv' % (basedir, dataset),
                         sep='\t', header=0, index_col=0)

    # --- select cells by label -------------------------------------------
    if len(celltypes) > 0:
        labels = labels.loc[labels['CellType'].isin(celltypes), :]
    for prefix in drop:
        labels = labels.loc[~labels['CellType'].str.startswith(prefix), :]
    if len(keep) > 0:
        kept = np.zeros(labels.shape[0], dtype=bool)
        for prefix in keep:
            kept = np.logical_or(kept, labels['CellType'].str.startswith(prefix))
        labels = labels.loc[kept, :]

    # Work on an independent frame so the label edits below never write into
    # a slice of the frame returned by read_csv.
    labels = labels.copy()

    # --- shorten labels to their leading components ------------------------
    if trim_type > 0:
        # Slice-then-join keeps short labels intact and tolerates a
        # trim_type larger than the number of components present.
        components = labels['CellType'].str.split(trim_by)
        labels['CellType'] = components.str[:trim_type].str.join(trim_by)

    # --- abbreviate long label names ----------------------------------------
    converter = {'CA1 fast-spiking interneuron':'CA1 FS INT',
                 'CA1 regular-spiking interneuron':'CA1 RS INT',
                 'CA1 pyramidal cell':'CA1 PYR',
                 'Subiculum burst-spiking pyramidal cell':'Subiculum BS PYR',
                 'Subiculum regular-spiking pyramidal cell':'Subiculum RS PYR',
                 'Layer_2/3_Pyramidal':'L2/3 PYR',
                 'Layer_I_Astrocyte':'L1 Astrocyte',
                 'Layer_I_Interneuron':'L1 INT'
                }
    labels['CellType'] = labels['CellType'].map(lambda name: converter.get(name, name))

    # --- append an age suffix -------------------------------------------------
    # Fixed suffixes for datasets labelled as a whole (presumably the
    # postnatal-day range of each dataset -- confirm against the sources).
    dataset_suffix = {'GSE115746': ' (>P53)',
                      'Gouwens': ' (>P45)',
                      'GSE70844': ' (<P25)',
                      'GSE99888': ' (>P25)'}
    if 'Age' in labels.columns and age > 0:
        is_young = labels['Age'] < age
        # The suffix reports the threshold actually used, so it stays
        # correct for values of `age` other than 25.
        labels.loc[is_young, 'CellType'] += ' (<P%g)' % age
        labels.loc[~is_young, 'CellType'] += ' (>P%g)' % age
    elif dataset in dataset_suffix:
        labels['CellType'] += dataset_suffix[dataset]

    # --- drop cell types with too few cells -----------------------------------
    cells_per_type = labels['CellType'].value_counts()
    frequent_types = cells_per_type[cells_per_type > 2].index
    labels = labels.loc[labels['CellType'].isin(frequent_types), :]

    # --- align the value table with the retained cells ------------------------
    values = values.loc[:, labels.index].copy()
    values.columns = pd.MultiIndex.from_arrays((values.columns, labels['CellType']),
                                               names=('Cell', 'CellType'))

    return values

def read_hemoglobin_data(dataset, genes=['Hbb-bs'], read_args={}):
    """Return log2(1 + value) for the requested rows of a dataset.

    Parameters
    ----------
    dataset : str
        Dataset name handed to ``read_dataset``.
    genes : list of str
        Row labels to retain; labels absent from the table are ignored.
    read_args : dict
        Extra keyword arguments forwarded to ``read_dataset``.

    Returns
    -------
    pandas.DataFrame
        The selected rows, transformed element-wise as ``log2(1 + x)``.
    """
    table = read_dataset(dataset, **read_args)
    wanted_rows = table.index.isin(genes)
    selected = table.loc[wanted_rows, :]

    return np.log2(1 + selected)

def write_hemoglobin_data(dataset, genes=['Hba-a1', 'Hba-a2', 'Hbb-bs', 'Hbb-bt'], read_args={}):
    """Write the log-transformed rows for ``genes`` to ``Hemoglobin/<dataset>.tsv``.

    Parameters
    ----------
    dataset : str
        Dataset name; also used as the output file stem.
    genes : list of str
        Row labels to export.
    read_args : dict
        Extra keyword arguments forwarded through ``read_hemoglobin_data``.

    Returns
    -------
    None
        The table is written as a tab-separated file; the output directory
        is created if it does not exist yet.
    """
    import os  # local import: keeps this block self-contained

    out_dir = 'Hemoglobin'
    table = read_hemoglobin_data(dataset, genes=genes, read_args=read_args)
    # Create the directory only once the data loaded successfully, so a
    # failed read leaves nothing behind; a fresh checkout then works without
    # a manual mkdir.
    os.makedirs(out_dir, exist_ok=True)
    table.to_csv('%s/%s.tsv' % (out_dir, dataset), sep='\t')

    return

def create_hemoglobin_datasets(genes=['Hba-a1', 'Hba-a2', 'Hbb-bs', 'Hbb-bt']):
    """Export one table per dataset by calling ``write_hemoglobin_data``.

    Parameters
    ----------
    genes : list of str
        Row labels exported for every dataset.

    Returns
    -------
    None
    """
    # (dataset name, read_args for that dataset), processed in this order.
    export_jobs = (
        ('Lab_OLM', {'celltypes': ['SST-OLM'], 'age': 25}),
        ('Lab_Pvalb', {'age': 25}),
        ('GSE109999', {}),
        ('Cadwell', {'age': 25}),
        ('GSE70844', {}),
        ('GSE99888', {'trim_type': 1}),
        ('GSE75386', {'drop': ['hippocampus']}),
        ('GSE119248', {'age': 25}),
        ('GSE60361', {'age': 25}),
        # Currently disabled datasets (kept for reference):
        # ('GSE115746', {'drop': ['Unknown', 'Low', 'Doublet', 'Batch', 'CR', 'High'],
        #                'trim_type': 1,
        #                'trim_by': ' '}),
        # ('Gouwens', {'drop': ['Unknown'], 'trim_type': 1, 'trim_by': ' '}),
    )
    for dataset_name, reader_options in export_jobs:
        write_hemoglobin_data(dataset_name, read_args=reader_options, genes=genes)

    return

In [4]:
# Export every dataset with an extended gene list: the four default genes
# plus Hp, Mb, Ngb, Cygb and Gh.
create_hemoglobin_datasets(
    genes=[
        'Hba-a1', 'Hba-a2', 'Hbb-bs', 'Hbb-bt',
        'Hp', 'Mb', 'Ngb', 'Cygb', 'Gh',
    ]
)