In [26]:
# Conversion jobs keyed by a short dataset label. Every entry is a set of
# keyword arguments for convert(): the three input CSV paths plus the .hyper
# extract to write. Entries that also carry 'sample_row_key' /
# 'sample_col_key' override convert()'s defaults for how the reads matrix
# labels its rows and columns.
datasets = {
    'hmtg': {
        'reads_file': 'human/mtg/human_MTG_2018-06-14_intron-matrix.csv',
        'genes_file': 'human/mtg/human_MTG_2018-06-14_genes-rows.csv',
        'samples_file': 'human/mtg/human_MTG_2018-06-14_samples-columns.csv',
        'output_file': 'extracts/hmtg_intron_reads.hyper',
    },
    'hacc': {
        'reads_file': 'human/acc/human_ACC_2018-10-04_intron-matrix.csv',
        'genes_file': 'human/acc/human_ACC_2018-10-04_genes-rows.csv',
        'samples_file': 'human/acc/human_ACC_2018-10-04_samples-columns.csv',
        'output_file': 'extracts/hacc_intron_reads.hyper',
        'sample_row_key': 'gene',
        'sample_col_key': 'seq_name',
    },
    'hlgn': {
        'reads_file': 'human/lgn/human_LGN_2018-06-14_intron-matrix.csv',
        'genes_file': 'human/lgn/human_LGN_2018-06-14_genes-rows.csv',
        'samples_file': 'human/lgn/human_LGN_2018-06-14_samples-columns.csv',
        'output_file': 'extracts/hlgn_intron_reads.hyper',
    },
    'hv1': {
        'reads_file': 'human/visp/human_VISp_2018-10-04_intron-matrix.csv',
        'genes_file': 'human/visp/human_VISp_2018-10-04_genes-rows.csv',
        'samples_file': 'human/visp/human_VISp_2018-10-04_samples-columns.csv',
        'output_file': 'extracts/hv1_intron_reads.hyper',
        'sample_row_key': 'gene',
        'sample_col_key': 'seq_name',
    },
    'mv1': {
        'reads_file': 'mouse/v1/mouse_VISp_2018-06-14_intron-matrix.csv',
        'genes_file': 'mouse/v1/mouse_VISp_2018-06-14_genes-rows.csv',
        'samples_file': 'mouse/v1/mouse_VISp_2018-06-14_samples-columns.csv',
        'output_file': 'extracts/mv1_intron_reads.hyper',
    },
    'mmop': {
        'reads_file': 'mouse/mop/mouse_MOp_nuclei_2018-10-04_intron-matrix.csv',
        'genes_file': 'mouse/mop/mouse_MOp_nuclei_2018-10-04_genes-rows.csv',
        'samples_file': 'mouse/mop/mouse_MOp_nuclei_2018-10-04_samples-columns.csv',
        'output_file': 'extracts/mmop_intron_reads.hyper',
        'sample_row_key': 'gene',
        'sample_col_key': 'seq_name',
    },
    'mlgd': {
        'reads_file': 'mouse/lgd/mouse_LGd_2018-06-14_intron-matrix.csv',
        'genes_file': 'mouse/lgd/mouse_LGd_2018-06-14_genes-rows.csv',
        'samples_file': 'mouse/lgd/mouse_LGd_2018-06-14_samples-columns.csv',
        'output_file': 'extracts/mlgd_intron_reads.hyper',
    },
    'malm': {
        'reads_file': 'mouse/alm/mouse_ALM_2018-06-14_intron-matrix.csv',
        'genes_file': 'mouse/alm/mouse_ALM_2018-06-14_genes-rows.csv',
        'samples_file': 'mouse/alm/mouse_ALM_2018-06-14_samples-columns.csv',
        'output_file': 'extracts/malm_intron_reads.hyper',
    },
    'maca': {
        'reads_file': 'mouse/aca/mouse_ACA_2018-10-04_intron-matrix.csv',
        'genes_file': 'mouse/aca/mouse_ACA_2018-10-04_genes-rows.csv',
        'samples_file': 'mouse/aca/mouse_ACA_2018-10-04_samples-columns.csv',
        'output_file': 'extracts/maca_intron_reads.hyper',
        'sample_row_key': 'gene',
        'sample_col_key': 'seq_name',
    },
}

In [17]:
import pandas as pd
from pandleau import *

def convert(samples_file, genes_file, reads_file, output_file, sample_row_key='entrez_id', sample_col_key='sample_name'):
    """Reshape a wide reads-matrix CSV into long form and write it as a Tableau extract.

    The reads CSV is expected to have one unnamed leading column of row labels
    (pandas names it 'Unnamed: 0') followed by one column per sample. The
    result has the columns ``entrez_id``, ``sample_id`` and ``reads``, keeps
    only rows whose ``reads`` value is greater than zero, and is handed to
    ``pandleau(...).to_tableau(output_file, add_index=False)``.

    Parameters
    ----------
    samples_file : str
        CSV (read as ISO-8859-1) with a ``sample_id`` column and, unless
        ``sample_col_key == 'sample_id'``, a ``sample_col_key`` column.
    genes_file : str
        CSV with an ``entrez_id`` column and, unless
        ``sample_row_key == 'entrez_id'``, a ``sample_row_key`` column.
    reads_file : str
        CSV holding the wide reads matrix.
    output_file : str
        Path of the extract to write.
    sample_row_key : str, default 'entrez_id'
        Column of ``genes_file`` whose values label the rows of the reads
        matrix. Anything other than 'entrez_id' is translated to entrez ids.
    sample_col_key : str, default 'sample_name'
        Column of ``samples_file`` whose values label the columns of the reads
        matrix. Anything other than 'sample_id' is translated to sample ids.

    Raises
    ------
    KeyError
        If a required lookup column is missing from the samples or genes CSV.
    ValueError
        If a column label of the reads matrix cannot be cast to int64, e.g.
        because it had no match in the samples lookup.
    """
    samples = pd.read_csv(samples_file, encoding="ISO-8859-1")
    genes = pd.read_csv(genes_file)
    reads = pd.read_csv(reads_file)

    if sample_col_key != 'sample_id':
        # Pair the two columns directly instead of iterating rows: same
        # mapping (later duplicates win), no per-row Series construction.
        col_to_sample_id = dict(zip(samples[sample_col_key], samples['sample_id']))
        reads = reads.rename(columns=col_to_sample_id)

    # The unnamed leading CSV column carries the row labels.
    reads = reads.rename(columns={'Unnamed: 0': 'entrez_id'})

    if sample_row_key != 'entrez_id':
        row_to_entrez_id = dict(zip(genes[sample_row_key], genes['entrez_id']))
        # NOTE(review): Series.map leaves NaN for labels missing from the
        # genes lookup; such rows are kept, as before -- confirm this is wanted.
        reads = reads.assign(entrez_id=reads['entrez_id'].map(row_to_entrez_id))

    long_reads = reads.melt(id_vars=['entrez_id'], var_name='sample_id', value_name='reads')

    # Filter first, then cast through .assign so the cast lands on a fresh
    # frame rather than on a filtered slice (avoids SettingWithCopyWarning).
    nonzero_reads = long_reads[long_reads['reads'] > 0].assign(
        sample_id=lambda frame: frame['sample_id'].astype('int64')
    )

    pandleau(nonzero_reads).to_tableau(output_file, add_index=False)


In [27]:
import os 
# Build every extract that is not on disk yet. A dataset whose output file
# already exists is skipped, so re-running this cell only does missing work.
for label, job in datasets.items():
    if os.path.exists(job['output_file']):
        continue
    print(label)  # announce the dataset before its conversion starts
    convert(**job)

maca
Table 'Extract' does not exist in extract extracts/maca_intron_reads.hyper, creating.


processing table: 19107066it [03:16, 97152.22it/s] 
