In [1]:
import pandas as pd
import numpy as np

In [2]:
# Cluster names, one per line in the reference list file.
# A context manager is used so the file handle is closed deterministically
# instead of being left for the garbage collector.
with open('../../data/ref/cluster_list.txt', 'r') as cluster_file:
    clusts = cluster_file.read().strip('\n').split('\n')

In [3]:
# Species handled by this notebook; each one is used both as the "origin"
# of a peak set and as the species whose counts are read.
species = [
    'human',
    'macaque',
    'marmoset',
    'mouse',
]

# Goals
- Compute log2(CPM + 1) values for the ATAC counts.
- Give orthologous elements uniform names across species.

# Raw count matrices

In [4]:
def read_list(listfile):
    '''
    Read a newline-separated list from a text file (the format the original
    docstring attributes to save_list).

    Leading and trailing newlines are stripped before splitting; blank lines
    in the middle of the file are kept as empty strings.

    args:
        listfile: path to a text file containing a newline-separated list
    returns:
        a list of strings, one per line; an empty list when the file holds
        no entries (is empty or contains only newlines)
    '''
    # Context manager guarantees the handle is closed even if reading fails.
    with open(listfile, 'r') as handle:
        contents = handle.read().strip('\n')
    # ''.split('\n') would yield [''], i.e. one bogus empty entry.
    if not contents:
        return []
    return contents.split('\n')

In [5]:
def rename_index(df_,
                 orthologs,
                 specie,
                 origin,
                 blacklist=None):
    '''
    Align a table to the ortholog list and relabel its rows with the
    origin species' element names.

    args:
        df_: DataFrame indexed by the element names of `specie`
        orthologs: DataFrame with one column of element names per species;
            row i of every column refers to the same orthologous element
        specie: column of `orthologs` whose names match the index of `df_`
        origin: column of `orthologs` whose names become the new index
        blacklist: optional; either a path to a newline-separated file of
            element names (read with read_list) or an already-loaded
            collection of element names. Rows whose new index is listed
            are dropped. Passing a loaded collection avoids re-reading the
            file on every call.
    returns:
        a new DataFrame with one row per entry of orthologs[specie] (NaN
        where `df_` had no such row), indexed by orthologs[origin], minus
        any blacklisted rows. `df_` itself is not modified.
    '''
    import os  # local import: only needed to recognise path-like blacklists

    df_use = df_.reindex(orthologs[specie])
    # Relabel with the origin species' names so every species shares one set
    # of row labels (original note: human locations enable GREAT analysis,
    # LDSC, etc.).
    df_use.index = orthologs[origin]

    if blacklist is None:
        return df_use

    if isinstance(blacklist, (str, os.PathLike)):
        # An empty string means "no blacklist", as in the original truthiness test.
        blacklisted = read_list(blacklist) if blacklist else []
    else:
        blacklisted = list(blacklist)

    return df_use.loc[~df_use.index.isin(blacklisted)]

In [6]:
# Species name -> genome identifier string (presumably the assembly used for
# each species; not referenced by the other cells visible here).
spec_genome = dict([
    ('human', 'hg38'),
    ('macaque', 'rhemac10'),
    ('marmoset', 'caljac4'),
    ('mouse', 'mm10'),
])

In [10]:
# Path templates, filled with str.format.
#   {0} = origin species (whose peak set is used)
#   {1} = species whose counts are stored
#   {2} = cluster name (f_format only)

# Output: log2-transformed CPM table per (origin, species) pair.
cpm_out = ('../../data/processed/atac_for_gls/{0}_origin_peaks/'
           'log2cpm/{1}_clusters_orthologous_counts.txt')

# Output: raw per-cluster count table per (origin, species) pair.
raw_out = ('../../data/processed/atac_for_gls/{0}_origin_peaks/'
           'raw/{1}_clusters_orthologous_counts.txt')

# Input: per-sample raw counts for one cluster.
# NOTE(review): "orgin" is kept verbatim; it presumably matches the directory
# name on disk, so confirm before correcting the spelling.
f_format = ('../../data/preprocessed/atac_counts_by_sample/{0}_orgin_peaks/'
            '{1}/{1}_reps_{2}_atac_raw_counts_{0}_ortho_peaks.tsv')

In [11]:
# Path template for the blacklist file of one origin species;
# the single {} placeholder receives the origin species name.
blacklist_peaks = ('../../data/ref/blacklist_regions/lists/'
                   '{}_blacklist_peaks.txt')

In [13]:
# Build one raw count table (elements x clusters) for every
# (origin peak set, species) combination and write it to raw_out.
for origin in species:
    # Ortholog table for this origin: read as tab-separated, with the species
    # names used as column labels by rename_index below.
    ortho_elements = '../../data/ref/ortholog_elements/{}_origin_ortholog_elements.tsv'.format(origin)
    element_list = pd.read_csv(ortho_elements, sep='\t')
    blacklist = blacklist_peaks.format(origin)

    for specie in species:
        dfs = []
        for clust in clusts:
            # Headerless per-sample count file; column 0 holds the element names.
            counts_path = f_format.format(origin, specie, clust)
            df = pd.read_csv(counts_path, header=None, sep='\t')
            df = df.set_index(0)

            # Align to the ortholog list, relabel with origin names and
            # drop blacklisted elements.
            df_use = rename_index(
                df,
                orthologs=element_list,
                specie=specie,
                origin=origin,
                blacklist=blacklist,
            )

            # Collapse the remaining columns into one total per element,
            # stored as a Series named after the cluster.
            df_use = df_use.sum(axis=1).rename(clust)
            dfs.append(df_use)

        # One column per cluster.
        clust_df = pd.concat(dfs, axis=1)
        print(clust_df.shape)
        clust_df.to_csv(raw_out.format(origin, specie), sep='\t')

(204921, 21)
(204921, 21)
(204921, 21)
(204921, 21)
(185230, 21)
(185230, 21)
(185230, 21)
(185230, 21)
(158695, 21)
(158695, 21)
(158695, 21)
(158695, 21)
(191511, 21)
(191511, 21)
(191511, 21)
(191511, 21)


In [17]:
# Convert every raw count table into log2(CPM + 1) and write it to cpm_out.
for origin in species:
    for specie in species:
        raw_path = raw_out.format(origin, specie)
        # The raw tables carry the element names in a column named after the origin.
        df = pd.read_csv(raw_path, sep='\t').set_index(origin)

        # Total counts per cluster (column sums) act as the library size.
        counts_clust = df.sum(axis=0)

        # Counts per million, plus a pseudocount of 1, then log2.
        # (A fillna(0) step was previously left commented out here.)
        df = np.log2(df * 1e6 / counts_clust + 1)

        print(df.shape)
        df.to_csv(cpm_out.format(origin, specie), sep='\t')

(204921, 21)
(204921, 21)
(204921, 21)
(204921, 21)
(185230, 21)
(185230, 21)
(185230, 21)
(185230, 21)
(158695, 21)
(158695, 21)
(158695, 21)
(158695, 21)
(191511, 21)
(191511, 21)
(191511, 21)
(191511, 21)
