In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show what is present under the per-sample ATAC counts directory.
!ls ../../data/preprocessed/atac_counts_by_sample/

check_order	     macaque_origin_peaks   mouse_origin_peaks
human_orgin_peaks    marmoset_orgin_peaks   old_quant
human_origin_peaks   marmoset_origin_peaks  primate_conserved
macaque_orgin_peaks  mouse_orgin_peaks


In [3]:
# Cluster names, one per line in cluster_list.txt. Leading/trailing newlines are
# stripped before splitting so they do not produce empty entries at either end.
# The file is opened in a `with` block so the handle is closed deterministically.
with open('../../data/ref/cluster_list.txt', 'r') as cluster_file:
    clusts = cluster_file.read().strip('\n').split('\n')

In [4]:
# Species handled by this notebook. The same list drives both the "origin"
# loops and the per-species loops below, so its order determines column order
# in the ortholog tables.
species = [
    'human',
    'macaque',
    'marmoset',
    'mouse',
]

# wrangle blacklist

In [5]:
for specie in species:
    peaks = '../../data/ref/blacklist_regions/{}_atac_peaks_with_four_species_ortho.bed'.format(specie)
    names = '../../data/ref/blacklist_regions/{}_peaks_that_should_be_filtered_because_of_blacklist_overlap_in_any_species.txt'.format(specie)
    bed = '../../data/ref/blacklist_regions/beds/{}_blacklist_peaks.bed'.format(specie)
    out_list = '../../data/ref/blacklist_regions/lists/{}_blacklist_peaks.txt'.format(specie)
    
    !cat $peaks | grep -wf $names > $bed
    !cut -f 1-3 $bed | sed 's/\t/-/g'  >  $out_list

In [6]:
# Progress marker: printed once the blacklist cell above has finished.
print('done')

done


# get element lists

In [7]:
for origin in species:
    dir_out = '../../data/preprocessed/atac_counts_by_sample/{}_orgin_peaks/ortho_elements'.format(origin)
    !mkdir -p $dir_out
    paste_string = ''
    f_final = '../../data/preprocessed/atac_counts_by_sample/{}_orgin_peaks/ortho_elements/ortholog_regions.tsv'.format(origin)
    f_filtered = '../../data/preprocessed/atac_counts_by_sample/{}_orgin_peaks/ortho_elements/ortholog_regions_filtered.tsv'.format(origin)
    
    blacklist = '../../data/ref/blacklist_regions/lists/{}_blacklist_peaks.txt'.format(origin)
    
    for specie in species:
        print(specie, origin)
        f_in = '../../data/preprocessed/atac_counts_by_sample/{0}_orgin_peaks/{1}/{1}_reps_ASC_atac_raw_counts_{0}_ortho_peaks.tsv'.format(origin,
                                                                                                                                  specie)
        f_out = '../../data/preprocessed/atac_counts_by_sample/{}_orgin_peaks/ortho_elements/{}_elements.txt'.format(origin, specie)
        !tail -n+1 $f_in | cut -f1  > $f_out 
        paste_string = paste_string + f_out + ' '
    !echo -e "human\tmacaque\tmarmoset\tmouse" > $f_final
    !paste $paste_string >> $f_final
    !cat $f_final |  grep -vf $blacklist > $f_filtered
    !cp -t ../../data/ref/ $f_final
    !cp -t ../../data/ref/ $f_filtered
    for_ref = '{}_origin_ortholog_elements.tsv'.format(origin)
    filtered_ref = '{}_origin_ortholog_elements_blacklist_filtered.tsv'.format(origin)
    !mv ../../data/ref/ortholog_regions.tsv ../../data/ref/ortholog_elements/$for_ref
    !mv ../../data/ref/ortholog_regions_filtered.tsv ../../data/ref/ortholog_elements/$filtered_ref

human human
macaque human
marmoset human
mouse human
human macaque
macaque macaque
marmoset macaque
mouse macaque
human marmoset
macaque marmoset
marmoset marmoset
mouse marmoset
human mouse
macaque mouse
marmoset mouse
mouse mouse


# make peak matrices

In [8]:
def save_list(outfile, listlike):
    '''
    Write the items of an iterable to a file as a newline-separated list.

    Each item is converted with ``str.format`` so non-string items (ints,
    numpy scalars, ...) are accepted. Leading and trailing newlines of the
    joined text are stripped, so the file has no trailing newline.

    args:
        outfile : path of the file to write (overwritten if it exists)
        listlike : an iterable object such as a python list or a numpy array

    returns:
        the number of characters written
    '''
    text = ''.join(['{}\n'.format(item) for item in listlike]).strip('\n')
    # `with` guarantees the handle is flushed and closed before returning.
    with open(outfile, 'w') as handle:
        return handle.write(text)

In [9]:
def read_list(listfile):
    '''
    Read a newline-separated list such as the one written by save_list.

    Leading and trailing newlines are stripped before splitting, so a file
    ending in a newline does not yield a trailing empty entry. An empty file
    yields [''] (a single empty string).

    args:
        listfile: path to a text file containing a newline-separated list
    returns:
        a list of strings, one per line of the file
    '''
    # `with` closes the handle as soon as the content has been read.
    with open(listfile, 'r') as handle:
        return handle.read().strip('\n').split('\n')

In [10]:
def rename_index(df_, orthologs, specie, origin, blacklist=None):
    '''
    Re-order a count table to the ortholog table's row order and relabel it
    with the origin species' element names.

    args:
        df_ : DataFrame indexed by the element names of `specie`
        orthologs : table with one column of element names per species
        specie : column of `orthologs` used to select/re-order rows of `df_`
        origin : column of `orthologs` whose values become the new index
        blacklist : optional path to a newline-separated list of index
            labels to drop (read with read_list)
    returns:
        the re-indexed (and, if requested, blacklist-filtered) DataFrame
    '''
    relabeled = df_.reindex(orthologs[specie])
    # Rows are now in ortholog-table order, so labels can be swapped positionally.
    relabeled.index = orthologs[origin]
    if not blacklist:
        return relabeled
    banned = read_list(blacklist)
    keep_mask = ~relabeled.index.isin(banned)
    return relabeled.loc[keep_mask]

In [11]:
# Path template for a raw-count table. It is filled below as
# f_format.format(specie, cluster, origin), i.e. {0} = species, {1} = cluster,
# {2} = origin species. The "orgin" spelling is part of the on-disk directory
# names (see the `ls` output near the top of the notebook) and must be kept.
f_format = '../../data/preprocessed/atac_counts_by_sample/{2}_orgin_peaks/{0}/{0}_reps_{1}_atac_raw_counts_{2}_ortho_peaks.tsv'

In [12]:
# Path template for a blacklist peak list (same path the blacklist-wrangling
# cell writes to); the single placeholder is filled with the origin species.
blacklist_peaks = '../../data/ref/blacklist_regions/lists/{}_blacklist_peaks.txt'

In [13]:
for origin in species:
    element_list = pd.read_csv(
    '../../data/preprocessed/atac_counts_by_sample/{}_orgin_peaks/ortho_elements/ortholog_regions_filtered.tsv'.format(origin),
                           sep='\t')
    blacklist = blacklist_peaks.format(origin)
    for cluster in clusts: 
        sample_order = []
        dfs = []
        for specie in species:
            df = pd.read_csv(f_format.format(specie, cluster, origin),
                             header = None,
                             sep = '\t').set_index(0)
            sample_order += [specie for _ in range(df.shape[1])]
            dfs.append(rename_index(df, element_list, specie,
                                    origin=origin,
                                    blacklist = blacklist))
        origin_dir = '../../data/processed/atac_for_edger/{}_origin/'.format(origin)
        !mkdir -p {origin_dir}
        out = '../../data/processed/atac_for_edger/{}_origin/raw_counts_{}.tsv'.format(origin, cluster)
        clust_df = pd.concat(dfs, axis=1)
        clust_df.to_csv(out, header=None, sep='\t')      
        !mkdir -p {origin_dir}/conditions
        save_list('../../data/processed/atac_for_edger/{}_origin/conditions/sample_order_{}.txt'.format(origin, cluster), sample_order)

In [14]:
# Progress marker: printed once the matrix-building cell above has finished.
print('done')

done


# make origin-based peak matrices

In [15]:
# Re-assignment of f_format with the same value as the earlier definition in
# this notebook. Filled as f_format.format(specie, cluster, origin):
# {0} = species, {1} = cluster, {2} = origin species.
f_format = '../../data/preprocessed/atac_counts_by_sample/{2}_orgin_peaks/{0}/{0}_reps_{1}_atac_raw_counts_{2}_ortho_peaks.tsv'

In [16]:
# Re-assignment of blacklist_peaks with the same value as the earlier
# definition in this notebook; the placeholder is the origin species.
blacklist_peaks = '../../data/ref/blacklist_regions/lists/{}_blacklist_peaks.txt'

In [17]:
def rename_index(df_, orthologs, specie, origin, blacklist=None):
    '''
    Align a count table to the ortholog table and label rows by origin names.

    NOTE: this repeats, unchanged in behaviour, the definition of
    rename_index given earlier in this notebook.

    args:
        df_ : DataFrame indexed by the element names of `specie`
        orthologs : table with one column of element names per species
        specie : column of `orthologs` giving the row order to select from `df_`
        origin : column of `orthologs` supplying the replacement index
        blacklist : optional path to a newline-separated list of index
            labels to remove (read with read_list)
    returns:
        the aligned DataFrame, with blacklisted rows removed when a
        blacklist path is given
    '''
    aligned = df_.reindex(orthologs[specie])
    aligned.index = orthologs[origin]
    if blacklist:
        excluded = read_list(blacklist)
        is_excluded = aligned.index.isin(excluded)
        aligned = aligned.loc[~is_excluded]
    return aligned

In [20]:
for origin in species:
    element_list = pd.read_csv(
    '../../data/preprocessed/atac_counts_by_sample/{}_orgin_peaks/ortho_elements/ortholog_regions_filtered.tsv'.format(origin),
                           sep='\t')
    blacklist = blacklist_peaks.format(origin)
    for cluster in clusts: 
        sample_order = []
        dfs = []
        # for specie in species:
        df = pd.read_csv(f_format.format(specie, cluster, origin),
                         header = None,
                         sep = '\t').set_index(0)
        sample_order += [specie for _ in range(df.shape[1])]
        dfs.append(df)
                # rename_index(df, element_list, specie,
                #                     origin=origin,
                #                     blacklist = blacklist))
        origin_dir = '../../data/processed/atac_for_gls/full_species_peaks/'
        !mkdir -p {origin_dir}
        out = '../../data/processed/atac_for_gls/full_species_peaks/raw_counts_{}.tsv'.format(origin, cluster)
        clust_df = pd.concat(dfs, axis=1)
        clust_df.to_csv(out, header=None, sep='\t')      
        # !mkdir -p {origin_dir}/conditions
        # save_list('../../data/processed/atac_for_edger/{}_origin/conditions/sample_order_{}.txt'.format(origin, cluster), sample_order)

In [21]:
# Progress marker: printed once the origin-based matrix cell above has finished.
print('done')

done
