In [7]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# p = os.path.dirname(os.path.dirname(os.getcwd()))
# sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [8]:
# Load the Snakemake YAML config into `config`. Later cells read path
# templates from its 'data' and 'ref' entries.
# NOTE(review): the path is relative, so this only resolves when the kernel's
# working directory is the one containing `figures/` — confirm.
config_file = 'figures/snakemake/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [66]:
# Human file paths. Each one is the first result of expanding a config
# template with species='human'. Most are prefixed with '../'; the library
# metadata table (lib_meta) is prefixed with 'figures/' instead.
ab = '../'+expand(config['data']['ab'], species='human')[0]
filt_ab = '../'+expand(config['data']['filt_ab'], species='human')[0]
read_annot = '../'+expand(config['data']['read_annot'], species='human')[0]
t_metadata = '../'+expand(config['ref']['cerberus']['t_info'], species='human')[0]
lib_meta = 'figures/'+expand(config['data']['meta'], species='human')[0]
swan_file = '../'+expand(config['data']['sg'], species='human')[0]
cerberus_h5 = '../'+expand(config['data']['cerb_annot'], species='human')[0]
cerb_t_metadata = '../'+expand(config['data']['t_info'], species='human')[0]
# major_isos additionally fills the template's obs_col wildcard with 'sample'.
major_isos = '../'+expand(config['data']['major_isos'], species='human', obs_col='sample')[0]
pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]
ref_t_metadata = '../'+expand(config['ref']['t_info'], species='human')[0]
ref_g_metadata = '../'+expand(config['ref']['g_info'], species='human')[0]


# Constants; none of these are referenced in the cells visible here.
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# Mouse library metadata table, from the same config entry as lib_meta.
m_lib_meta = 'figures/'+expand(config['data']['meta'], species='mouse')[0]


In [71]:
species = ['human', 'mouse']
def get_meta_df():
    """
    Build one library metadata table covering every entry in `species`.

    Reads the tab-separated metadata file for each species (paths come from
    expanding config['data']['meta'] and are prefixed with 'figures/'), tags
    each table with a `species` column, and stacks them row-wise.

    Returns:
        pandas.DataFrame: concatenation of the per-species tables. Row
            indices are kept as read from each file (not reset). An empty
            DataFrame is returned when there is nothing to read.
    """
    # Assumes expand yields one path per species, in the same order as
    # `species`, so the two can be paired with zip — TODO confirm.
    meta_files = expand(config['data']['meta'], species=species)

    # Collect the tables and concatenate once. Concatenating onto an empty
    # seed frame inside the loop re-copies all rows each iteration and
    # triggers pandas' deprecation warning for empty entries in concat.
    frames = []
    for f, s in zip(meta_files, species):
        temp = pd.read_csv('figures/'+f, sep='\t')
        temp['species'] = s
        frames.append(temp)

    # NOTE(review): a check that `dataset` names are unique across species
    # was sketched here but left disabled. If it is re-enabled, `duplicated`
    # must be called: `meta_df.dataset.duplicated().any()`.

    # pd.concat raises on an empty list, so keep the empty-frame result.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)
meta_df = get_meta_df()

In [73]:
# Row count of `meta_df`. By execution count this ran after the cell that
# builds the combined human + mouse table with get_meta_df().
len(meta_df)

264

In [62]:
# Long-format table of ENCODE file IDs; the next cell uses its `name`,
# `file` and `output_type` columns.
df = pd.read_csv('figures/ref/lr_file_ids.tsv', sep='\t')

In [63]:
# Reshape the long file-ID table to wide form: one row per `name`, one
# column per `output_type`, cells holding the `file` ID.
# NOTE: this rebinds `df`, so the cell is not idempotent. Re-running it
# without re-reading the TSV fails because `name` has become the index.
df = df[['name', 'file', 'output_type']].pivot(
    index='name',
    columns='output_type',
    values='file',
)


In [64]:
# Preview the pivoted table: index is `name`, columns are the output types.
df.head()

output_type,alignments,reads,unfiltered alignments
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENCSR026WRM_1_1,ENCFF753VCG,ENCFF402OZL,ENCFF655BWX
ENCSR026WRM_2_1,ENCFF587QKV,ENCFF182ZKL,ENCFF574LQO
ENCSR044ARQ_1_1,ENCFF292UIE,ENCFF688QGB,ENCFF193ZRQ
ENCSR044ARQ_2_1,ENCFF738RAA,ENCFF272VSN,ENCFF455IAW
ENCSR056MYH_1_1,ENCFF319JFG,ENCFF026VEI,ENCFF858LGV


In [57]:
# Attach the per-output-type ENCODE file IDs to the human library metadata.
# `df` is expected to be the wide file-ID table (columns 'alignments',
# 'reads', 'unfiltered alignments'). Rows are matched by comparing the
# metadata's ENCODE_file_id with the 'alignments' ID, and the left join keeps
# every metadata row even when no match is found.
meta_df = pd.read_csv(lib_meta, sep='\t')
meta_df = meta_df.merge(df, how='left', left_on='ENCODE_file_id', right_on='alignments')

In [58]:
# Drop the single ENCODE_file_id column and give the three merged ID columns
# explicit ENCODE_* names.
meta_df = (
    meta_df
    .drop(columns=['ENCODE_file_id'])
    .rename(columns={
        'alignments': 'ENCODE_alignments_id',
        'unfiltered alignments': 'ENCODE_unfiltered_alignments_id',
        'reads': 'ENCODE_reads_id',
    })
)

In [60]:
# WARNING: this overwrites the same file (`lib_meta`) that the metadata was
# read from. Once written, the file no longer has ENCODE_file_id, so the
# read/merge cell above cannot be re-run against it.
meta_df.to_csv(lib_meta, sep='\t', index=False)

In [67]:
# Mouse counterpart of the human metadata rewrite: swap the single
# ENCODE_file_id column for the three per-output-type ID columns and write
# the table back to `m_lib_meta`.
#
# The cell overwrites its own input file, so it is guarded. If the file has
# already been converted (ENCODE_file_id gone, ENCODE_alignments_id present),
# re-running is a no-op instead of a KeyError inside merge.
meta_df = pd.read_csv(m_lib_meta, sep='\t')
if 'ENCODE_file_id' in meta_df.columns:
    meta_df = (
        meta_df
        # `df` is expected to be the wide file-ID table; the left join keeps
        # every metadata row even without a matching 'alignments' ID.
        .merge(df, how='left', left_on='ENCODE_file_id', right_on='alignments')
        .drop(['ENCODE_file_id'], axis=1)
        .rename({'alignments': 'ENCODE_alignments_id',
                 'unfiltered alignments': 'ENCODE_unfiltered_alignments_id',
                 'reads': 'ENCODE_reads_id'}, axis=1)
    )
    meta_df.to_csv(m_lib_meta, sep='\t', index=False)
elif 'ENCODE_alignments_id' not in meta_df.columns:
    # Neither the original nor the converted layout: fail loudly with the
    # same exception type the merge would have raised.
    raise KeyError('ENCODE_file_id')

In [68]:
# Preview the rewritten mouse metadata (three ENCODE_*_id columns).
# NOTE(review): the displayed table includes an 'Unnamed: 0' column, which
# suggests the TSV on disk carries a stray saved index — confirm.
meta_df.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,age,adult,sex,genotype,tissue_or_cell_line,sample_color_hex_code,matching_human_samples,reads_post_talon,ENCODE_alignments_id,ENCODE_reads_id,ENCODE_unfiltered_alignments_id
0,ENCSR885NRP,adrenal_18-20mo_m_1,adrenal_18-20mo,Adrenal gland PNM18-20,adrenal gland,adrenal gland,18-20mo,True,m,b6/cast,tissue,#b24422,['adrenal gland'],679622,ENCFF684SLZ,ENCFF811RRD,ENCFF167IJD
1,ENCSR964OKW,gastroc_4d_m_1,gastroc_4d,Gastrocnemius PND4,muscle,muscle,4d,True,m,b6/cast,tissue,#e2e5ef,['muscle'],1066347,ENCFF643TDK,ENCFF798HCT,ENCFF541JWV
2,ENCSR964OKW,gastroc_4d_m_2,gastroc_4d,Gastrocnemius PND4,muscle,muscle,4d,True,m,b6/cast,tissue,#e2e5ef,['muscle'],1100253,ENCFF821NRP,ENCFF069POP,ENCFF362LGG
3,ENCSR410XGE,heart_18-20mo_m_1,heart_18-20mo,Heart PNM18-20,heart,heart,18-20mo,True,m,b6/cast,tissue,#7caf5c,['heart'],699529,ENCFF800GKP,ENCFF177KNR,ENCFF458JBD
4,ENCSR410XGE,heart_18-20mo_m_2,heart_18-20mo,Heart PNM18-20,heart,heart,18-20mo,True,m,b6/cast,tissue,#7caf5c,['heart'],1019974,ENCFF510KLD,ENCFF468BPP,ENCFF107ZLE


In [None]:
## Check that the file IDs returned for each requested file format are of the correct output type

In [22]:
def get_df_lr_ids(file_id_df, file_format, species):
    """
    List the file IDs for one species and one file format.

    Parameters:
        file_id_df (pandas.DataFrame): table with `file`, `output_type`
            and `species` columns. It is not modified.
        file_format (str): 'bam' selects output type 'unfiltered alignments',
            'label_bam' selects 'alignments', 'fastq' selects 'reads'.
            Any other value applies no output-type filter.
        species (str): value to match in the `species` column.

    Returns:
        list: values of the `file` column for the matching rows, in the
            table's original row order.
    """
    output_type_by_format = {
        'bam': 'unfiltered alignments',
        'label_bam': 'alignments',
        'fastq': 'reads',
    }

    keep = file_id_df.species == species
    wanted_type = output_type_by_format.get(file_format)
    if wanted_type is not None:
        # Unrecognised formats skip this filter, so every output type of the
        # requested species is returned.
        keep &= file_id_df.output_type == wanted_type

    return file_id_df.loc[keep].file.tolist()

In [23]:
# Long-format ENCODE file-ID table; the next cell groups it by its
# `output_type` and `species` columns.
file_id_df = pd.read_csv('figures/ref/lr_file_ids.tsv', sep='\t')

In [24]:
# Count file IDs per (output_type, species) pair as a completeness check.
(
    file_id_df
    .groupby(['output_type', 'species'])[['file']]
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,file
output_type,species,Unnamed: 2_level_1
alignments,human,138
alignments,mouse,126
reads,human,138
reads,mouse,126
unfiltered alignments,human,138
unfiltered alignments,mouse,126


In [4]:
# 230722 test get_lr_ids

In [5]:
def get_lr_ids(file_format='bam', species='human'):
    """
    List the file IDs of one file format for a species.

    The table returned by get_lr_exp_meta(species) is filtered on its
    `output_type` column and the `file` column is returned.

    Parameters:
        file_format (str): 'bam' selects output type 'unfiltered alignments',
            'label_bam' selects 'alignments', 'fastq' selects 'reads'.
            Any other value applies no output-type filter.
        species (str): passed through to get_lr_exp_meta.

    Returns:
        list: values of the `file` column for the selected rows.
    """
    output_type_by_format = {
        'bam': 'unfiltered alignments',
        'label_bam': 'alignments',
        'fastq': 'reads',
    }

    lr_meta = get_lr_exp_meta(species)
    wanted_type = output_type_by_format.get(file_format)
    if wanted_type is not None:
        lr_meta = lr_meta.loc[lr_meta.output_type == wanted_type]

    return lr_meta.file.tolist()

In [6]:
# Spot-check: file IDs of the human 'alignments' output type ('label_bam').
get_lr_ids('label_bam', 'human')

['ENCFF158KCA',
 'ENCFF147OYL',
 'ENCFF772MSZ',
 'ENCFF117DUA',
 'ENCFF648NAR',
 'ENCFF225CCJ',
 'ENCFF016SHE',
 'ENCFF479EHE',
 'ENCFF914XOH',
 'ENCFF745DHX',
 'ENCFF846YHI',
 'ENCFF049QGQ',
 'ENCFF243PFI',
 'ENCFF936VUF',
 'ENCFF939EUU',
 'ENCFF545PJV',
 'ENCFF504GVG',
 'ENCFF479SQR',
 'ENCFF400LRT',
 'ENCFF989FKA',
 'ENCFF596ODX',
 'ENCFF192PJS',
 'ENCFF694CBG',
 'ENCFF538BNH',
 'ENCFF814ABW',
 'ENCFF168AZV',
 'ENCFF362CPC',
 'ENCFF940WVU',
 'ENCFF973OML',
 'ENCFF058HQU',
 'ENCFF292UIE',
 'ENCFF738RAA',
 'ENCFF925MYC',
 'ENCFF558QPF',
 'ENCFF118JEI',
 'ENCFF322UJU',
 'ENCFF044LIA',
 'ENCFF219UJG',
 'ENCFF509GHY',
 'ENCFF803KIA',
 'ENCFF213XDA',
 'ENCFF281VKZ',
 'ENCFF250BDM',
 'ENCFF810FRP',
 'ENCFF600MGT',
 'ENCFF661OEY',
 'ENCFF645UVN',
 'ENCFF100RGC',
 'ENCFF927MKK',
 'ENCFF470UHX',
 'ENCFF054YYA',
 'ENCFF319JFG',
 'ENCFF901XCR',
 'ENCFF319FBW',
 'ENCFF992ZVE',
 'ENCFF373TKM',
 'ENCFF985LGZ',
 'ENCFF388HXU',
 'ENCFF291EKY',
 'ENCFF791WUV',
 'ENCFF193WEX',
 'ENCFF626QRV',
 'ENCFF2