In [2]:
from snakemake.io import expand
import pandas as pd
import yaml
import pdb
import re
import numpy as np
import pyranges as pr

In [3]:
# Parse the YAML project config; later code indexes it as config['data'][...].
config_file = 'config.yml'
with open(config_file) as f:
    # safe_load only builds plain Python objects from the YAML
    config = yaml.safe_load(f)

## 230616 getting expression values

In [14]:
def get_ab_from_gff(gff_file, ofile):
    """
    Extract one row of flrpm / rpm values per transcript from a GFF and
    write it out as a tab-separated table.

    Parameters:
        gff_file (str): Path to a GFF whose records carry the
            transcript_id, flrpm and rpm attributes
        ofile (str): Path of the tab-separated file to write
            (columns: transcript_id, flrpm, rpm; no index column)

    Raises:
        AssertionError: If, after dropping exact duplicate rows, any
            transcript_id is still present on more than one row
    """
    keep_cols = ['transcript_id', 'flrpm', 'rpm']
    ab_df = pr.read_gff(gff_file).as_df()[keep_cols].drop_duplicates()

    # each transcript must be left with a single (flrpm, rpm) pair
    assert not ab_df.transcript_id.duplicated(keep=False).any()

    ab_df.to_csv(ofile, sep='\t', index=False)


In [15]:
# Run get_ab_from_gff on one GFF and write the resulting TSV into the same directory.
gff_file = 'data/human/Brain_Embryo_ont_pre-capture_human_all.gff'
ofile = 'data/human/Brain_Embryo_ont_pre-capture_human_all_abundance.tsv'
get_ab_from_gff(gff_file, ofile)

In [7]:
# Load the same GFF into a DataFrame to walk through the steps of get_ab_from_gff by hand.
df = pr.read_gff('data/human/Brain_Embryo_ont_pre-capture_human_all.gff').as_df()

In [9]:
# Keep only the three columns that get_ab_from_gff exports (rebinds df).
df = df[['transcript_id', 'flrpm', 'rpm']]

In [11]:
# Row count before and after dropping exact duplicate rows
# (the saved output shows 213704 -> 51638).
print(len(df.index))
df = df.drop_duplicates()
print(len(df.index))


213704
51638


In [12]:
# Show every row whose transcript_id occurs more than once; the saved output is empty.
df.loc[df.transcript_id.duplicated(keep=False)].sort_values(by='transcript_id')

Unnamed: 0,transcript_id,flrpm,rpm


In [13]:
# Preview the first rows of df.
df.head()

Unnamed: 0,transcript_id,flrpm,rpm
0,ont-Crg-CapTrap_HpreCap_0+_EmbBrain01Rep1.NAM_...,25.271982,25.271982
3,ont-Crg-CapTrap_HpreCap_0+_EmbBrain01Rep1.NAM_...,16.847988,33.695976
8,ont-Crg-CapTrap_HpreCap_0+_EmbBrain01Rep1.NAM_...,33.695976,623.375556
9,ont-Crg-CapTrap_HpreCap_0+_EmbBrain01Rep1.NAM_...,16.847988,210.59985
10,ont-Crg-CapTrap_HpreCap_0+_EmbBrain01Rep1.NAM_...,16.847988,75.815946


## 230615 input parsing

In [47]:
def parse_input_config(config, config_file, subset):
    """
    Build the table of input files for one subset.

    Reads the tab-separated file list, derives the dataset ID and the
    subset from each file name, attaches the per-species metadata and a
    composite `temp` label, and keeps only the rows of the requested
    subset. Prints the number of distinct `temp` labels that remain.

    Parameters:
        config (dict): Parsed config; config['data']['meta'] must be a
            path pattern containing a {species} wildcard that resolves
            to a tab-separated metadata file with a `dataset` column
            plus tissue, age, platform and capture columns
        config_file (str): Path to a tab-separated file with (at least)
            an `fname` column
        subset (str): Subset label to keep; file names are tagged as
            'all' or 'supported'

    Returns:
        df (pandas DataFrame): Rows of the file list belonging to
            `subset`, with the added `dataset`, `subset`, metadata and
            `temp` columns
    """
    df = pd.read_csv(config_file, sep='\t')

    # dataset ID = everything before the first '.' of the file name
    df['dataset'] = df.fname.str.split('.', n=1, expand=True)[0]

    # start from an object column: writing strings into a float (NaN)
    # column is deprecated in recent pandas
    df['subset'] = pd.Series(np.nan, index=df.index, dtype=object)

    # regex=False -> the dots are matched literally instead of as "any
    # character"; na=False -> missing file names simply stay untagged.
    # 'supported' is assigned last so it wins if both suffixes match.
    is_all = df.fname.str.contains('all.gff.gz', regex=False, na=False)
    is_supported = df.fname.str.contains('cagePolyASupported.gff.gz',
                                         regex=False, na=False)
    df.loc[is_all, 'subset'] = 'all'
    df.loc[is_supported, 'subset'] = 'supported'

    # add the metadata; collect the per-species tables and concatenate
    # once rather than growing a DataFrame from an empty one
    meta_frames = []
    for s in ('human', 'mouse'):
        meta_file = expand(config['data']['meta'], species=s)[0]
        meta_frames.append(pd.read_csv(meta_file, sep='\t').assign(species=s))
    meta_df = pd.concat(meta_frames, axis=0)

    df = df.merge(meta_df, on='dataset', how='left')

    # composite label; a missing value in any component makes it NaN
    df['temp'] = (df['tissue'] + '_' +
                  df['age'] + '_' +
                  df['platform'] + '_' +
                  df['capture'] + '_' +
                  df['species'] + '_' +
                  df['subset'])

    df = df.loc[df.subset == subset]
    print(len(df.temp.unique()))
    return df


In [48]:
# Build the input table for the 'all' subset; the call prints the number of unique `temp` labels.
df = parse_input_config(config, '230614_config.tsv', 'all')


88


Unnamed: 0,fname,path,dataset,subset,tissue,age,platform,capture,rep1,rep2,species,temp
1,SIDMWbEPP0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEPP0101,all,ESC,Embryo,pacBioSII,pre-capture,1,1,mouse,ESC_Embryo_pacBioSII_pre-capture_mouse_all
3,SIDMWbEPC0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEPC0101,all,ESC,Embryo,pacBioSII,post-capture,1,1,mouse,ESC_Embryo_pacBioSII_post-capture_mouse_all
5,SIDMWbEOP0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEOP0101,all,ESC,Embryo,ont,pre-capture,1,1,mouse,ESC_Embryo_ont_pre-capture_mouse_all
7,SIDMWbEOC0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEOC0101,all,ESC,Embryo,ont,post-capture,1,1,mouse,ESC_Embryo_ont_post-capture_mouse_all
9,SIDMWbAPP0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbAPP0101,all,WBlood,Adult,pacBioSII,pre-capture,1,1,mouse,WBlood_Adult_pacBioSII_pre-capture_mouse_all


In [42]:
# Step-by-step version of parse_input_config without the final subset filter.
df = pd.read_csv('230614_config.tsv', sep='\t')

# dataset ID = file name up to the first '.'; it has to be stored as
# `dataset` because that is the key of the metadata merge below
df['dataset'] = df.fname.str.split('.', n=1, expand=True)[0]

# object column up front: writing strings into a float (NaN) column is
# deprecated in recent pandas
df['subset'] = pd.Series(np.nan, index=df.index, dtype=object)
# regex=False -> match the dots literally
df.loc[df.fname.str.contains('all.gff.gz', regex=False), 'subset'] = 'all'
df.loc[df.fname.str.contains('cagePolyASupported.gff.gz', regex=False), 'subset'] = 'supported'

# add the metadata (one table per species, concatenated once)
species = ['human', 'mouse']
meta_frames = []
for s in species:
    temp = pd.read_csv(expand(config['data']['meta'], species=s)[0],
                       sep='\t')
    temp['species'] = s
    meta_frames.append(temp)
meta_df = pd.concat(meta_frames, axis=0)

df = df.merge(meta_df, on='dataset', how='left')
print(len(df.index))
print(len(df.dataset.unique()))

# composite label goes into its own `temp` column so that the `dataset`
# ID column is preserved
df['temp'] = (df['tissue'] + '_' +
              df['age'] + '_' +
              df['platform'] + '_' +
              df['capture'] + '_' +
              df['species'] + '_' +
              df['subset'])
print(len(df.temp.unique()))
print(df.head())


176
88
176
                                               fname  \
0  SIDMWbEPP0101.splicing_status-all.endSupport-c...   
1  SIDMWbEPP0101.splicing_status-all.endSupport-a...   
2  SIDMWbEPC0101.splicing_status-all.endSupport-c...   
3  SIDMWbEPC0101.splicing_status-all.endSupport-a...   
4  SIDMWbEOP0101.splicing_status-all.endSupport-c...   

                                                path        dataset  \
0  https://public-docs.crg.es/rguigo/Data/gkaur/L...  SIDMWbEPP0101   
1  https://public-docs.crg.es/rguigo/Data/gkaur/L...  SIDMWbEPP0101   
2  https://public-docs.crg.es/rguigo/Data/gkaur/L...  SIDMWbEPC0101   
3  https://public-docs.crg.es/rguigo/Data/gkaur/L...  SIDMWbEPC0101   
4  https://public-docs.crg.es/rguigo/Data/gkaur/L...  SIDMWbEOP0101   

      subset tissue     age   platform       capture  rep1  rep2 species  \
0  supported    ESC  Embryo  pacBioSII   pre-capture     1     1   mouse   
1        all    ESC  Embryo  pacBioSII   pre-capture     1     1   mouse 

In [28]:
# Concatenate path and fname into a single `link` column (no separator is inserted).
df['link'] = df.path+df.fname

In [29]:
# Plain Python lists of the dataset and species columns, in row order.
# NOTE: this rebinds `species`, which other cells use for ['human', 'mouse'].
datasets = df.dataset.tolist()
species = df.species.tolist()

Unnamed: 0,fname,path,dataset,subset,tissue,age,platform,capture,rep1,rep2,species,temp,link
0,SIDMWbEPP0101.splicing_status-all.endSupport-c...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEPP0101,supported,ESC,Embryo,pacBioSII,pre-capture,1,1,mouse,ESC_Embryo_pacBioSII_pre-capture_mouse,https://public-docs.crg.es/rguigo/Data/gkaur/L...
1,SIDMWbEPP0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEPP0101,all,ESC,Embryo,pacBioSII,pre-capture,1,1,mouse,ESC_Embryo_pacBioSII_pre-capture_mouse,https://public-docs.crg.es/rguigo/Data/gkaur/L...
2,SIDMWbEPC0101.splicing_status-all.endSupport-c...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEPC0101,supported,ESC,Embryo,pacBioSII,post-capture,1,1,mouse,ESC_Embryo_pacBioSII_post-capture_mouse,https://public-docs.crg.es/rguigo/Data/gkaur/L...
3,SIDMWbEPC0101.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEPC0101,all,ESC,Embryo,pacBioSII,post-capture,1,1,mouse,ESC_Embryo_pacBioSII_post-capture_mouse,https://public-docs.crg.es/rguigo/Data/gkaur/L...
4,SIDMWbEOP0101.splicing_status-all.endSupport-c...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDMWbEOP0101,supported,ESC,Embryo,ont,pre-capture,1,1,mouse,ESC_Embryo_ont_pre-capture_mouse,https://public-docs.crg.es/rguigo/Data/gkaur/L...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,SIDHBrAPC0301.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDHBrAPC0301,all,Brain,Adult,pacBioSII,post-capture,3,1,human,Brain_Adult_pacBioSII_post-capture_human,https://public-docs.crg.es/rguigo/Data/gkaur/L...
172,SIDHBrAOP0301.splicing_status-all.endSupport-c...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDHBrAOP0301,supported,Brain,Adult,ont,pre-capture,3,1,human,Brain_Adult_ont_pre-capture_human,https://public-docs.crg.es/rguigo/Data/gkaur/L...
173,SIDHBrAOP0301.splicing_status-all.endSupport-a...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDHBrAOP0301,all,Brain,Adult,ont,pre-capture,3,1,human,Brain_Adult_ont_pre-capture_human,https://public-docs.crg.es/rguigo/Data/gkaur/L...
174,SIDHBrAOC0301.splicing_status-all.endSupport-c...,https://public-docs.crg.es/rguigo/Data/gkaur/L...,SIDHBrAOC0301,supported,Brain,Adult,ont,post-capture,3,1,human,Brain_Adult_ont_post-capture_human,https://public-docs.crg.es/rguigo/Data/gkaur/L...


In [2]:
# Parse the YAML project config; the cells below index it as config['data'][...].
config_file = 'config.yml'
with open(config_file) as f:
    # safe_load only builds plain Python objects from the YAML
    config = yaml.safe_load(f)

In [3]:
# Show the path template stored under data -> cerberus -> agg_ends.
config['data']['cerberus']['agg_ends']

'data/{species}/bepis_{end_type}_agg.bed'

In [7]:
# Fill the {species} and {end_type} wildcards of the agg_ends path template.
expand(config['data']['cerberus']['agg_ends'],
       zip,
       species='human',
       end_type='tes',
       allow_missing=True)

['data/human/bepis_tes_agg.bed']