In [6]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from utils import *
from plotting import *

from sm_utils import *

In [7]:
config_file = 'config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

## 231109 read lens

In [8]:
species=['human', 'mouse']
meta_df = get_meta_df(config, species)

In [9]:
meta_df.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,document_urls,document_labels,platform,RIN,spikeins,species,age,sex,genotype,matching_human_samples
0,ENCSR989ZYL,a673_1_1,a673,A673,a673,bone,,cell_line,#de3700,,...,https://www.encodeproject.org/documents/6d583a...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.8,True,human,,,,
1,ENCSR989ZYL,a673_1_2,a673,A673,a673,bone,,cell_line,#de3700,,...,https://www.encodeproject.org/documents/6d583a...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.8,True,human,,,,
2,ENCSR081NRO,adrenal_gland_1_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,https://www.encodeproject.org/documents/54012f...,'non-size selected cDNA libraries for use in P...,Pacific Biosciences Sequel,9.8,False,human,,,,
3,ENCSR563RLX,adrenal_gland_2_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,,True,human,,,,
4,ENCSR995WKW,adrenal_gland_3_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,https://www.encodeproject.org/documents/54012f...,'non-size selected cDNA libraries for use in P...,Pacific Biosciences Sequel,9.4,False,human,,,,


In [19]:
def get_col_from_meta_df(wc, col):
    temp = meta_df.copy(deep=True)
    if 'species' in wc.keys():
        temp = meta_df.loc[meta_df.species == wc['species']]
    return temp[col].tolist()

In [20]:
wc = {'species': 'human'}

In [24]:
expand(config['lr']['bam'],
                      species='human',
                      dataset=get_col_from_meta_df(wc, col='dataset'))

['data/human/lr/tc/a673_1_1.bam',
 'data/human/lr/tc/a673_1_2.bam',
 'data/human/lr/tc/adrenal_gland_1_1.bam',
 'data/human/lr/tc/adrenal_gland_2_1.bam',
 'data/human/lr/tc/adrenal_gland_3_1.bam',
 'data/human/lr/tc/aorta_1_1.bam',
 'data/human/lr/tc/aorta_2_1.bam',
 'data/human/lr/tc/brodmann_area_46_1_1.bam',
 'data/human/lr/tc/brodmann_area_46_2_1.bam',
 'data/human/lr/tc/brodmann_area_46_3_1.bam',
 'data/human/lr/tc/brodmann_area_46_4_1.bam',
 'data/human/lr/tc/brodmann_area_46_5_1.bam',
 'data/human/lr/tc/brodmann_area_46_6_1.bam',
 'data/human/lr/tc/brodmann_area_46_7_1.bam',
 'data/human/lr/tc/brodmann_area_46_8_1.bam',
 'data/human/lr/tc/brodmann_area_46_9_1.bam',
 'data/human/lr/tc/caco2_1_1.bam',
 'data/human/lr/tc/caco2_1_2.bam',
 'data/human/lr/tc/calu3_1_1.bam',
 'data/human/lr/tc/calu3_1_2.bam',
 'data/human/lr/tc/cardiac_septum_1_1.bam',
 'data/human/lr/tc/gm12878_1_1.bam',
 'data/human/lr/tc/gm12878_1_2.bam',
 'data/human/lr/tc/gm12878_1_3.bam',
 'data/human/lr/tc/gm128

## 231010 fixe get talon run info

In [3]:
wildcards = {'species':'human', 'talon_run':'1'}
df = process_lr_metadata(config['lr']['meta'], ['human', 'mouse'], datasets_per_talon_run=8)

In [4]:
df.talon_run.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        0])

In [5]:
df.max_talon_run.unique()

array([17, 15])

In [6]:
df.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,spikeins,species,age,sex,genotype,matching_human_samples,n_datasets,mod_num,talon_run,max_talon_run
0,ENCSR989ZYL,a673_1_1,a673,A673,a673,bone,,cell_line,#de3700,,...,True,human,,,,,138,18.0,1,17
1,ENCSR989ZYL,a673_1_2,a673,A673,a673,bone,,cell_line,#de3700,,...,True,human,,,,,138,18.0,2,17
2,ENCSR081NRO,adrenal_gland_1_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,False,human,,,,,138,18.0,3,17
3,ENCSR563RLX,adrenal_gland_2_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,True,human,,,,,138,18.0,4,17
4,ENCSR995WKW,adrenal_gland_3_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,False,human,,,,,138,18.0,5,17


In [7]:
def get_talon_run_info(wc, df, cfg_entry, dataframe=False):
    """
    Get all files for a talon run

    Parameters:
        dataframe (bool): False if it should return just the
            list of input files for this TALON run,
            True if it should return the whole DF
    """
    temp = df.copy(deep=True)
    temp = temp.loc[(temp.species==wc['species'])&\
                  (temp.talon_run==int(wc['talon_run']))]
    datasets = temp.dataset.tolist()
    species = temp.species.tolist()
    files = expand(cfg_entry,
                   zip,
                   dataset=datasets,
                   species=species)
    temp['talon_file'] = files
    if not dataframe:
        return files
    else:
        return temp

In [8]:

cfg_df =get_talon_run_info(wildcards, df,
                                    config['lr']['talon']['bam_sort'],
                                    dataframe=True)
cfg_df = cfg_df[['dataset', 'sample', 'platform', 'talon_file']]
# cfg_df.to_csv(output.config, header=None, sep=',', index=False)

In [9]:
cfg_df

Unnamed: 0,dataset,sample,platform,talon_file
0,a673_1_1,a673,Pacific Biosciences Sequel II,data/human/lr/talon/label/a673_1_1_sorted.bam
18,calu3_1_1,calu3,Pacific Biosciences Sequel II,data/human/lr/talon/label/calu3_1_1_sorted.bam
36,h9_chondro_1_1,h9_chondro,Pacific Biosciences Sequel II,data/human/lr/talon/label/h9_chondro_1_1_sorte...
54,heart_right_ventricle_1_1,heart,Pacific Biosciences Sequel II,data/human/lr/talon/label/heart_right_ventricl...
72,hl60_m1_72hr_1_2,hl60_m1_72hr,Pacific Biosciences Sequel II,data/human/lr/talon/label/hl60_m1_72hr_1_2_sor...
90,left_cardiac_atrium_1_1,heart,Pacific Biosciences Sequel II,data/human/lr/talon/label/left_cardiac_atrium_...
108,ovary_2_1,ovary,Pacific Biosciences Sequel II,data/human/lr/talon/label/ovary_2_1_sorted.bam
126,right_cardiac_atrium_1_1,heart,Pacific Biosciences Sequel II,data/human/lr/talon/label/right_cardiac_atrium...


In [12]:
cfg_df.talon_run_num.unique()

AttributeError: 'DataFrame' object has no attribute 'talon_run_num'

## 231010 process metadata

In [4]:
df = process_lr_metadata(config['lr']['meta'], ['human', 'mouse'], datasets_per_talon_run=8)

In [23]:
datasets_per_talon_run=8

In [24]:
# get number to mod each talon run by
# df['n_datasets'] = df[['dataset', 'species']].groupby('species').count()[['dataset']].transform('
# df['mod_num'] = np.ceil(df.n_datasets/datasets_per_talon_run)
# df.head()

df['n_datasets'] = df[['dataset', 'species']].groupby('species')[['dataset']].transform('count')
df['mod_num'] = np.ceil(df.n_datasets/datasets_per_talon_run)
df.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,RIN,spikeins,species,age,sex,genotype,matching_human_samples,talon_run,n_datasets,mod_num
0,ENCSR989ZYL,a673_1_1,a673,A673,a673,bone,,cell_line,#de3700,,...,9.8,True,human,,,,,1,138,18.0
1,ENCSR989ZYL,a673_1_2,a673,A673,a673,bone,,cell_line,#de3700,,...,9.8,True,human,,,,,2,138,18.0
2,ENCSR081NRO,adrenal_gland_1_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,9.8,False,human,,,,,3,138,18.0
3,ENCSR563RLX,adrenal_gland_2_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,,True,human,,,,,4,138,18.0
4,ENCSR995WKW,adrenal_gland_3_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,9.4,False,human,,,,,5,138,18.0


In [27]:
df['talon_run'] = (df.sort_values(['species', 'dataset'],
                                ascending=[True, True])\
                                .groupby('species')\
                                .cumcount() + 1).to_numpy()\
                                % df.mod_num.to_numpy()

In [28]:
df

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,RIN,spikeins,species,age,sex,genotype,matching_human_samples,talon_run,n_datasets,mod_num
0,ENCSR989ZYL,a673_1_1,a673,A673,a673,bone,,cell_line,#de3700,,...,9.8,True,human,,,,,1.0,138,18.0
1,ENCSR989ZYL,a673_1_2,a673,A673,a673,bone,,cell_line,#de3700,,...,9.8,True,human,,,,,2.0,138,18.0
2,ENCSR081NRO,adrenal_gland_1_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,9.8,False,human,,,,,3.0,138,18.0
3,ENCSR563RLX,adrenal_gland_2_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,,True,human,,,,,4.0,138,18.0
4,ENCSR995WKW,adrenal_gland_3_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,9.4,False,human,,,,,5.0,138,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122,ENCSR909NPF,hippocampus_2mo_f_1,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,9.0,True,mouse,2mo,f,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']",11.0,127,16.0
123,ENCSR909NPF,hippocampus_2mo_f_2,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,9.0,True,mouse,2mo,f,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']",12.0,127,16.0
124,ENCSR972DID,hippocampus_2mo_m_1,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,9.0,True,mouse,2mo,m,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']",13.0,127,16.0
125,ENCSR972DID,hippocampus_2mo_m_2,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,9.0,True,mouse,2mo,m,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']",14.0,127,16.0


In [29]:
df[['dataset', 'species', 'talon_run']].groupby(['species', 'talon_run']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,dataset
species,talon_run,Unnamed: 2_level_1
human,0.0,7
human,1.0,8
human,2.0,8
human,3.0,8
human,4.0,8
human,5.0,8
human,6.0,8
human,7.0,8
human,8.0,8
human,9.0,8


In [32]:
df[['dataset', 'species', 'talon_run']].groupby(['species', 'talon_run']).count().reset_index().groupby(['species']).sum()

Unnamed: 0_level_0,talon_run,dataset
species,Unnamed: 1_level_1,Unnamed: 2_level_1
human,153.0,138
mouse,120.0,127


In [8]:
datasets_per_talon_run = 7
138/8
138/18

7.666666666666667

In [31]:
# split up by species
# temp2 = pd.DataFrame()
# for i,s in enumerate(df.species.unique().tolist()):
#     temp = df.loc[df.species==s].copy(deep=True)
#     temp = temp.sort_values(by='dataset', ascending=True)
temp = df.copy(deep=True)
temp['talon_run_num'] = ((temp.sort_values(['species', 'dataset'], 
                                    ascending=[True, True])\
                                    .groupby('species')\
                                    .cumcount() + 1)\
                                    %datasets_per_talon_run).tolist()

# temp['talon_run_num'] = temp['num']%datasets_per_talon_run

In [30]:
temp.num.unique()

array([1, 2, 3, 4, 5, 6, 0])

In [None]:
df['flowcell'] = df.sort_values(['genotype', 'mouse_id'],
                                ascending=[True, True])\
                                .groupby(['mouse_id']) \
                                .cumcount() + 1

In [14]:
# species=['human', 'mouse']

# # lr stuff
# for i,s in enumerate(species):
#     f = expand(config['lr']['meta'], species=s)[0]
#     if i == 0:
#         lr_df = pd.read_csv(f, sep='\t')
#         lr_df['species'] = s
#     else:
#         temp = pd.read_csv(f, sep='\t')
#         temp['species'] = s
#         lr_df = pd.concat([lr_df, temp], axis=0)
        
# # for i, f in enumerate(expand(config['lr']['meta'], species=species)):
# #     if i == 0:
# #         lr_df = pd.read_csv(f, sep='\t')
# lr_meta = process_encode_metadata(expand(config['lr']['encode_meta'], species=species)[0])


In [4]:
df.species.unique()

array(['human', 'mouse'], dtype=object)

In [6]:
df.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,document_urls,document_labels,platform,RIN,spikeins,species,age,sex,genotype,matching_human_samples
0,ENCSR989ZYL,a673_1_1,a673,A673,a673,bone,,cell_line,#de3700,,...,https://www.encodeproject.org/documents/6d583a...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.8,True,human,,,,
1,ENCSR989ZYL,a673_1_2,a673,A673,a673,bone,,cell_line,#de3700,,...,https://www.encodeproject.org/documents/6d583a...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.8,True,human,,,,
2,ENCSR081NRO,adrenal_gland_1_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,https://www.encodeproject.org/documents/54012f...,'non-size selected cDNA libraries for use in P...,Pacific Biosciences Sequel,9.8,False,human,,,,
3,ENCSR563RLX,adrenal_gland_2_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,,True,human,,,,
4,ENCSR995WKW,adrenal_gland_3_1,adrenal_gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal_gland,adrenal_10d,adrenal_14d,adrenal_...",...,https://www.encodeproject.org/documents/54012f...,'non-size selected cDNA libraries for use in P...,Pacific Biosciences Sequel,9.4,False,human,,,,


In [7]:
df.tail()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,document_urls,document_labels,platform,RIN,spikeins,species,age,sex,genotype,matching_human_samples
122,ENCSR909NPF,hippocampus_2mo_f_1,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,https://www.encodeproject.org/documents/bf543f...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.0,True,mouse,2mo,f,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']"
123,ENCSR909NPF,hippocampus_2mo_f_2,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,https://www.encodeproject.org/documents/bf543f...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.0,True,mouse,2mo,f,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']"
124,ENCSR972DID,hippocampus_2mo_m_1,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,https://www.encodeproject.org/documents/bf543f...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.0,True,mouse,2mo,m,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']"
125,ENCSR972DID,hippocampus_2mo_m_2,hippocampus_2mo,Hippocampus PNM2,hippocampus,hippocampus,,tissue,#c96579,,...,https://www.encodeproject.org/documents/bf543f...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.0,True,mouse,2mo,m,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']"
126,ENCSR038FOD,hippocampus_10d_f_1,hippocampus_10d,Hippocampus,hippocampus,hippocampus,,tissue,,,...,https://www.encodeproject.org/documents/bf543f...,'ENCODE Long Read RNA-Seq Analysis Protocol fo...,Pacific Biosciences Sequel II,9.0,True,mouse,10d,f,b6/cast,"['brain', 'pgp1_excite_neuron', 'pgp1_astro']"


In [18]:
len(lr_df.loc[lr_df.species=='mouse'])

126