In [2]:
import pandas as pd
import pdb
import numpy as np
import itertools
from snakemake.io import expand
import yaml
import os

from utils import *

In [33]:
# Load the pipeline configuration; later cells index into `config`
# (e.g. config['merge']['bam']) to obtain file-name patterns.
config_file = 'config.yml'
with open(config_file, 'r') as cfg_handle:
    config = yaml.safe_load(cfg_handle)

## 231019 pipeline restructuring -- talon input

In [71]:
# utilities related to defining snakemake rules
from snakemake.io import expand

def subset_df_on_wcs(wc, df):
    """
    Return a copy of the input metadata df limited to the rows that
    satisfy every wildcard in wc.

    Parameters:
        wc: Mapping-like object exposing .items(); each key is a column
            name of df and each value is either a single required value
            or a collection (list, tuple, set, frozenset) of allowed values
        df (pandas DataFrame): Metadata table to filter; it is not modified

    Returns:
        temp (pandas DataFrame): Independent deep copy of the matching
            rows, keeping their original index labels. If wc is empty
            this is a copy of the whole table.
    """
    temp = df
    for key, item in wc.items():
        # any plain collection means "one of these values"; a scalar
        # (including a str) means "exactly this value"
        if isinstance(item, (list, tuple, set, frozenset)):
            keep = temp[key].isin(item)
        else:
            keep = temp[key] == item
        temp = temp.loc[keep]

    # copy once, after filtering, so the caller gets a frame it can
    # freely add columns to without touching (or warning about) df
    return temp.copy(deep=True)

def get_df_col(wc, df, col):
    """
    From the metadata dataframe df, get the entries that satisfy
    the wildcards requirements and return the corresponding value
    from col. Ensure that this is always a 1:1 relationship, otherwise
    throw an error.

    Parameters:
        wc: Mapping-like object exposing .items(); column name -> required
            value (or list of allowed values), as used by subset_df_on_wcs
        df (pandas DataFrame): Metadata table
        col (str): Name of the column whose value is returned

    Returns:
        val: The single value of col shared by all matching rows

    Raises:
        ValueError: If the wildcards match no rows, or match rows with
            more than one distinct combination of col and wildcard columns
    """
    # skip col if it is also a wildcard key; a repeated label would make
    # temp[col] a DataFrame instead of a Series further down
    cols = [col] + [key for key, _ in wc.items() if key != col]

    temp = subset_df_on_wcs(wc, df)
    temp = temp[cols].drop_duplicates()

    if len(temp.index) != 1:
        msg = 'Issues getting data from DF with wildcards'
        for key, item in wc.items():
            msg+=f'\n{key}: {item}'
        raise ValueError(msg)

    val = temp[col].tolist()[0]
    return val

def get_cfg_entries(wc, df, cfg_entry, return_df=False):
    """
    Expand a config entry based on the wildcards and the
    values in df that satisfy these wildcards

    Parameters:
        wc: Mapping-like object exposing .items(); column name -> required
            value (or list of allowed values), as used by subset_df_on_wcs
        df (pandas DataFrame): Metadata table; must contain the columns
            study, genotype, sex, age, tissue, biorep_num and flowcell
        cfg_entry (str): File-name pattern passed to snakemake's expand
        return_df (bool): Return DataFrame with 'file' column
            as opposed to list of files. Default: False

    Returns:
        list of str or pandas DataFrame: The unique expanded file names,
            or (if return_df) the matching metadata rows with a 'file'
            column, keeping the first row for each unique file
    """
    temp = subset_df_on_wcs(wc, df)

    # columns handed to expand as wildcard values, one entry per row
    wildcard_cols = ['study', 'genotype', 'sex',
                     'age', 'tissue', 'biorep_num',
                     'flowcell']
    wildcard_values = {c: temp[c].tolist() for c in wildcard_cols}

    # zip keeps the values row-aligned instead of taking their product
    files = expand(cfg_entry, zip, **wildcard_values)

    # assign builds a new frame rather than writing into a filtered one,
    # which avoids pandas' SettingWithCopyWarning
    temp = temp.assign(file=files)

    # make sure we only take unique ones
    temp = temp.drop_duplicates(subset='file', keep='first')

    if return_df:
        return temp
    return temp['file'].tolist()

In [73]:
# Collect the merged BAMs for every biorep sharing one
# study / genotype / sex / age / tissue combination.
wc = dict(study='ad003',
          genotype='5xFADHEMI',
          sex='F',
          age=['4_months', '10_days'],
          tissue='HC')

cfg_entry = config['merge']['bam']

# first as a plain list of file names ...
files = get_cfg_entries(wc, df, cfg_entry)
files
# ... then as the metadata rows carrying a 'file' column (displayed below)
temp = get_cfg_entries(wc, df, cfg_entry, return_df=True)
temp

Unnamed: 0,batch,platform,fname,basename,path,flowcell,file_stem,chop_num,sample_temp,mouse_id,study,genotype,sex,age,tissue,sample,biorep_num,dataset,file
0,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11616_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11616_lig-blk_1,1.0,ad003_11616,11616,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,1,5xFADHEMI_F_4_months_HC_1,data/flowcell_merge/5xFADHEMI_F_4_months_HC_1.bam
2,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11617_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11617_lig-blk_1,1.0,ad003_11617,11617,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,2,5xFADHEMI_F_4_months_HC_2,data/flowcell_merge/5xFADHEMI_F_4_months_HC_2.bam
4,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11625_lig-blk_1_t2.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11625_lig-blk_1,2.0,ad003_11625,11625,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,3,5xFADHEMI_F_4_months_HC_3,data/flowcell_merge/5xFADHEMI_F_4_months_HC_3.bam


In [74]:
# narrow the expanded table to the identifying columns plus the file name
summary_cols = ['sample', 'dataset', 'platform', 'file']
temp[summary_cols]

Unnamed: 0,sample,dataset,platform,file
0,5xFADHEMI_F_4_months_HC,5xFADHEMI_F_4_months_HC_1,ONT,data/flowcell_merge/5xFADHEMI_F_4_months_HC_1.bam
2,5xFADHEMI_F_4_months_HC,5xFADHEMI_F_4_months_HC_2,ONT,data/flowcell_merge/5xFADHEMI_F_4_months_HC_2.bam
4,5xFADHEMI_F_4_months_HC,5xFADHEMI_F_4_months_HC_3,ONT,data/flowcell_merge/5xFADHEMI_F_4_months_HC_3.bam


In [51]:
# Display the metadata table. `df` is created by the parse_config_file(...)
# call in the "231017 pipeline restructuring" section further down, so that
# cell has to be executed before this one.
df

Unnamed: 0,batch,platform,fname,basename,path,flowcell,file_stem,chop_num,sample_temp,mouse_id,study,genotype,sex,age,tissue,sample,biorep_num,dataset
0,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11616_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11616_lig-blk_1,1.0,ad003_11616,11616,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,1,5xFADHEMI_F_4_months_HC_1
1,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11616_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11616_lig-blk_2,1.0,ad003_11616,11616,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,1,5xFADHEMI_F_4_months_HC_1
2,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11617_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11617_lig-blk_1,1.0,ad003_11617,11617,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,2,5xFADHEMI_F_4_months_HC_2
3,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11617_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11617_lig-blk_2,1.0,ad003_11617,11617,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,2,5xFADHEMI_F_4_months_HC_2
4,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11625_lig-blk_1_t2.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11625_lig-blk_1,2.0,ad003_11625,11625,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,3,5xFADHEMI_F_4_months_HC_3
5,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11625_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11625_lig-blk_2,1.0,ad003_11625,11625,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,3,5xFADHEMI_F_4_months_HC_3
6,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11627_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11627_lig-blk_1,1.0,ad003_11627,11627,ad003,5xFADWT,F,4_months,HC,5xFADWT_F_4_months_HC,1,5xFADWT_F_4_months_HC_1
7,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11627_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11627_lig-blk_2,1.0,ad003_11627,11627,ad003,5xFADWT,F,4_months,HC,5xFADWT_F_4_months_HC,1,5xFADWT_F_4_months_HC_1
8,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11628_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11628_lig-blk_1,1.0,ad003_11628,11628,ad003,5xFADWT,F,4_months,HC,5xFADWT_F_4_months_HC,2,5xFADWT_F_4_months_HC_2
9,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11628_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11628_lig-blk_2,1.0,ad003_11628,11628,ad003,5xFADWT,F,4_months,HC,5xFADWT_F_4_months_HC,2,5xFADWT_F_4_months_HC_2


## 231017 pipeline restructuring

In [4]:
# `configfile:` is Snakemake DSL; in a plain Python cell it parses as a
# bare variable annotation and loads nothing, so name the path explicitly.
config_file = 'config.yml'

# inputs for building the metadata table
config_tsv = '231018_config.tsv'
meta_tsv = 'mouse_metadata.tsv'
auto_dedupe = True

# parse_config_file is not defined in this notebook -- presumably it
# comes from `from utils import *`
df = parse_config_file(config_tsv,
                       meta_tsv,
                       auto_dedupe=auto_dedupe)


In [13]:
# wildcards narrowing the metadata down to one biorep on one flowcell
wc = dict(
    genotype='5xFADHEMI',
    sex='F',
    age=['4_months', '10_days'],
    tissue='HC',
    biorep_num='1',
    flowcell='1',
)

In [41]:
# Expand the talon_label sorted-BAM pattern for every flowcell of one biorep.
cfg_entry = config['talon_label']['sort_bam']
wc = {'genotype': '5xFADHEMI',
      'sex': 'F',
      'age': ['4_months', '10_days'],
      'tissue': 'HC',
      'biorep_num': '1'}

# metadata rows matching the wildcards; one list entry per row below
temp = subset_df_on_wcs(wc, df)
study = temp.study.tolist()
genotype = temp.genotype.tolist()
sex = temp.sex.tolist()
age = temp.age.tolist()
tissue = temp.tissue.tolist()
biorep_num = temp.biorep_num.tolist()
flowcell = temp.flowcell.tolist()

# zip keeps the values row-aligned instead of taking their product
files = expand(cfg_entry,
               zip,
               study=study,
               genotype=genotype,
               sex=sex,
               age=age,
               tissue=tissue,
               biorep_num=biorep_num,
               flowcell=flowcell)

# `return` is a SyntaxError outside a function; end the cell on the value
# so the notebook displays it instead
files

['data/talon_label/5xFADHEMI_F_4_months_HC_1_1_labeled_sorted.bam',
 'data/talon_label/5xFADHEMI_F_4_months_HC_1_2_labeled_sorted.bam']

In [40]:
# preview the first rows of the parsed metadata table
df.head()

Unnamed: 0,batch,platform,fname,basename,path,flowcell,file_stem,chop_num,sample_temp,mouse_id,study,genotype,sex,age,tissue,sample,biorep_num,dataset
0,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11616_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11616_lig-blk_1,1.0,ad003_11616,11616,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,1,5xFADHEMI_F_4_months_HC_1
1,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11616_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11616_lig-blk_2,1.0,ad003_11616,11616,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,1,5xFADHEMI_F_4_months_HC_1
2,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11617_lig-blk_1_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11617_lig-blk_1,1.0,ad003_11617,11617,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,2,5xFADHEMI_F_4_months_HC_2
3,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11617_lig-blk_2_t1.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,2,ad003_11617_lig-blk_2,1.0,ad003_11617,11617,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,2,5xFADHEMI_F_4_months_HC_2
4,230516,ONT,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,ad003_11625_lig-blk_1_t2.fastq,/share/crsp/lab/seyedam/share/ad_nanopore_tmp/...,1,ad003_11625_lig-blk_1,2.0,ad003_11625,11625,ad003,5xFADHEMI,F,4_months,HC,5xFADHEMI_F_4_months_HC,3,5xFADHEMI_F_4_months_HC_3


In [30]:
# Look up the single fastq path ('fname') for the rows selected by `wc`.
# get_df_col raises ValueError unless the wildcards resolve to exactly one
# distinct value, so `wc` must be specific enough to pin down one file.
get_df_col(wc, df, 'fname')

'/share/crsp/lab/seyedam/share/ad_nanopore_tmp/AD003/ad003_trimfastq/ad003_11616_lig-blk_1_t1.fastq'

In [25]:
# replicates in modelad consist of 
# * genotype
# * sex
# * age
# * tissue