In [65]:
from snakemake.io import expand
import pandas as pd
import yaml
import pdb
import re

In [92]:
# Load the workflow settings from the YAML file into `config`.
config_file = 'config.yml'
with open(config_file) as config_fh:
    # safe_load only builds plain Python objects from the YAML document
    config = yaml.safe_load(config_fh)

In [94]:
def parse_config_file(fname, auto_dedupe=True):
    """
    Read a tab-separated sample sheet and derive per-file metadata columns.

    The flowcell, dataset, file stem and chop number are parsed out of the
    fastq file names, and file stems that occur more than once are either
    rejected or collapsed to the entry with the highest chop number.

    Parameters:
        fname (str or file-like): Tab-separated file passed to
            `pandas.read_csv`. It must provide a `fname` column (full fastq
            path) and a `basename` column (fastq file name).
        auto_dedupe (bool): If True, duplicated file stems are resolved by
            keeping the row with the highest chop number. If False, a
            duplicated file stem raises instead.

    Returns:
        pandas.DataFrame: The table with `flowcell`, `dataset`, `file_stem`
            and `chop_num` columns added. When rows were de-duplicated the
            frame is ordered by descending `chop_num`.

    Raises:
        ValueError: If a file stem is duplicated and `auto_dedupe` is False.
    """
    df = pd.read_csv(fname, sep='\t')

    # Raw strings so that \w, \d and \. reach the regex engine instead of
    # being parsed as (invalid) Python string escapes.
    # Optional "_t<N>" chop suffix followed by the fastq extension.
    fastq_tail = r'(?:_t\d+)?\.fastq(?:\.gz)?'

    # get flowcell: the last "_<digits>" field before the optional chop suffix
    exp = r'.*/[\w-]+_(\d+)' + fastq_tail
    df['flowcell'] = df.fname.str.extract(exp, expand=False)

    # get dataset: "<digits><letter A-H>" field following an underscore
    exp = r'.*/[\w-]+_(\d+[ABCDEFGH])[\w-]+\d+' + fastq_tail
    df['dataset'] = df.fname.str.extract(exp, expand=False)

    # check to make sure the same file stem isn't there more than once
    # (can happen if different flow cells needed different amounts of chopping)
    exp = r'.*/([\w-]+_\d+)' + fastq_tail
    df['file_stem'] = df.fname.str.extract(exp, expand=False)

    # Chop number is the <N> of a trailing "_t<N>" right before ".fastq".
    # Extracting it with a regex yields NaN for files without the suffix,
    # including the case where no file in the sheet has one (splitting on
    # '_t' and indexing the second piece fails there).
    df['chop_num'] = (
        df.basename.str.extract(r'_t(\d+)\.fastq', expand=False).astype(float)
    )

    if df.file_stem.duplicated().any():
        dupe_stems = df.loc[df.file_stem.duplicated(keep=False), 'basename'].tolist()
        if not auto_dedupe:
            raise ValueError(f'Files {dupe_stems} seem to be duplicated. Check config file.')
        else:
            print(f'Files {dupe_stems} seem to be duplicated. Automatically removing lower chop numbers')
            # Highest chop number first (NaN sorts last), so keep='first'
            # retains the most-chopped file for every stem.
            df = df.sort_values(by='chop_num', ascending=False)
            df = df.drop_duplicates(subset='file_stem', keep='first')

    return df

In [93]:
# Inline, step-by-step version of the parsing done in parse_config_file,
# run directly on the 230427 sample sheet.
df = pd.read_csv('230427_config.tsv', sep='\t')
# number of rows read, before any de-duplication
print(len(df.index))

auto_dedupe = True

# Patterns are raw strings so that \w, \d and \. reach the regex engine
# instead of being parsed as (invalid) Python string escapes.

# get flowcell
exp = r'.*/[\w-]+_(\d+)(?:_t\d+)?\.fastq(?:\.gz)?'
df['flowcell'] = df.fname.str.extract(exp, expand=False)

# get dataset
exp = r'.*/[\w-]+_(\d+[ABCDEFGH])[\w-]+\d+(?:_t\d+)?\.fastq(?:\.gz)?'
df['dataset'] = df.fname.str.extract(exp, expand=False)


# check to make sure the same file stem isn't there more than once
# (can happen if different flow cells needed different amounts of chopping)
exp = r'.*/([\w-]+_\d+)(?:_t\d+)?\.fastq(?:\.gz)?'
df['file_stem'] = df.fname.str.extract(exp, expand=False)
# Chop number is the <N> of a trailing "_t<N>" right before ".fastq"; the
# regex gives NaN for files without it, even when no file has the suffix.
df['chop_num'] = df.basename.str.extract(r'_t(\d+)\.fastq', expand=False).astype(float)
if df.file_stem.duplicated().any():
    dupe_stems = df.loc[df.file_stem.duplicated(keep=False), 'basename'].tolist()
    if not auto_dedupe:
        raise ValueError(f'Files {dupe_stems} seem to be duplicated. Check config file.')
    else:
        print(f'Files {dupe_stems} seem to be duplicated. Automatically removing lower chop numbers')
        # highest chop number first (NaN last), then keep the first row per stem
        df = df.sort_values(by='chop_num', ascending=False)
        df = df.drop_duplicates(subset='file_stem', keep='first')

20


In [24]:
# df = pd.read_csv('config.tsv', sep='\t')
df = pd.read_csv('230427_config.tsv', sep='\t')

# Columns that are compared against string values further down, so they are
# all stored as str.
cols = ['fname', 'sample', 'dataset', 'platform', 'flowcell']
df = df.astype({name: str for name in cols})
    
    
# subset the config df on dataset
def get_df_dataset(dataset, df):
    """
    Return the rows of `df` whose `dataset` column equals `dataset`.

    Parameters:
        dataset: Value to match against the `dataset` column.
        df (pandas.DataFrame): Table with a `dataset` column.

    Returns:
        pandas.DataFrame: The matching rows (empty if nothing matches).
    """
    in_dataset = df.dataset == dataset
    return df.loc[in_dataset]

# subset the config df on flowcell
def get_df_flowcell(flowcell, df):
    """
    Return the rows of `df` whose `flowcell` column equals `flowcell`.

    Parameters:
        flowcell: Value to match against the `flowcell` column.
        df (pandas.DataFrame): Table with a `flowcell` column.

    Returns:
        pandas.DataFrame: The matching rows (empty if nothing matches).
    """
    on_flowcell = df.flowcell == flowcell
    return df.loc[on_flowcell]

# get a 1:1 value for dataset:<col> from config
def get_df_dataset_val(wc, df, col):
    """
    Return the first `col` value among the rows belonging to `wc.dataset`.

    Parameters:
        wc: Object exposing a `dataset` attribute (presumably Snakemake
            wildcards; confirm against the calling rules).
        df (pandas.DataFrame): Config table with a `dataset` column.
        col (str): Name of the column to read.

    Returns:
        The value of `col` in the first matching row.
    """
    dataset_rows = get_df_dataset(wc.dataset, df)
    col_values = dataset_rows[col].values
    return col_values[0]

# get a 1:many value (ie dataset:flowcell, dataset:fname) from config
def get_df_dataset_col(wc, df, col):
    """
    Return every `col` value among the rows belonging to `wc.dataset`.

    Parameters:
        wc: Object exposing a `dataset` attribute (presumably Snakemake
            wildcards; confirm against the calling rules).
        df (pandas.DataFrame): Config table with a `dataset` column.
        col (str): Name of the column to read.

    Returns:
        list: The `col` values of all matching rows, in table order.
    """
    return get_df_dataset(wc.dataset, df)[col].tolist()

# get a 1:1 value from flowcell,dataset:<col>
def get_df_dataset_flowcell_col(dataset, flowcell, df, col):
    """
    Return the first `col` value for the given dataset / flowcell pair.

    Parameters:
        dataset: Value to match against the `dataset` column.
        flowcell: Value to match against the `flowcell` column.
        df (pandas.DataFrame): Config table with `dataset` and `flowcell`
            columns.
        col (str): Name of the column to read.

    Returns:
        The value of `col` in the first row matching both filters.
    """
    # narrow by dataset first, then by flowcell within that subset
    dataset_rows = get_df_dataset(dataset, df)
    flowcell_rows = get_df_flowcell(flowcell, dataset_rows)
    return flowcell_rows[col].values[0]

# def get_df_col(wc, df, col):
#     val = df.loc[df.dataset==wc.dataset, col].values[0]
#     return val
#
# def get_df_whole_col(wc, df, col):
#     temp = df.loc[df.dataset==wc.dataset]
#     if flowcell in wc.keys():
#         temp = df.loc[df.flowcell==wc.flowcell]
#     vals = temp[col].tolist()
#     return vals

def get_sublib_bc_files(wc, df, config):
    """
    Build the demux barcode file paths for every flowcell of `wc.dataset`.

    Parameters:
        wc: Object exposing a `dataset` attribute (presumably Snakemake
            wildcards; confirm against the calling rules).
        df (pandas.DataFrame): Config table with `dataset` and `flowcell`
            columns.
        config (dict): Workflow config; `config['proc']['demux_bc']` is the
            path pattern handed to `expand`.

    Returns:
        The result of `expand` on that pattern with `dataset` set to
        `wc.dataset` and `flowcell` set to the dataset's flowcells.
    """
    flowcells_for_dataset = get_df_dataset_col(wc, df, 'flowcell')
    bc_pattern = config['proc']['demux_bc']
    return expand(bc_pattern,
                  dataset=wc.dataset,
                  flowcell=flowcells_for_dataset)

# Flat per-row lists pulled from the config table.
files = df['fname'].tolist()
# The remaining columns are cast to str before being listed.
samples, datasets, platforms, flowcells = (
    df[name].astype(str).tolist()
    for name in ('sample', 'dataset', 'platform', 'flowcell')
)


In [25]:
# Example lookup: fastq path recorded for dataset 13G on flowcell 1.
get_df_dataset_flowcell_col(dataset='13G', flowcell='1', df=df, col='fname')

'/share/crsp/lab/seyedam/share/igvf_nanopore/igvfb01/igvfb01_13G-gc_lig-ss_1_t2.fastq.gz'