In [1]:
import pandas as pd
import pdb
import numpy as np
import itertools
import re
from utils import *
from sm_utils import *

from snakemake.io import expand
import yaml

In [2]:
# Load the pipeline's YAML settings into `config`; later cells index into it
# (e.g. config['analysis']['cerberus'][...]) to get output file patterns.
config_file = 'config.yml'
with open(config_file) as f:
    # safe_load only builds plain Python objects (no arbitrary constructors)
    config = yaml.safe_load(f)

In [21]:
def parse_config_file(fname,
                      meta_fname,
                      p_meta_fname,
                      geno_fname,
                      auto_dedupe=True):

    """
    Parse the fastq config file together with its metadata tables.

    Thin wrapper around parse_config_common; every argument is forwarded
    unchanged.

    Parameters:
        fname (str): Path to config file fname. One line per input fastq.
        meta_fname (str): Path to file with metadata information.
        p_meta_fname (str): Path to pseudochromosome metadata information
        geno_fname (str): Path to genotype metadata information
        auto_dedupe (bool): Automatically deduplicate duplicate fastqs that result from
            successive Porechop rounds. If False, duplicates raise a ValueError.

    Returns:
        df (pandas DataFrame): DF w/ pipeline information; one line per fastq
        p_df (pandas DataFrame): df exploded to one line per pseudochromosome
            and merged w/ the pseudochromosome metadata
    """

    # forward the caller's auto_dedupe choice instead of forcing True
    df, p_df = parse_config_common(fname,
                                   meta_fname,
                                   p_meta_fname,
                                   geno_fname,
                                   auto_dedupe=auto_dedupe)

    return df, p_df

def parse_config_common(fname,
                      meta_fname,
                      p_meta_fname,
                      geno_fname,
                      auto_dedupe=True):
    """
    Build the per-fastq table and the per-pseudochromosome table that the
    config parsers share.

    Parameters:
        fname (str): Path to tab-separated config file with an `fname` column
            holding one fastq path per line. File names are expected to look
            like `<study>_<mouse_id>_..._<number>[_t<chop>].fastq[.gz]`.
        meta_fname (str): Path handed to `process_meta`; the result is merged
            on `mouse_id` (presumably it supplies genotype / sex / age /
            tissue -- verify against process_meta).
        p_meta_fname (str): Path to tab-separated pseudochromosome metadata;
            merged on `pseudochrom`.
        geno_fname (str): Path to tab-separated genotype metadata; merged on
            `genotype` (presumably supplies `pseudochrom_needed` and
            `pseudochrom` -- verify against the file).
        auto_dedupe (bool): If True, when the same file stem appears more than
            once keep only the entry with the highest chop number. If False,
            raise instead.

    Returns:
        df (pandas DataFrame): One line per fastq with parsed name fields,
            merged metadata, tech rep number (`flowcell`), `sample`,
            `biorep_num`, `dataset`, `source`, and `pseudochrom` as a sorted
            tuple of names (`('dummy',)` when missing).
        p_df (pandas DataFrame): `df` exploded to one line per pseudochrom and
            inner-merged with the pseudochromosome metadata.

    Raises:
        ValueError: If duplicated file stems are found and auto_dedupe is
            False, or if any genotype that needs a pseudochromosome maps to
            more than one pseudochromosome setting.
    """

    df = pd.read_csv(fname, sep='\t')

    ############ Basename + fname
    split_fname = df.fname.str.rsplit('/', n=1, expand=True)
    df['basename'] = split_fname[1]
    df['path'] = split_fname[0]

    ############ Dataset + flowcell df

    # get flowcell (note: overwritten further down by the tech rep number)
    exp = r'.*/[\w-]+_(\d+)(?:_t\d+)?\.fastq(?:\.gz)?'
    df['flowcell'] = df.fname.str.extract(exp, expand=False)

    # check to make sure the same file stem isn't there more than once
    # (can happen if different flow cells needed different amounts of chopping)
    exp = r'.*/([\w-]+_\d+)(?:_t\d+)?\.fastq(?:\.gz)?'
    df['file_stem'] = df.fname.str.extract(exp, expand=False)

    # chop number is the optional `_t<N>` right before `.fastq`; NaN if absent.
    # Using a regex (rather than splitting on '_t') keeps this working when no
    # file has been chopped and when '_t' shows up elsewhere in the name.
    df['chop_num'] = df.basename.str.extract(r'_t(\d+)\.fastq',
                                             expand=False).astype(float)

    if df.file_stem.duplicated().any():
        dupe_stems = df.loc[df.file_stem.duplicated(keep=False), 'basename'].tolist()
        if not auto_dedupe:
            raise ValueError(f'Files {dupe_stems} seem to be duplicated. Check config file.')
        else:
            print(f'Files {dupe_stems} seem to be duplicated. Automatically removing lower chop numbers')
            # NaN chop numbers sort last, so chopped files win over unchopped
            df = df.sort_values(by='chop_num', ascending=False)
            df = df.drop_duplicates(subset='file_stem', keep='first')

    # extract the sample name (first two '_'-separated fields of the basename)
    name_parts = df.basename.str.split('_', expand=True)
    df['sample_temp'] = name_parts[0]+'_'+name_parts[1]

    # extract the mouse id (second field)
    df['mouse_id'] = name_parts[1]

    # extract the "study" name
    exp = '^(ad[0-9]+)'
    df['study'] = df.basename.str.extract(exp, expand=False)

    # merge in metadata
    meta = process_meta(meta_fname)
    df['mouse_id'] = df['mouse_id'].astype('int')
    df = df.merge(meta, how='left', on='mouse_id')

    # merge in genotype metadata
    g_meta = pd.read_csv(geno_fname, sep='\t')
    df = df.merge(g_meta,
                  how='left',
                  on='genotype')

    # get tech rep numbers -- each mouse has multiple reps
    # and are therefore technical reps
    df['flowcell'] = df.sort_values(['genotype', 'mouse_id'],
                                    ascending=[True, True]) \
                       .groupby(['mouse_id']) \
                       .cumcount() + 1

    # sample should be the genotype + sex + age + tissue
    df['sample'] = df['genotype']+'_'+ \
                   df['sex']+'_'+ \
                   df['age']+'_'+ \
                   df['tissue']

    # get biorep numbers -- each mouse_id is a different mouse
    # and therefore a different biorep
    bioreps = df[['sample', 'mouse_id']].drop_duplicates().reset_index(drop=True)
    bioreps['biorep_num'] = bioreps.sort_values(['sample', 'mouse_id'],
                                                ascending=[True, True]) \
                                   .groupby(['sample']) \
                                   .cumcount() + 1
    df = df.merge(bioreps, how='left',
                  on=['sample', 'mouse_id'])

    # talon dataset should be sample + bio rep
    df['dataset'] = df['sample']+'_'+df['biorep_num'].astype(str)

    # source for cerberus should be study + sample
    df['source'] = df['study']+'_'+df['sample']

    def format_pseudochrom_cols(df, col):
        """
        Format a pseudochromosome-style column: replace NaNs with "dummy",
        split comma-separated entries, and store each as a sorted tuple
        """
        df.loc[df[col].isnull(), col] = 'dummy'
        df[col] = df[col].str.split(',')
        df[col] = df[col].apply(lambda names: tuple(sorted(names)))
        return df

    for c in ['pseudochrom']:
        df = format_pseudochrom_cols(df, c)

    # make sure the correspondance between
    # genotype:pseudochromosomes is 1:1
    needs_p = df.loc[df.pseudochrom_needed==True,
                     ['pseudochrom', 'genotype']].drop_duplicates()
    dupe_genotypes = needs_p.loc[needs_p.genotype.duplicated(),
                                 'genotype'].unique().tolist()
    # even a single genotype with conflicting settings is an error
    if len(dupe_genotypes) >= 1:
        raise ValueError(f'Found genotype(s) {dupe_genotypes} w/ multiple pseudochromosome settings')

    df['flowcell'] = df.flowcell.astype(str)
    df['biorep_num'] = df.biorep_num.astype(str)

    # get a table that matches genotype + pseudochrom + human gene + mouse gene
    exploded = df.explode('pseudochrom')
    p_meta = pd.read_csv(p_meta_fname, sep='\t').fillna('dummy')
    p_df = exploded.merge(p_meta, on='pseudochrom')

    return df, p_df

def parse_config_file_analysis(fname,
                      meta_fname,
                      p_meta_fname,
                      geno_fname,
                      an_meta_fname,
                      auto_dedupe=True):

    """
    Parse the fastq config and restrict it to the genotype / study pairs
    requested in the analysis metadata table, then annotate it for analysis.

    Parameters:
        fname (str): Path to config file fname. One line per input fastq.
        meta_fname (str): Path to file with metadata information.
        p_meta_fname (str): Path to pseudochromosome metadata information
        geno_fname (str): Path to genotype metadata information
        an_meta_fname (str): Path to tab-separated analysis metadata. Must
            have `genotype` and `study` columns; `analysis` and
            `genotype_alias` are expected to be present on p_df after the
            merges (from this table or the upstream metadata).
        auto_dedupe (bool): Automatically deduplicate duplicate fastqs that result from
            successive Porechop rounds

    Returns:
        df (pandas DataFrame): DF w/ pipeline information; one line per fastq,
            limited to requested genotype / study pairs, w/ `cerberus_run`
        p_df (pandas DataFrame): Per-pseudochromosome DF limited the same way,
            w/ analysis metadata, `cerberus_run`, `genotype_alias_int`,
            `genotype_sex`, and a file-name-safe `analysis`

    Warns:
        UserWarning: If the number of distinct genotype / study pairs differs
            between the analysis table and the filtered tables.
    """
    # local import: `warnings` is not among the notebook's explicit imports
    import warnings

    df, p_df = parse_config_common(fname,
                                   meta_fname,
                                   p_meta_fname,
                                   geno_fname,
                                   auto_dedupe=auto_dedupe)

    # limit to just the studies and genotypes requested
    an_df = pd.read_csv(an_meta_fname, sep='\t')
    pair_cols = ['genotype', 'study']
    n_requested = len(an_df[pair_cols].drop_duplicates().index)

    # match on the (genotype, study) pair via a throwaway key that is never
    # stored as a column, so nothing has to be dropped afterwards
    requested = (an_df['genotype']+' '+an_df['study']).tolist()
    p_keys = p_df['genotype']+' '+p_df['study']
    p_df = p_df.loc[p_keys.isin(requested)].copy()
    keys = df['genotype']+' '+df['study']
    df = df.loc[keys.isin(requested)].copy()

    n_p_df = len(p_df[pair_cols].drop_duplicates().index)
    n_df = len(df[pair_cols].drop_duplicates().index)

    if not (n_requested == n_p_df == n_df):
        genotypes = list(set(an_df.genotype.unique().tolist())-\
                         set(df.genotype.unique().tolist()))
        studies = list(set(an_df.study.unique().tolist())-\
                       set(df.study.unique().tolist()))
        warnings.warn(f'Genotypes {genotypes} and studies {studies} not found. Is this expected?')

    # assign a cerberus run to each "sample" (study+genotype+sex+age+tissue)
    # but first sort on study and sample such that they will always be ordered in the same way
    # this should freeze our results
    gb_cols = ['study', 'genotype', 'sex', 'age', 'tissue']
    df = df.sort_values(by=gb_cols, ascending=True)
    runs = df[gb_cols].groupby(gb_cols).count().reset_index()
    runs['cerberus_run'] = list(range(1, len(runs.index)+1))
    df = df.merge(runs, how='left', on=gb_cols)
    df['cerberus_run'] = df.cerberus_run.astype(str)

    # add in analysis stuff
    p_df = p_df.merge(an_df, how='left',
                      on=pair_cols)

    # add cerberus run info
    p_df = p_df.merge(df[gb_cols+['cerberus_run']].drop_duplicates(), how='left',
                      on=gb_cols)

    # sanitize genotype alias internally (int) w/ characters better for file names;
    # fall back to the genotype itself where no alias was given
    exp = '[^0-9a-zA-Z-_]+'
    no_alias = p_df.genotype_alias.isnull()
    p_df.loc[no_alias, 'genotype_alias'] = p_df.loc[no_alias, 'genotype']
    p_df['genotype_alias_int'] = p_df.genotype_alias.str.replace(exp, '_', regex=True)

    # add in columns for comparisons
    p_df['genotype_sex'] = p_df['genotype_alias_int']+'_'+p_df['sex']

    # sanitize analysis
    p_df['analysis'] = p_df.analysis.str.replace(exp, '_', regex=True)

    return df, p_df

In [22]:
# Input tables for this run; the analysis table used here is
# 'negar_analysis_config.tsv'.
config_tsv = 'config.tsv'
p_meta_tsv = 'pseudochromosome_metadata.tsv'
meta_tsv = 'mouse_metadata.tsv'
geno_tsv = 'genotype_metadata.tsv'
an_meta_tsv = 'negar_analysis_config.tsv'
auto_dedupe = True

# Parse the config, keeping only the genotype / study pairs listed in
# an_meta_tsv; returns the per-fastq table and the per-pseudochrom table.
df, p_df = parse_config_file_analysis(config_tsv,
                       meta_tsv,
                       p_meta_tsv,
                       geno_tsv,
                       an_meta_tsv,
                       auto_dedupe=auto_dedupe)

> [0;32m<ipython-input-21-e50924320f75>[0m(217)[0;36mparse_config_file_analysis[0;34m()[0m
[0;32m    215 [0;31m[0;34m[0m[0m
[0m[0;32m    216 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 217 [0;31m    [0;32mif[0m [0;32mnot[0m [0;34m([0m[0mi[0m[0;34m==[0m[0mi2[0m[0;34m==[0m[0mi3[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    218 [0;31m        genotypes = list(set(genotypes)-\
[0m[0;32m    219 [0;31m                         set(df.genotype.unique().tolist()))
[0m


ipdb>  genotypes


['5xFAD-HEMI_Clu-h2kbKI-HO', '5xFAD-WT', 'B6J', 'hABKI-Swe-WT', 'hTREM2KI-WT']


ipdb>  df.genotype.unique()


array(['5xFAD-WT', '5xFAD-HEMI_Clu-h2kbKI-HO', 'hTREM2KI-WT',
       'hABKI-Swe-WT'], dtype=object)


ipdb>  n


> [0;32m<ipython-input-21-e50924320f75>[0m(218)[0;36mparse_config_file_analysis[0;34m()[0m
[0;32m    216 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    217 [0;31m    [0;32mif[0m [0;32mnot[0m [0;34m([0m[0mi[0m[0;34m==[0m[0mi2[0m[0;34m==[0m[0mi3[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 218 [0;31m        genotypes = list(set(genotypes)-\
[0m[0;32m    219 [0;31m                         set(df.genotype.unique().tolist()))
[0m[0;32m    220 [0;31m        studies = list(set(studies)-\
[0m


ipdb>  c




In [23]:
p_df.loc[p_df.analysis.isnull()]

Unnamed: 0,platform,fname,basename,path,flowcell,file_stem,chop_num,sample_temp,mouse_id,study,...,source,human_gene,mouse_gene,locus_type,notes,analysis,genotype_alias,cerberus_run,genotype_alias_int,genotype_sex


In [24]:
p_df[['study', 'genotype']].drop_duplicates()


Unnamed: 0,study,genotype
0,ad002,5xFAD-WT
2,ad004,hTREM2KI-WT
14,ad006,hABKI-Swe-WT
26,ad003,5xFAD-HEMI_Clu-h2kbKI-HO


In [13]:
def get_de_cfg_entries(p_df, cfg_entry, how):
    """
    Collect the output file names for DE or DU tests, per analysis, for
        - every pair of genotypes (column `genotype_alias_int`)
        - every pair of genotypes within each sex (column `genotype_sex`)

    Parameters:
        p_df (pandas DataFrame): Table with `analysis`, `sex`,
            `genotype_alias_int`, and `genotype_sex` columns
        cfg_entry (str): File name pattern handed to snakemake's `expand`
        how (str): {'du', 'de'}; 'du' expands over the tss / tes / ic / iso
            features, anything else passes an empty feature list

    Returns:
        files (list): Expanded file names, analysis by analysis; within an
            analysis the all-sample comparisons come before the per-sex ones
    """

    feats = ['tss', 'tes', 'ic', 'iso'] if how == 'du' else []

    def pairwise_files(subset, obs_col, analysis):
        # every unordered pair of conditions observed in `obs_col`
        firsts, seconds = [], []
        for first, second in itertools.combinations(
                subset[obs_col].unique().tolist(), 2):
            firsts.append(first)
            seconds.append(second)

        # zip the two condition wildcards together first, leaving the rest
        # of the wildcards to be filled in by the second expand
        paired = expand(cfg_entry,
                        zip,
                        obs_cond1=firsts,
                        obs_cond2=seconds,
                        allow_missing=True)
        return expand(paired,
                      obs_col=obs_col,
                      feat=feats,
                      analysis=analysis)

    files = []
    for a in p_df.analysis.unique().tolist():
        wc = {'analysis': a}
        temp = subset_df_on_wcs(wc, p_df)

        # genotype comparisons across all samples in the analysis
        files.extend(pairwise_files(temp, 'genotype_alias_int', a))

        # now get genotype comparisons for each sex
        for s in temp.sex.unique():
            wc['sex'] = s
            temp2 = subset_df_on_wcs(wc, temp)
            files.extend(pairwise_files(temp2, 'genotype_sex', a))

    return files

# 24 feb

In [14]:
wc = {'analysis':'240202_grant', 'obs_cond1':'5xFAD_F',
      'obs_cond2':'B6J_F', 'obs_col':'genotype_sex'}

## 240131

In [4]:
# Input tables for this run; the analysis table is 'analysis_config.tsv'.
config_tsv = 'config.tsv'
p_meta_tsv = 'pseudochromosome_metadata.tsv'
meta_tsv = 'mouse_metadata.tsv'
geno_tsv = 'genotype_metadata.tsv'
an_meta_tsv = 'analysis_config.tsv'
auto_dedupe = True

# Parse the config, keeping only the genotype / study pairs listed in
# an_meta_tsv; returns the per-fastq table and the per-pseudochrom table.
df, p_df = parse_config_file_analysis(config_tsv,
                       meta_tsv,
                       p_meta_tsv,
                       geno_tsv,
                       an_meta_tsv,
                       auto_dedupe=auto_dedupe)

In [10]:
   
wc = {  'analysis':'240202_grant',
  'study':'ad003',

  'genotype':'5xFAD-WT',
  'sex':'F',
  'age':'4_months',
  'tissue':'HC',
  'cerberus_run':'1',
  'end_mode':'tss'}
p_df.loc[(p_df.cerberus_run=='1')&\
         (p_df.analysis=='240202_grant')]
p_df[['cerberus_run', 'genotype']].drop_duplicates()
# get_cfg_entries_analysis(wc,
#                         p_df,
#                         config['analysis']['cerberus']['agg']['ends'])
# expand(get_cfg_entries_analysis(wc,
#                         p_df,
#                         config['analysis']['cerberus']['agg']['ends'])[0],
#                         end_mode='tss')[0]

Unnamed: 0,cerberus_run,genotype
0,1,5xFAD-HEMI
6,2,5xFAD-WT


In [27]:
wc = {  'analysis':'240202_grant',
  'study':'ad003',

  'genotype':'5xFAD-HEMI',
  'sex':'F',
  'age':'4_months',
  'tissue':'HC',
  'cerberus_run':'1',
  'end_mode':'tss'}
get_prev_cerb_entry(wc, p_df,
                      config['analysis']['cerberus']['agg']['ics'],
                      config)

'ref/ca_tss.bed'

In [7]:
# wc = {'analysis': '240202_grant',
#        analysis=240202_grant

#   study=ad003
#   genotype=5xFAD-WT
#   sex=F
#   age=4_months
#   tissue=HC
#   cerberus_run=1}

lambda wc:get_final_cerb_entry(wc,
                     p_df,
                     config['analysis']['cerberus']['ca'])[0]

<function __main__.<lambda>(wc)>

In [26]:
# p_df.dtypes

In [21]:
p_df.cerberus_run.unique()

array(['1', '2'], dtype=object)

In [25]:
p_df.loc[p_df.cerberus_run=='1'].head(1)[['study', 'genotype', 'sex', 'age', 'tissue', 'cerberus_run']]
# p_df.loc[p_df.cerberus_run=='1', 'genotype'].unique().tolist()

Unnamed: 0,study,genotype,sex,age,tissue,cerberus_run
0,ad003,5xFAD-HEMI,F,4_months,HC,1


In [16]:
wc = {'analysis': '240202_grant',

  'study':'ad003',
  'genotype':'5xFAD-WT',
  'sex':'F',
  'age':'4_months',
  'tissue':'HC',
  'cerberus_run':'1'}
expand(get_cfg_entries_analysis(wc,
                        p_df,
                        config['analysis']['cerberus']['agg']['ends'])[0],
                        end_mode='tss')[0]

'analysis/240202_grant/cerberus/agg/ad003_5xFAD-WT_F_4_months_HC_2_tss.bed'

In [10]:
get_cfg_entries_analysis(wc,
                        p_df,
                        config['analysis']['cerberus']['agg']['ends'])[0]

'analysis/240202_grant/cerberus/agg/ad003_5xFAD-HEMI_F_4_months_HC_1_{end_mode}.bed'

In [9]:
lambda wc:expand(get_cfg_entries_analysis(wc,
                        p_df,
                        config['analysis']['cerberus']['agg']['ends'])[0],
                        end_mode='tss')[0]

<function __main__.<lambda>(wc)>

In [10]:
get_de_cfg_entries(p_df, config['analysis']['swan']['du'], how='du')

['analysis/hClu/swan/5xFAD-HEMI_vs_B6J_genotype_alias_int_du_tss.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_B6J_genotype_alias_int_du_tes.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_B6J_genotype_alias_int_du_ic.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_B6J_genotype_alias_int_du_iso.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_alias_int_du_tss.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_alias_int_du_tes.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_alias_int_du_ic.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_alias_int_du_iso.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_Clu-h2kbKI-HO_genotype_alias_int_du_tss.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_Clu-h2kbKI-HO_genotype_alias_int_du_tes.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_Clu-h2kbKI-HO_genotype_alias_int_du_ic.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_Clu-h2kbKI-HO_genotype_alias_int_du_iso.tsv',
 'analysis/hClu/swan/B6J_vs

## 240131

In [3]:
# Input tables for this run; the analysis table is 'analysis_config.tsv'.
config_tsv = 'config.tsv'
p_meta_tsv = 'pseudochromosome_metadata.tsv'
meta_tsv = 'mouse_metadata.tsv'
geno_tsv = 'genotype_metadata.tsv'
an_meta_tsv = 'analysis_config.tsv'
auto_dedupe = True

# Re-parse the config so the alias columns inspected in the following cells
# (genotype_alias, genotype_alias_int, genotype_sex) are available on p_df.
df, p_df = parse_config_file_analysis(config_tsv,
                       meta_tsv,
                       p_meta_tsv,
                       geno_tsv,
                       an_meta_tsv,
                       auto_dedupe=auto_dedupe)

In [4]:
p_df.genotype_alias.unique()

array(['5xFAD-HEMI', 'B6J', '5xFAD-HEMI;Clu-h2kbKI-HO', 'Clu-h2kbKI-HO',
       'hTREM2KI-HET', 'hTREM2-R47HKI-HET', 'hMAPT-HO;hABKI-SweIb-HET',
       'hMAPT-HO'], dtype=object)

In [5]:
p_df.genotype_alias_int.unique()

array(['5xFAD-HEMI', 'B6J', '5xFAD-HEMI_Clu-h2kbKI-HO', 'Clu-h2kbKI-HO',
       'hTREM2KI-HET', 'hTREM2-R47HKI-HET', 'hMAPT-HO_hABKI-SweIb-HET',
       'hMAPT-HO'], dtype=object)

In [7]:
p_df.genotype_sex.unique()

array(['5xFAD-HEMI_F', 'B6J_F', 'B6J_M', '5xFAD-HEMI_Clu-h2kbKI-HO_F',
       'Clu-h2kbKI-HO_F', 'hTREM2KI-HET_M', 'hTREM2-R47HKI-HET_M',
       'hTREM2-R47HKI-HET_F', 'hTREM2KI-HET_F',
       'hMAPT-HO_hABKI-SweIb-HET_F', 'hMAPT-HO_F',
       'hMAPT-HO_hABKI-SweIb-HET_M', 'hMAPT-HO_M'], dtype=object)

In [6]:
get_de_cfg_entries(p_df, config['analysis']['swan']['du'], how='du')

['analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_genotype_du_tss.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_genotype_du_tes.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_genotype_du_ic.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_genotype_du_iso.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_du_tss.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_du_tes.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_du_ic.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_du_iso.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_Clu-h2kbKI-HO_genotype_du_tss.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_Clu-h2kbKI-HO_genotype_du_tes.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_Clu-h2kbKI-HO_genotype_du_ic.tsv',
 'analysis/hClu/swan/5xFAD-HEMI_vs_5xFAD-WT_Clu-h2kbKI-HO_genotype_du_iso.tsv',
 'analysis/hClu/swan/5xFAD-WT_vs_5xFAD-HEMI_Clu-h2kbKI-HO_genotype_du_tss.tsv',
 'analysis/

## 240130

In [3]:
# Input tables for the un-filtered parse (no analysis table involved).
config_tsv = 'config.tsv'
p_meta_tsv = 'pseudochromosome_metadata.tsv'
meta_tsv = 'mouse_metadata.tsv'
geno_tsv = 'genotype_metadata.tsv'
auto_dedupe = True

# Parse every fastq in the config; returns the per-fastq table and the
# per-pseudochrom table.
df, p_df = parse_config_file(config_tsv,
                       meta_tsv,
                       p_meta_tsv,
                       geno_tsv,
                       auto_dedupe=auto_dedupe)

In [4]:
len(p_df.index)

114

In [5]:
# Same inputs as the un-filtered parse plus the analysis table, to compare
# table sizes before and after restricting to the requested analyses.
config_tsv = 'config.tsv'
p_meta_tsv = 'pseudochromosome_metadata.tsv'
meta_tsv = 'mouse_metadata.tsv'
geno_tsv = 'genotype_metadata.tsv'
an_meta_tsv = 'analysis_config.tsv'
auto_dedupe = True

# Parse the config, keeping only the genotype / study pairs listed in
# an_meta_tsv.
df, p_df = parse_config_file_analysis(config_tsv,
                       meta_tsv,
                       p_meta_tsv,
                       geno_tsv,
                       an_meta_tsv,
                       auto_dedupe=auto_dedupe)

> [0;32m/Users/fairliereese/Documents/programming/mortazavi_lab/bin/modelad_pipeline/proc_update_2/sm_utils.py[0m(232)[0;36mparse_config_file_analysis[0;34m()[0m
[0;32m    230 [0;31m    [0mi3[0m [0;34m=[0m [0mlen[0m[0;34m([0m[0mp_df[0m[0;34m[[0m[0;34m[[0m[0;34m'genotype'[0m[0;34m,[0m [0;34m'study'[0m[0;34m][0m[0;34m][0m[0;34m.[0m[0mdrop_duplicates[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0mindex[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    231 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 232 [0;31m    [0;32mif[0m [0;32mnot[0m [0;34m([0m[0mi[0m[0;34m==[0m[0mi2[0m[0;34m==[0m[0mi3[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    233 [0;31m        genotypes = list(set(genotypes)-\
[0m[0;32m    234 [0;31m                         set(df.genotype.unique().tolist()))
[0m


ipdb>  genotypes


['hMAPT-HO_hABKI-SweIb-HET', 'hMAPT-HO_hABKI-SweIb-WT', 'hABKI-Swe-WT', '5xFAD-HEMI_Clu-h2kbKI-HO', '5xFAD-WT', '5xFAD-HEMI', '5xFAD-WT_Clu-h2kbKI-HO', 'hTREM2KI-HET', 'hTREM2-R47HKI-HET', 'hTREM2KI-WT']


ipdb>  c


In [6]:
p_df[['genotype', 'study', 'analysis']].drop_duplicates().sort_values(by='analysis')

Unnamed: 0,genotype,study,analysis
24,hABKI-Swe-WT,ad006,hABKI_Swe_IB_HET_WT
66,hMAPT-HO_hABKI-SweIb-WT,ad008,hABKI_Swe_IB_HET_WT
67,hMAPT-HO_hABKI-SweIb-HET,ad008,hABKI_Swe_IB_HET_WT
0,5xFAD-HEMI,ad003,hClu
6,5xFAD-WT,ad003,hClu
36,5xFAD-HEMI_Clu-h2kbKI-HO,ad003,hClu
38,5xFAD-WT_Clu-h2kbKI-HO,ad003,hClu
12,hTREM2KI-WT,ad004,hTREM2
48,hTREM2KI-HET,ad004,hTREM2
50,hTREM2-R47HKI-HET,ad004,hTREM2


In [9]:
len(p_df[['genotype', 'study', 'analysis']].drop_duplicates().index)

7

In [10]:
len(df.index)

53

## 240129

In [3]:
# `configfile: 'config.yml'` is Snakemake DSL and a SyntaxError in a plain
# Python cell; the YAML settings are already loaded into `config` by the cell
# near the top of the notebook.
# configfile: 'config.yml'
config_tsv = 'config.tsv'
p_meta_tsv = 'pseudochromosome_metadata.tsv'
meta_tsv = 'mouse_metadata.tsv'
geno_tsv = 'genotype_metadata.tsv'
an_meta_tsv = 'test_analysis_config.tsv'
auto_dedupe = True

# parse_config_file_analysis takes the genotype metadata path as its fourth
# positional argument, ahead of the analysis table.
df, p_df = parse_config_file_analysis(config_tsv,
                       meta_tsv,
                       p_meta_tsv,
                       geno_tsv,
                       an_meta_tsv,
                       auto_dedupe=auto_dedupe)

In [4]:
len(p_df.index)

13

In [5]:
p_df.columns

Index(['batch', 'platform', 'fname', 'basename', 'path', 'flowcell',
       'file_stem', 'chop_num', 'sample_temp', 'mouse_id', 'study', 'genotype',
       'sex', 'age', 'tissue', 'pseudochrom_needed', 'pseudochrom', 'sample',
       'biorep_num', 'dataset', 'source', 'genotype_sex', 'human_gene',
       'mouse_gene', 'locus_type', 'notes', 'analysis', 'cerberus_run'],
      dtype='object')

In [6]:
df.columns

Index(['batch', 'platform', 'fname', 'basename', 'path', 'flowcell',
       'file_stem', 'chop_num', 'sample_temp', 'mouse_id', 'study', 'genotype',
       'sex', 'age', 'tissue', 'pseudochrom_needed', 'pseudochrom', 'sample',
       'biorep_num', 'dataset', 'source', 'genotype_sex', 'cerberus_run'],
      dtype='object')

In [None]:
# TODO: fill in the wildcards (e.g. 'analysis', 'study', 'genotype', ...) to
# look up; the original cell was left as an unterminated string literal and
# could not be parsed.
wc = {}

In [12]:
get_cfg_entries(wc, df, config['analysis']['cerberus']['gtf'])

NameError: name 'wc' is not defined

In [5]:
# get_de_cfg_entries(p_df, config['analysis']['swan']['du'], how='du')
# get_de_cfg_entries(p_df, config['analysis']['swan']['deg'], how='de'),
# get_de_cfg_entries(p_df, config['analysis']['swan']['det'], how='de')

['analysis/hClu/swan/5xFADHEMI_vs_C57B6J_genotype_du_tss.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_C57B6J_genotype_du_tes.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_C57B6J_genotype_du_ic.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_C57B6J_genotype_du_iso.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_5xCLU-h2kbKI_HO_genotype_du_tss.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_5xCLU-h2kbKI_HO_genotype_du_tes.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_5xCLU-h2kbKI_HO_genotype_du_ic.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_5xCLU-h2kbKI_HO_genotype_du_iso.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_CLU-h2kbKI_HO_genotype_du_tss.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_CLU-h2kbKI_HO_genotype_du_tes.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_CLU-h2kbKI_HO_genotype_du_ic.tsv',
 'analysis/hClu/swan/5xFADHEMI_vs_CLU-h2kbKI_HO_genotype_du_iso.tsv',
 'analysis/hClu/swan/C57B6J_vs_5xCLU-h2kbKI_HO_genotype_du_tss.tsv',
 'analysis/hClu/swan/C57B6J_vs_5xCLU-h2kbKI_HO_genotype_du_tes.tsv',
 'analysis/hClu/swan/C57B6J_vs_5xCLU-h2kbKI_H

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19'], dtype=object)