In [1]:
import pandas as pd
from snakemake.io import expand
import yaml
import numpy as np
from utils import *
from sm_utils import *
from bc_utils import *

In [222]:
config_file = '../configs/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [257]:
# variables to change (again these could go in
# a future analysis spec)
# config_tsv = 'configs/test_2.tsv'
# config_tsv = 'configs/test_3.tsv'
# config_tsv = '../configs/test_4.tsv'
config_tsv = '../configs/test_fastqs_for_exp_spec.tsv'
subpool_tsv = '../configs/subpool_metadata.tsv'
sample_csv = '../configs/sample_metadata.csv'
kit = 'WT_mega'
chemistry = 'v2'
first_min_counts = 500

# read in config / analysis spec
df = parse_config(config_tsv)
bc_df = get_bc1_matches(kit, chemistry)
sample_df = parse_sample_df(sample_csv)
subpool_df = pd.read_csv(subpool_tsv, sep='\t')


In [258]:
import numpy as np
import yaml
from collections import defaultdict
# employee_dict={'employee': {'name': 'John Doe',  'age': 35,
#   'job': {'title': 'Software Engineer','department': 'IT','years_of_experience': 10},
#   'address': {'street': '123 Main St.', 'city': 'San Francisco','state': 'CA', 'zip': 94102}}}
# print("The python dictionary is:")
# print(employee_dict)
# yaml_string=yaml.dump(employee_dict)
# print("The YAML string is:")
# print(yaml_string)

In [259]:
def make_exp_spec_from_sample_df(sample_df, experiment, ofile):
    """
    Parameters:
        sample_df (pandas DataFrame): Output from parse_sample_df; derived from
            the Google Sheets thingie
        experiment (str): Name of experiment. Will be used to subset the sample_df
        ofile (str): Path to output .yml file
    """
    exp_spec = dict()
    exp_spec['samples'] = dict()
    
    # subset on correct experiment
    sample_df = sample_df.loc[sample_df.plate==experiment].copy(deep=True)
    
    # make sure that we only have one kit / chemistry for this experiment
    assert len(sample_df['Protocol'].unique().tolist()) == 1
    assert len(sample_df['Chemistry'].unique().tolist()) == 1
    
    # kit + chemistry
    exp_spec['protocol'] = sample_df['Protocol'].tolist()[0]
    exp_spec['chemistry'] = sample_df['Chemistry'].tolist()[0]
    
    # list of samples for each row / col well
    def make_alias_list(x, alias_cols):
        """
        Turn the alias sample columns into a list that's one column
        """
        aliases = [a for a in x[alias_cols] if not str(a)=='nan']
        return aliases    

    alias_cols = ['alias_tissue1', 'alias_tissue2', 'alias_tissue3', 'alias_tissue4']
    sample_df['well_row'] = sample_df.bc1_well.str.slice(0,1)
    sample_df['well_col'] = sample_df.bc1_well.str.slice(1).astype(int)
    sample_df = sample_df[alias_cols+['well_row', 'well_col']]
    sample_df['samples'] = sample_df.apply(lambda x: make_alias_list(x, alias_cols), axis=1)
    sample_df.drop(alias_cols, axis=1, inplace=True)

    for row in sample_df.well_row.unique().tolist():
        temp = sample_df.loc[sample_df.well_row==row].copy(deep=True)
        for ind, entry in temp.iterrows():
            if row not in exp_spec['samples'].keys():
                exp_spec['samples'][row] = dict()
            exp_spec['samples'][row][entry.well_col] = entry['samples']

    yml_str = yaml.dump(exp_spec)
    with open(ofile, 'w') as o:
        o.write(yml_str)

In [260]:
# this works for the samples and exp overview
sample_df = parse_sample_df(sample_csv)
# make_exp_spec_from_sample_df(sample_df, 'igvf_010', 'igvf_010_exp_spec.yml')

In [261]:
sample_df.head()

Unnamed: 0,Mouse_Tissue_ID,alias_tissue1,alias_tissue2,alias_tissue3,alias_tissue4,plate,Protocol,Chemistry,bc1_well,well_type,...,Age_weeks,Age_days,Body_weight_g,Estrus_cycle,Dissection_date,Dissection_time,Tissue_weight_mg,mult_genotype_1,mult_genotype_2,mult_genotype
0,016_B6J_10F_03,ali-mortazavi:016_B6J_10F_03_NTR_0000646,ali-mortazavi:016_B6J_10F_03_NTR_0000750,,,igvf_003,Parse_WT_Mega,v2,A1,Single,...,10,72.0,21.1,Diestrus,10/27/22,9:01 AM,141.0,,,
1,017_B6J_10M_03,ali-mortazavi:017_B6J_10M_03_NTR_0000646,ali-mortazavi:017_B6J_10M_03_NTR_0000750,,,igvf_003,Parse_WT_Mega,v2,A2,Single,...,10,72.0,26.3,,10/27/22,11:44 AM,141.0,,,
2,018_B6J_10F_03,ali-mortazavi:018_B6J_10F_03_NTR_0000646,ali-mortazavi:018_B6J_10F_03_NTR_0000750,,,igvf_003,Parse_WT_Mega,v2,A3,Single,...,10,72.0,21.0,Proestrus,10/27/22,9:38 AM,143.0,,,
3,019_B6J_10M_03,ali-mortazavi:019_B6J_10M_03_NTR_0000646,ali-mortazavi:019_B6J_10M_03_NTR_0000750,,,igvf_003,Parse_WT_Mega,v2,A4,Single,...,10,72.0,29.0,,10/27/22,12:12 PM,141.0,,,
4,020_B6J_10F_03,ali-mortazavi:020_B6J_10F_03_NTR_0000646,ali-mortazavi:020_B6J_10F_03_NTR_0000750,,,igvf_003,Parse_WT_Mega,v2,A5,Single,...,10,72.0,18.5,Diestrus,10/27/22,10:14 AM,149.0,,,


## the subpools >_<

In [262]:
df = df.merge(subpool_df, how='left', on=['subpool', 'plate'])

In [276]:
# add some extra columns
plat_dict = {'prom': 'ONT PromethION',
             'grid': 'ONT GridION',
             'nova': 'Illumina NovaSeq',
             'next': 'Illumina NextSeq'}
plat_to_seq_type_dict = {'prom': 'long',
                         'grid': 'long',
                         'nova': 'short',
                         'next': 'short'}
df['long_or_short'] = df.platform.map(plat_to_seq_type_dict)
df['platform_hr'] = df.platform.map(plat_dict)

# get thing
gb_cols = ['subpool', 'long_or_short', 'platform', 'run']
gb_fastq_cols = gb_cols+['fastq']
temp = df[gb_fastq_cols].groupby(gb_cols)['fastq'].apply(list)
temp.to_dict()

{('Sublibrary_2',
  'long',
  'grid',
  1): ['/dfs7/samlab/seyedam/IGVF/igvf_010/grid/igvf_010_lig-blk.fastq.gz'],
 ('Sublibrary_2',
  'long',
  'prom',
  1): ['/dfs7/samlab/seyedam/IGVF/igvf_010/prom/igvf_010_lig-blk.fastq.gz'],
 ('Sublibrary_2',
  'short',
  'next',
  1): ['/dfs7/samlab/seyedam/IGVF/igvf_010/next1/Sublibrary_3_S2_L004_R1_001.fastq.gz'],
 ('Sublibrary_2',
  'short',
  'next',
  2): ['/dfs7/samlab/seyedam/IGVF/igvf_010/next2/Sublibrary_3_S2_L004_R1_001.fastq.gz'],
 ('Sublibrary_2',
  'short',
  'nova',
  1): ['/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Sublibrary_2_S1_L001_R1_001.fastq.gz', '/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Sublibrary_2_S1_L002_R1_001.fastq.gz', '/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Sublibrary_2_S1_L003_R1_001.fastq.gz', '/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Sublibrary_2_S1_L004_R1_001.fastq.gz'],
 ('Sublibrary_2',
  'short',
  'nova',
  2): ['/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/Sublibrary_2_S1_L001_R1_001.fastq.gz', '/dfs7/samlab/se

In [263]:
exp_spec = {}
exp_spec['subpools'] = {}
subpool_cols = ['subpool', 'Count', 'i7_subpool_barcode', 'i5_subpool_barcode', 'Selection']
temp = df[subpool_cols].drop_duplicates().copy(deep=True)
temp.head()
for ind, entry in temp.iterrows():
    subpool = entry.subpool
    if c not in exp_spec['subpools'].keys():
            exp_spec['subpools'][subpool] = {}
    for c in subpool_cols:
        exp_spec['subpools'][subpool][c] = entry[c]

In [264]:
exp_spec

{'subpools': {'Sublibrary_2': {'subpool': 'Sublibrary_2',
   'Count': 67000,
   'i7_subpool_barcode': 'ACTTGA',
   'i5_subpool_barcode': nan,
   'Selection': 'EX'},
  'Sublibrary_3': {'subpool': 'Sublibrary_3',
   'Count': 67000,
   'i7_subpool_barcode': 'GATCAG',
   'i5_subpool_barcode': nan,
   'Selection': 'NO'}}}

In [256]:
subpool_cols = ['subpool', 'Count', 'i7_subpool_barcode', 'i5_subpool_barcode', 'Selection']
df[subpool_cols+['fastq']].groupby(subpool_cols, dropna=False).count().to_dict(orient='index')

{('Sublibrary_2', 67000, 'ACTTGA', nan, 'EX'): {'fastq': 8},
 ('Sublibrary_3', 67000, 'GATCAG', nan, 'NO'): {'fastq': 8}}

In [165]:
df = df.merge(subpool_df, how='left', on=['subpool', 'plate'])

In [206]:
df = pd.DataFrame([
           ['A', 'a', 123, 1], 
           ['B', 'b', 345, 5], 
           ['C', 'c', 712, 4],
           ['B', 'b', 768, 2], 
           ['B', 'b', 768, 3], 
           ['A', 'a', 123, 9], 
           ['C', 'c', 178, 6], 
           ['C', 'c', 178, 5],  
           ['A', 'a', 321, 3]])
df.columns = ['subpool', 'Count', 'sequencing', 'fastq']
df['i7_subpool_barcode'] = df['subpool']
df['i5_subpool_barcode'] = df['subpool']
df['Selection'] = 'EX'
df.head()

Unnamed: 0,subpool,Count,sequencing,fastq,i7_subpool_barcode,i5_subpool_barcode,Selection
0,A,a,123,1,A,A,EX
1,B,b,345,5,B,B,EX
2,C,c,712,4,C,C,EX
3,B,b,768,2,B,B,EX
4,B,b,768,3,B,B,EX


In [217]:
# subpool_cols = ['subpool', 'Count', 'i7_subpool_barcode', 'i5_subpool_barcode', 'Selection']


# # d = {'subpools': {k[0]: \
# #         dict(dict([(subpool_cols[i],k[i]) for i in range(1, len(subpool_cols))]).update(\
# #          **('Sequencing':f.groupby('sequencing')['fastq'].apply(list).to_dict())) \
# #      for k, f in df.groupby(subpool_cols)}}

# # # d = {'subpools': {k[0]: \
# # #         dict([(subpool_cols[i],k[i]) for i in range(1, len(subpool_cols))]) \
# # #      for k, f in df.groupby(subpool_cols)}}

# # # d = {'subpools': {k[0]: \
# # #         {subpool_cols[1]: k[1], 'Sequencing': f.groupby('sequencing')['fastq'].apply(list).to_dict()}
# # #      for k, f in df.groupby(subpool_cols)}}

In [218]:
# d

In [195]:
# df.head()
# subpool_cols = ['subpool', 'Count']
# df.columns = ['subpool', 'Count', 'sequencing', 'fastq']
# d = {'subpools': {k[0]: {'Count': k[1]}}
#      for k, f in df.groupby(subpool_cols)}

In [136]:
df.head()
df['level'] = 'subpools'

In [141]:
temp = df[['level', 'subpool', 'Count', 'Selection', 'i7_subpool_barcode', 'i5_subpool_barcode']].drop_duplicates()
data = temp.groupby('level').apply(
            lambda x: x.set_index('subpool').to_dict(orient='index')).to_dict()
data

{'subpools': {'Sublibrary_2': {'level': 'subpools',
   'Count': 67000,
   'Selection': 'EX',
   'i7_subpool_barcode': 'ACTTGA',
   'i5_subpool_barcode': nan},
  'Sublibrary_3': {'level': 'subpools',
   'Count': 67000,
   'Selection': 'NO',
   'i7_subpool_barcode': 'GATCAG',
   'i5_subpool_barcode': nan}}}

In [142]:
# temp = df.copy(deep=True)
# temp.groupby('level').groupby(['subpool', 'Count', 'Selection', 'i7_subpool_barcode', 'i5_subpool_barcode'])

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'

In [155]:
df.head()
plat_to_long_or_short = {'nova': 'short', 'next': 'short',
                         'grid': 'long', 'prom': 'long'}
df['long_or_short'] = df.platform.map(plat_to_long_or_short)
df.head()

Unnamed: 0,fastq,subpool,sample,plate,lane,run,platform,path,path2,r2_fastq,Count,Selection,Sublibrary Index,i7_subpool_barcode,i5_subpool_barcode,level,long_or_short
0,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,S1,igvf_010,L001,1,nova,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,,subpools,short
1,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,S1,igvf_010,L002,1,nova,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,,subpools,short
2,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,S1,igvf_010,L003,1,nova,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,,subpools,short
3,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,Sublibrary_2,S1,igvf_010,L004,1,nova,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova1/Subli...,67000,EX,2,ACTTGA,,subpools,short
4,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/Subli...,Sublibrary_2,S1,igvf_010,L001,2,nova,/dfs7/samlab/seyedam/IGVF/igvf_010/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/,/dfs7/samlab/seyedam/IGVF/igvf_010/nova2/Subli...,67000,EX,2,ACTTGA,,subpools,short


In [158]:
# go from inside out
temp = df[['fastq', 'r2_fastq', 'run', 'platform']]
d = {k: f.groupby(['run', 'platform'])[['fastq', 'r2_fastq']].apply(list).to_dict()
     for k, f in df.groupby('run')}
d

{1: {(1, 'nova'): ['fastq', 'r2_fastq']},
 2: {(2, 'nova'): ['fastq', 'r2_fastq']}}

In [156]:
subpool_cols = ['subpool', 'Count', 'Selection', 'i7_subpool_barcode', 'i5_subpool_barcode']
sequencing_cols = ['long_or_short']
d = {k: f.groupby(sequencing_cols)[].apply(list).to_dict()
     for k, f in df.groupby('subpool_cols')}


KeyError: 'subpool_cols'

In [119]:
exp_spec = dict()

# add first level of subpool information
exp_spec['subpools'] = dict()
for s in df.subpool.unique().tolist():
    temp = df.loc[df.subpool==s].copy(deep=True)
    
    # count, i7, i5 barcodes, selection method
    if s not in exp_spec['subpools'].keys():
        exp_spec['subpools'][s] = dict()
    cols = ['Count', 'Selection', 'i7_subpool_barcode',
        'i5_subpool_barcode']
    for c in cols:
        assert len(temp[c].unique().tolist())==1
        exp_spec['subpools'][s][c] = temp[c].values[0]

In [120]:
# add in the fastqs
short_platforms = ['nova', 'next']
long_platforms = ['grid', 'prom']
exp_spec
for ind, entry in df.iterrows():
    if entry.platform.isin(short_platforms):
        if 

{'subpools': {'Sublibrary_2': {'Count': 67000,
   'Selection': 'EX',
   'i7_subpool_barcode': 'ACTTGA',
   'i5_subpool_barcode': nan,
   'long_read': 'YES'},
  'Sublibrary_3': {'Count': 67000,
   'Selection': 'NO',
   'i7_subpool_barcode': 'GATCAG',
   'i5_subpool_barcode': nan,
   'long_read': 'NO'}}}

In [110]:
subpool_df.head()

Unnamed: 0,plate,subpool,Count,Selection,Sublibrary Index,i7_subpool_barcode,i5_subpool_barcode
0,igvf_010,Sublibrary_2,67000,NO,2,ACTTGA,
1,igvf_010,Sublibrary_3,67000,NO,3,GATCAG,
2,igvf_010,Sublibrary_4,67000,NO,4,TAGCTT,
3,igvf_010,Sublibrary_5,67000,NO,5,ATGTCA,
4,igvf_010,Sublibrary_6,67000,NO,6,CTTGTA,


In [90]:
# exp_spec

defaultdict(dict,
            {'samples': defaultdict(None,
                         {'A': {'1': ['ali-mortazavi:016_B6J_10F_09_UBERON_0002113'],
                           '2': ['ali-mortazavi:017_B6J_10M_09_UBERON_0002113'],
                           '3': ['ali-mortazavi:018_B6J_10F_09_UBERON_0002113'],
                           '4': ['ali-mortazavi:019_B6J_10M_09_UBERON_0002113'],
                           '5': ['ali-mortazavi:020_B6J_10F_09_UBERON_0002113'],
                           '6': ['ali-mortazavi:021_B6J_10M_09_UBERON_0002113'],
                           '7': ['ali-mortazavi:024_B6J_10F_09_UBERON_0002113'],
                           '8': ['ali-mortazavi:025_B6J_10M_09_UBERON_0002113'],
                           '10': ['ali-mortazavi:017_B6J_10M_16_UBERON_0001388',
                            'ali-mortazavi:067_NODJ_10M_16_UBERON_0001388'],
                           '11': ['ali-mortazavi:018_B6J_10F_16_UBERON_0001388',
                            'ali-mortazavi:074_N

Unnamed: 0,alias_tissue1,alias_tissue2,alias_tissue3,alias_tissue4,well_row,well_col,samples
0,ali-mortazavi:016_B6J_10F_03_NTR_0000646,ali-mortazavi:016_B6J_10F_03_NTR_0000750,,,A,1,"[ali-mortazavi:016_B6J_10F_03_NTR_0000646, ali..."
1,ali-mortazavi:017_B6J_10M_03_NTR_0000646,ali-mortazavi:017_B6J_10M_03_NTR_0000750,,,A,2,"[ali-mortazavi:017_B6J_10M_03_NTR_0000646, ali..."
2,ali-mortazavi:018_B6J_10F_03_NTR_0000646,ali-mortazavi:018_B6J_10F_03_NTR_0000750,,,A,3,"[ali-mortazavi:018_B6J_10F_03_NTR_0000646, ali..."
3,ali-mortazavi:019_B6J_10M_03_NTR_0000646,ali-mortazavi:019_B6J_10M_03_NTR_0000750,,,A,4,"[ali-mortazavi:019_B6J_10M_03_NTR_0000646, ali..."
4,ali-mortazavi:020_B6J_10F_03_NTR_0000646,ali-mortazavi:020_B6J_10F_03_NTR_0000750,,,A,5,"[ali-mortazavi:020_B6J_10F_03_NTR_0000646, ali..."


Unnamed: 0,well_row,well_col,samples
0,A,1,"[ali-mortazavi:016_B6J_10F_03_NTR_0000646, ali..."
1,A,2,"[ali-mortazavi:017_B6J_10M_03_NTR_0000646, ali..."
2,A,3,"[ali-mortazavi:018_B6J_10F_03_NTR_0000646, ali..."
3,A,4,"[ali-mortazavi:019_B6J_10M_03_NTR_0000646, ali..."
4,A,5,"[ali-mortazavi:020_B6J_10F_03_NTR_0000646, ali..."


In [43]:
sample_df['alias_tissue3'].unique()

array([nan, 'ali-mortazavi:017_B6J_10M_01_UBERON_0000007',
       'ali-mortazavi:018_B6J_10F_01_UBERON_0000007',
       'ali-mortazavi:019_B6J_10M_01_UBERON_0000007',
       'ali-mortazavi:016_B6J_10F_01_UBERON_0000007',
       'ali-mortazavi:021_B6J_10M_01_UBERON_0000007',
       'ali-mortazavi:024_B6J_10F_01_UBERON_0000007',
       'ali-mortazavi:025_B6J_10M_01_UBERON_0000007',
       'ali-mortazavi:020_B6J_10F_01_UBERON_0000007',
       'ali-mortazavi:029_AJ_10M_01_UBERON_0000007',
       'ali-mortazavi:028_AJ_10F_01_UBERON_0000007',
       'ali-mortazavi:031_AJ_10M_01_UBERON_0000007',
       'ali-mortazavi:026_AJ_10F_01_UBERON_0000007',
       'ali-mortazavi:033_AJ_10M_01_UBERON_0000007',
       'ali-mortazavi:032_AJ_10F_01_UBERON_0000007',
       'ali-mortazavi:035_AJ_10M_01_UBERON_0000007',
       'ali-mortazavi:030_AJ_10F_01_UBERON_0000007',
       'ali-mortazavi:037_129S1J_10M_01_UBERON_0000007',
       'ali-mortazavi:038_129S1J_10F_01_UBERON_0000007',
       'ali-mortazavi:041

In [36]:
sample_df.dtypes

alias_tissue1    object
alias_tissue2    object
alias_tissue3    object
alias_tissue4    object
well_row         object
well_col         object
samples          object
dtype: object