In [1]:
import pandas as pd

In [81]:
# Read the tab-separated config file into a DataFrame
# (read_table uses '\t' as its default delimiter).
df = pd.read_table('230429_config.tsv')

# Guard against the same file stem being listed more than once
# (can happen if different flow cells needed different amounts of chopping).
# The stem is everything in the basename before its last '_'.
df['file_stem'] = df['basename'].str.rsplit('_', n=1, expand=True)[0]

# Flag every row whose stem occurs more than once, so that all offending
# basenames (not just the later occurrences) are reported in the error.
stem_is_repeated = df['file_stem'].duplicated(keep=False)
if stem_is_repeated.any():
    dupe_stems = df.loc[stem_is_repeated, 'basename'].tolist()
    raise ValueError(f'Files {dupe_stems} seem to be duplicated. Check config file.')

# Build a per-row sample key from the first two '_'-delimited fields of the
# basename, re-joined with '_'. Only the first two fields are needed, so the
# split stops after two separators.
# NOTE(review): presumably these two fields identify the mouse -- confirm
# against the basename naming convention.
name_fields = df['basename'].str.split('_', n=2, expand=True)
df['sample_temp'] = name_fields[0] + '_' + name_fields[1]

# Number the technical replicates: each mouse has multiple rows (files) in
# the config, and those rows are technical replicates of one another.
#
# Rows are counted 1..n within each (genotype, sample_temp) pair, following
# the row order of the config file. Grouping on both keys -- rather than on
# sample_temp alone -- matches the key pair that the biological-replicate
# numbering uses to identify a mouse, so the counter always restarts at 1 for
# every genotype + mouse combination even if a sample name were to appear
# under more than one genotype. When every sample name belongs to a single
# genotype the numbers are the same as grouping on sample_temp alone.
#
# No pre-sort is needed: cumcount numbers rows in their order of appearance
# within each group, and the assignment aligns on the row index.
df['techrep_num'] = df.groupby(['genotype', 'sample_temp']).cumcount() + 1

# Number the biological replicates: each distinct sample is a different
# mouse, and therefore a different biological replicate of its genotype.

# One row per unique (genotype, sample) pair.
mouse_table = (
    df[['genotype', 'sample_temp']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Within each genotype, count the samples 1..n in sorted sample-name order.
# The result is aligned back onto mouse_table by index.
mouse_table['biorep_num'] = (
    mouse_table
    .sort_values(['genotype', 'sample_temp'])
    .groupby('genotype')
    .cumcount()
    + 1
)

# Attach the replicate number to every file row of the matching mouse.
df = df.merge(mouse_table, on=['genotype', 'sample_temp'], how='left')

# 'sample' labels one mouse: genotype + '_' + biological replicate number.
biorep_label = df['biorep_num'].astype(str)
df['sample'] = df['genotype'] + '_' + biorep_label

# 'dataset' labels one file: the sample label (genotype + mouse id)
# + '_' + technical replicate number.
techrep_label = df['techrep_num'].astype(str)
df['dataset'] = df['sample'] + '_' + techrep_label