In [80]:
import sys
import argparse
import re
import pandas as pd
import pysradb



In [101]:
# def unescaped_str(arg_str):
#     """
#     Borrowed from https://stackoverflow.com/questions/34145686/handling-argparse-escaped-character-as-option
#     """
#     return codecs.decode(str(arg_str), 'unicode-escape')

def parse_args(args):
    parser = argparse.ArgumentParser(
        description='Create the input section for distiller\'s project.yml from GEO/ENA/SRA accessions.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'accessions', 
        metavar='N', 
        type=str, 
        nargs='+',
        help='GEO/SRA/ENA accession with a Hi-C project.')
    parser.add_argument(
        '--title_sub', 
        nargs=2, 
        action='append',
#         type=unescaped_str,
        default = [
            ('\s', '_'),
            ('[^\w_.-]', '') # the first character cannot be a hyphen!!
        ],
        help='A list of regular expression substitutions to clean up the experiment titles. '
        'Each substitution must be provided using a separate flag --title_sub followed by '
        'a pair of regular expressions pat repl, separated by a space, '
        'where pat is the matching pattern and repl is the replacement string. '
        'Internally, these expressions are then provided to pandas.Series.str.replace() or re.sub(). '
        'The default substitutions replace spaces with underscores and remove characters not matching '
        'A–Z a–z 0–9 ._- (a.k.a. the POSIX portable file name character set).'
    )
    parser.add_argument(
        '--group_sub', 
        nargs=2, 
        action='append',
#         type=unescaped_str,
        default = [
            ('[_-](R|rep)[\d+]$', '')
        ],
        help='A list of regular expression substitutions to convert experiment titles into groups. '
        'The usage is same as above. The default substitution matches patterns like _R1/_R2/_rep1/-R1/R1 '
        'at the ends of the title.'
    )
    
    parser.print_help()

    return parser.parse_args(args)

def to_downloadable(queries):
    out_queries = []
    for q in queries:
        if q.startswith('GSE'):
            out_queries += list(
                pysradb.SRAweb()
                .gse_to_srp(q)
                .study_accession
            )
        else: 
            out_queries.append(q)
    return out_queries

TAB_CHAR = '    '

In [86]:
#args = parse_args(sys.argv[1:])
#queries = ['SRP098789']

args = parse_args(['GSE110061'])
display(args)

usage: ipykernel_launcher.py [-h] [--title_sub TITLE_SUB TITLE_SUB]
                             [--group_sub GROUP_SUB GROUP_SUB]
                             N [N ...]

Create a table of fastq inputs (for distiller's project.yml) from GEO/ENA/SRA
accessions.

positional arguments:
  N                     GEO/SRA/ENA accession with a Hi-C project.

optional arguments:
  -h, --help            show this help message and exit
  --title_sub TITLE_SUB TITLE_SUB
                        A list of regular expression substitutions to clean up
                        the experiment titles. Each substitution must be
                        provided using a separate flag --title_sub followed by
                        a pair of regular expressions pat repl, separated by a
                        space, where pat is the matching pattern and repl is
                        the replacement string. Internally, these expressions
                        are then provided to pandas.Series.str.replace() 

Namespace(accessions=['GSE110061'], group_sub=[('[_-](R|rep)[\\d+]$', '')], title_sub=[('\\s', '_'), ('[^\\w_.-]', '')])

In [87]:
db = pysradb.SRAweb()

queries = to_downloadable(args.accessions)
 
srr_table = pd.concat([    
    db.sra_metadata(q)
    for q in queries
])

srr_table = srr_table[['experiment_title', 'run_accession']]

srr_table['experiment_title'] = (
    srr_table['experiment_title']
    .str.split(';')
    .str.get(0)
    .str.split(':')
    .str.get(1)
    .str.strip()
)


In [88]:
for re_sub in args.title_sub:
    srr_table['experiment_title'] = (
            srr_table.experiment_title
            .str.replace(re_sub[0], re_sub[1], regex=True)
        )

srr_table=srr_table.sort_values(['experiment_title','run_accession'])

('\\s', '_')
('[^\\w_.-]', '')


In [90]:
srr_table['lane'] = (
    'lane'
    + (srr_table.groupby('experiment_title').cumcount()+1)
        .astype('str')
)

In [91]:
group = srr_table.experiment_title
for sub in group_subs:
    group = group.str.replace(sub[0], sub[1])
srr_table['group'] = group

In [135]:
# Keeping this code in case YAML structures will become useful:

# out_raw_reads_paths = {}
# for title, grouped in srr_table.groupby('experiment_title'):
#     out_raw_reads_paths[title] = {
#         row.lane:f'- sra:{row.run_accession}'
#         for _,row in grouped.iterrows()
#     }

# out_library_groups = {}
# for group, grouped in srr_table.groupby('group'):
#     experiment_titles = list(grouped.experiment_title.unique())
#     if len(experiment_titles) > 1:
#         out_library_groups[group] = experiment_titles


out_raw_reads_paths = [f'{TAB_CHAR}raw_reads_paths:']
for title, grouped in srr_table.groupby('experiment_title'):
    out_raw_reads_paths.append(f'{TAB_CHAR}{TAB_CHAR}{title}:')
    for _, row in grouped.iterrows():
        out_raw_reads_paths.append(f'{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}{row.lane}:')
        out_raw_reads_paths.append(f'{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}- sra:{row.run_accession}')

out_library_groups = [f'{TAB_CHAR}library_groups:']
for group, grouped in srr_table.groupby('group'):
    experiment_titles = grouped.experiment_title.unique()
    if len(experiment_titles) > 1:
        out_library_groups.append(f'{TAB_CHAR}{TAB_CHAR}{group}:')
        out_library_groups += [f'{TAB_CHAR}{TAB_CHAR}{TAB_CHAR}- {title}' 
                               for title in experiment_titles]

out = '\n'.join(['input:']+out_raw_reads_paths+out_library_groups) 

In [136]:
print(out)

input:
    raw_reads_paths:
        BL-CAP-C_F123_CTCF-AID_KO-POLII_G5_rep1:
            lane1:
                - sra:SRR12053682
            lane2:
                - sra:SRR12053683
            lane3:
                - sra:SRR12053684
        BL-CAP-C_F123_CTCF-AID_KO-POLII_G5_rep2:
            lane1:
                - sra:SRR12053685
        BL-CAP-C_F123_CTCF-AID_KO_G5_rep1:
            lane1:
                - sra:SRR12053678
            lane2:
                - sra:SRR12053679
            lane3:
                - sra:SRR12053680
        BL-CAP-C_F123_CTCF-AID_KO_G5_rep2:
            lane1:
                - sra:SRR12053681
        BL-CAP-C_F123_CTCF-AID_POLII_G5_rep1:
            lane1:
                - sra:SRR12053686
            lane2:
                - sra:SRR12053687
            lane3:
                - sra:SRR12053688
        BL-CAP-C_F123_CTCF-AID_POLII_G5_rep2:
            lane1:
                - sra:SRR12053689
        BL-CAP-C_F123_CTCF-AID_WT_G5_rep1:
            lane1