In [1]:
import pandas as pd
import re
import os

In [7]:
## The DRIMSeq DTU script expects one sample divided into 2 groups,
## so we need to split the espresso counts file into a separate file per sample.

esp_df = pd.read_table('../data/espresso/LCL_espresso_counts.tsv')

# Collect the cell line IDs ('GM' followed by five digits) found in the column
# names. Sorting makes the processing order reproducible between runs;
# iterating a set of strings directly can yield a different order each time.
cell_line_pattern = re.compile(r'(GM\d{5})')
column_matches = (cell_line_pattern.search(col) for col in esp_df.columns)
cell_lines = sorted({m.group(1) for m in column_matches if m})

# Output directories: one for counts with replicates kept as separate columns,
# one for counts summed across replicates. Both are created up front.
separate_outdir = '../data/drimseq_dtu_input/replicates_separate'
merged_outdir = '../data/drimseq_dtu_input/merged_replicates'
for outdir in (separate_outdir, merged_outdir):
    os.makedirs(outdir, exist_ok=True)

# For every cell line, write two sets of DRIMSeq inputs:
#   * replicates kept separate  -> separate_outdir
#   * replicates summed together -> merged_outdir
# Each set consists of a counts table plus two group files listing the column
# names belonging to group 1 (maternal) and group 2 (paternal).

# The first three columns of the counts table are identifier columns and are
# carried into every output file.
id_columns = esp_df.columns[:3].tolist()

# Rows whose transcript ID starts with 'ESPRESSO' are dropped from every
# output. The mask does not depend on the cell line, so build it once.
keep_rows = ~esp_df['transcript_ID'].str.startswith('ESPRESSO')

for cell_line in cell_lines:

    # Count columns are those whose name contains the cell line ID; split
    # them by haplotype label. Only count columns are searched, so an
    # identifier column can never end up in a group.
    count_columns = [col for col in esp_df.columns if cell_line in col]
    maternal_cols = [col for col in count_columns if 'maternal' in col]
    paternal_cols = [col for col in count_columns if 'paternal' in col]

    # Counts table with replicates kept as separate columns.
    df = esp_df.loc[keep_rows, id_columns + count_columns]
    df_fname = os.path.join(separate_outdir, f'{cell_line}_espresso_counts.tsv')
    df.to_csv(df_fname, index=False, sep='\t')

    # Counts table with the replicates of each haplotype summed into a single
    # column: 'M' for maternal and 'P' for paternal.
    maternal_sum = df[maternal_cols].sum(axis=1).rename('M')
    paternal_sum = df[paternal_cols].sum(axis=1).rename('P')
    merged_df = pd.concat([df[id_columns], maternal_sum, paternal_sum], axis=1)
    merged_df_fname = os.path.join(merged_outdir, f'{cell_line}_espresso_counts.tsv')
    merged_df.to_csv(merged_df_fname, index=False, sep='\t')

    # Group files hold a comma-separated list of column names with no
    # trailing newline.
    group_files = [
        (os.path.join(separate_outdir, f'{cell_line}_group_1.txt'), ','.join(maternal_cols)),
        (os.path.join(separate_outdir, f'{cell_line}_group_2.txt'), ','.join(paternal_cols)),
        (os.path.join(merged_outdir, f'{cell_line}_group_1.txt'), 'M'),
        (os.path.join(merged_outdir, f'{cell_line}_group_2.txt'), 'P'),
    ]
    for group_fname, group_columns in group_files:
        with open(group_fname, 'w') as f:
            f.write(group_columns)