In [1]:
import pandas as pd
import os
import re

In [2]:
## Change intermediate isoform counts per allele to ESPRESSO format
## so we can use it as input to the DRIMSeq DTU script.
## format: transcript_ID, transcript_name, gene_ID, sample_name1, sample_name2, ...

def interm_counts_to_espresso_format(file_path,
                                     gene_ID,
                                     transcript_ID,
                                     transcript_name,
                                     columns,
                                     values,
                                     aggfunc='mean'):
    """Reshape a long per-allele count table into an ESPRESSO-style wide table.

    The input table is read with ``pd.read_table`` (tab-separated by default).
    Three identifier columns are built by casting the requested input columns
    to ``str`` and joining them with ``'/'``; the table is then pivoted so that
    each distinct value of ``columns`` becomes its own count column.

    Parameters
    ----------
    file_path : str
        Path of the tab-separated input table.
    gene_ID : list of str
        Input columns joined with '/' to form the ``gene_ID`` column.
    transcript_ID : list of str
        Columns joined with '/' to form ``transcript_ID``. May include
        ``'gene_ID'``, which refers to the column built from ``gene_ID``.
    transcript_name : list of str
        Columns joined with '/' to form ``transcript_name``. May include
        ``'gene_ID'`` as well.
    columns : str
        Column whose distinct values become the output count columns.
    values : str
        Column holding the counts to spread into the wide table.
    aggfunc : str or callable, default 'mean'
        Aggregation applied when several input rows share the same identifiers
        and ``columns`` value. ``'mean'`` is the pandas ``pivot_table``
        default, so existing callers get exactly the previous behaviour;
        pass ``'sum'`` if duplicated rows carry partial counts.

    Returns
    -------
    pandas.DataFrame
        Columns ``transcript_ID``, ``transcript_name``, ``gene_ID`` followed
        by one column per distinct value of ``columns``. Combinations missing
        from the input are filled with 0.
    """
    df = pd.read_table(file_path)

    # Order matters: gene_ID is built first because transcript_ID and
    # transcript_name are allowed to reference the freshly built column.
    df['gene_ID'] = df[gene_ID].astype(str).agg('/'.join, axis=1)
    df['transcript_ID'] = df[transcript_ID].astype(str).agg('/'.join, axis=1)
    df['transcript_name'] = df[transcript_name].astype(str).agg('/'.join, axis=1)

    esp_df = df.pivot_table(index=['transcript_ID', 'transcript_name', 'gene_ID'],
                            columns=columns,
                            values=values,
                            aggfunc=aggfunc,
                            fill_value=0)

    # Drop the columns-axis name (the pivoted column's name) so it does not
    # leak into the header when the table is written out.
    esp_df.columns = esp_df.columns.values

    return esp_df.reset_index()

In [3]:
## Format counts to espresso format and keep replicates separate

indir = '../data/splicing_order_stringent_version/replicates_separate'
# os.listdir returns entries in arbitrary order; sort so the replicate
# numbering (M_1/P_1, M_2/P_2, ...) is the same on every run and machine.
files = sorted(os.path.relpath(os.path.join(indir, f)) for f in os.listdir(indir))

outdir = '../data/splicing_order_stringent_version/replicates_separate_esp_format'
os.makedirs(outdir, exist_ok=True)

# Associate each file to its cell line(s): 'GM' followed by five digits.
# Only the file name is searched, so a directory name can never be mistaken
# for a cell line. Files without a cell-line ID are ignored.
cell_line_files_dict = {}
for file in files:
    for cell_line in set(re.findall(r'(GM\d{5})', os.path.basename(file))):
        cell_line_files_dict.setdefault(cell_line, []).append(file)

# Get all cell lines
cell_lines = set(cell_line_files_dict)

# Format each file like espresso output (sorted: set order is not stable across runs)
for cell_line in sorted(cell_line_files_dict):
    dfs = []
    for file in cell_line_files_dict[cell_line]:
        # Yield three possible intermediate isoforms per splicing level
        df = interm_counts_to_espresso_format(
            file_path=file,
            gene_ID=['gene', 'analyzed_introns', 'level'],
            transcript_ID=['gene_ID', 'pattern2'],
            transcript_name=['gene_ID', 'pattern2'],
            columns='allele',
            values='count_pattern2'
        )
        # A replicate with no rows for one allele comes back without that
        # column; add it as zeros so every M_i / P_i named in the group files
        # below really exists in the count table.
        for allele in ('M', 'P'):
            if allele not in df.columns:
                df[allele] = 0
        dfs.append(df)

    # Concatenate each replicate of a cell line into one file
    combined_df = pd.DataFrame()
    for i, df in enumerate(dfs, start=1):
        df = df.rename(columns={'M': f'M_{i}', 'P': f'P_{i}'})

        if i == 1:
            combined_df = df
        else:
            # Outer merge keeps isoforms seen in only some of the replicates
            combined_df = pd.merge(combined_df, df, on=['transcript_ID', 'transcript_name', 'gene_ID'], how='outer')

    # Isoforms absent from a replicate have zero reads there
    combined_df = combined_df.fillna(0)

    # Shared stem of the three output files written for this cell line
    prefix = os.path.join(outdir, f'{cell_line}_interm_counts_per_allele.hac.all_introns.min10reads.filterND.stringent.3_isoforms')

    fname = f'{prefix}.esp.txt'
    combined_df.to_csv(fname, index=False, sep='\t')

    # Necessary group files for DRIMSeq script: one comma-separated line of
    # sample column names per group (maternal = group 1, paternal = group 2)
    num_reps = len(cell_line_files_dict[cell_line])
    group_1 = ','.join([f'M_{i}' for i in range(1, num_reps+1)]) + '\n'
    group_2 = ','.join([f'P_{i}' for i in range(1, num_reps+1)]) + '\n'

    fname = f'{prefix}.group_1.txt'
    with open(fname, 'w') as f:
        f.write(group_1)

    fname = f'{prefix}.group_2.txt'
    with open(fname, 'w') as f:
        f.write(group_2)

In [4]:
## Format counts to espresso format and merge replicates

indir = '../data/splicing_order_stringent_version/merged_replicates'
# Keep regular, non-hidden files only: os.listdir also returns sub-directories
# and hidden entries (e.g. .ipynb_checkpoints, .DS_Store) that are not count
# tables. Sorted so the processing order is reproducible.
files = sorted(
    os.path.relpath(os.path.join(indir, f))
    for f in os.listdir(indir)
    if not f.startswith('.') and os.path.isfile(os.path.join(indir, f))
)

outdir = '../data/splicing_order_stringent_version/merged_replicates_esp_format'
os.makedirs(outdir, exist_ok=True)

for file in files:
    # Yield three possible intermediate isoforms per splicing level
    df = interm_counts_to_espresso_format(
        file_path=file,
        gene_ID=['gene', 'analyzed_introns', 'level'],
        transcript_ID=['gene_ID', 'pattern2'],
        transcript_name=['gene_ID', 'pattern2'],
        columns='allele',
        values='count_pattern2'
    )

    # Output names reuse the input file name without its extension
    base_name = os.path.splitext(os.path.basename(file))[0]
    df_fname = os.path.join(outdir, f"{base_name}.3_isoforms.esp.txt")

    df.to_csv(df_fname, index=False, sep='\t')

    group1_fname = os.path.join(outdir, f"{base_name}.3_isoforms.group_1.txt")
    group2_fname = os.path.join(outdir, f"{base_name}.3_isoforms.group_2.txt")

    # One sample column per group here (replicates already merged). Lines are
    # newline-terminated, matching the group files of the replicates-separate cell.
    with open(group1_fname, 'w') as f:
        f.write('M\n')
    with open(group2_fname, 'w') as f:
        f.write('P\n')