In [1]:
import pandas as pd
import os
import re
from collections import defaultdict

In [30]:
# Format espresso counts
# Build one wide table with a `<cell_line>_M` and `<cell_line>_P` column per
# input file, keyed by (feature_id, gene_id), and write it as a TSV.
indir = '../data/splicing_order_stringent_version/merged_replicates_esp_format'
# os.listdir returns entries in arbitrary order; sorting keeps the column
# order of the merged table stable across runs and machines.
files = sorted(os.path.relpath(os.path.join(indir, f)) for f in os.listdir(indir))
files = [f for f in files if 'esp.txt' in f]

outdir = '../data/drimseq_tuqtl_input'
os.makedirs(outdir, exist_ok=True)

dfs = []
for file in files:
    # The cell line identifier (e.g. GM12345) is taken from the file path.
    cell_line_match = re.search(r'(GM\d{5})', file)
    if cell_line_match is None:
        raise ValueError(f'No cell line identifier (GM + 5 digits) found in: {file}')
    cell_line = cell_line_match.group(1)

    df = pd.read_table(file)
    # Plain `{cell_line}_M` instead of nesting the same quote character inside
    # the f-string, which is a SyntaxError on Python versions before 3.12.
    df = df[['transcript_ID', 'gene_ID', 'M', 'P']].rename(columns={
        'transcript_ID': 'feature_id',
        'gene_ID': 'gene_id',
        'M': f'{cell_line}_M',
        'P': f'{cell_line}_P',
    })
    dfs.append(df)

if not dfs:
    raise FileNotFoundError(f"No '*esp.txt*' files found in: {indir}")

# Outer merge keeps features seen in only some cell lines; their missing
# counts in the other cell lines are filled with 0 below.
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on=['feature_id', 'gene_id'], how='outer')
merged_df = merged_df.fillna(0)
merged_df.to_csv(os.path.join(outdir, 'LCL_interm_counts_tuqtl.tsv'),
                 index=None, sep='\t')

In [5]:
# Format genotypes
# Drop the allele columns, switch to lower-case coordinate names and add an
# `end` column equal to `start`, then write the table as a TSV.
genotypes_path = '../data/drimseq_tuqtl_input/LCLs_studied_cell_lines.genes_splicing_order.extended.formatted_for_drimseq.txt'
genotypes_df = (
    pd.read_table(genotypes_path)
    .drop(columns=['REF', 'ALT'])
    .rename(columns={'CHROM': 'chr', 'START': 'start', 'ID': 'snp_id'})
)
# `end` is placed at column position 2 and copies `start`.
genotypes_df.insert(loc=2, column='end', value=genotypes_df['start'])
genotypes_df.to_csv(
    '../data/drimseq_tuqtl_input/LCL_genotypes.tsv',
    index=None,
    sep='\t',
)

In [32]:
# Rewrite the gene ids of the bed file (4th column) so they match the unique
# gene ids used in the counts file. A bed row is emitted once for every unique
# counts id whose part before the first '/' equals the bed gene id.
bed = pd.read_table(
    '../data/drimseq_tuqtl_input/NCBI_RefSeq_hg38_genes_parsed.bed',
    header=None,
)
merged_interm_counts = pd.read_table(
    '../data/drimseq_tuqtl_input/LCL_interm_counts_tuqtl.tsv'
)

# Unique ids from the counts file and the standard gene id they start with
uniq_gene_id = pd.Series(merged_interm_counts['gene_id'].unique())
gene_id = uniq_gene_id.str.split('/').str[0]

# Restrict the bed file to genes present in the counts file
is_wanted_gene = bed.iloc[:, 3].isin(gene_id)
bed = bed[is_wanted_gene]

# standard gene id -> list of unique gene ids, in order of appearance
gene_id_dict = defaultdict(list)
for standard_id, unique_id in zip(gene_id, uniq_gene_id):
    gene_id_dict[standard_id].append(unique_id)

# Duplicate each bed row once per unique gene id, swapping in that id
new_rows_list = []
for row_position in range(len(bed)):
    bed_row = bed.iloc[row_position]
    for new_gene_id in gene_id_dict.get(bed_row.iloc[3], ()):
        new_row = bed_row.copy()
        new_row[3] = new_gene_id
        new_rows_list.append(new_row)

new_df = pd.DataFrame(new_rows_list)

new_df.to_csv(
    '../data/drimseq_tuqtl_input/NCBI_RefSeq_hg38_genes_parsed_uniq_id.bed',
    index=None,
    sep='\t',
    header=None,
)

In [35]:
# Fix counts
def interm_counts_to_espresso_format(file_path,
                                     gene_ID,
                                     transcript_ID,
                                     transcript_name,
                                     columns,
                                     values,
                                     aggfunc='mean'):
    """Pivot a long counts table into a wide table with one row per transcript.

    The table at ``file_path`` is read with ``pd.read_table``. Three id columns
    (``gene_ID``, ``transcript_ID``, ``transcript_name``) are built by casting
    the given source columns to ``str`` and joining them with ``'/'``. The
    table is then pivoted so that each distinct value (or combination of
    values) of ``columns`` becomes one output column holding ``values``.

    Parameters
    ----------
    file_path : str
        Path of the delimited input table.
    gene_ID, transcript_ID, transcript_name : list of str
        Source columns whose values are joined with ``'/'`` to form the
        respective id column.
    columns : str or list of str
        Column(s) whose values become the output columns.
    values : str
        Column holding the numbers to place in the pivoted cells.
    aggfunc : str or callable, default 'mean'
        How rows sharing the same ids and ``columns`` values are combined.
        The default is the same as ``pivot_table``'s own default, i.e. what
        this function did before the parameter existed. NOTE(review): for raw
        counts with duplicated rows ``'sum'`` may be the intended choice;
        confirm against the input data.

    Returns
    -------
    pandas.DataFrame
        Columns ``transcript_ID``, ``transcript_name``, ``gene_ID`` followed
        by the pivoted columns; missing combinations are filled with 0. When
        ``columns`` lists several fields the pivoted column labels are tuples.
    """
    df = pd.read_table(file_path)

    def _join_columns(column_names):
        # Cast to str so that non-string columns can be joined as well.
        return df[column_names].astype(str).agg('/'.join, axis=1)

    # Assigned one after another, in this order, so a later id may be built
    # from an id column created just before it.
    df['gene_ID'] = _join_columns(gene_ID)
    df['transcript_ID'] = _join_columns(transcript_ID)
    df['transcript_name'] = _join_columns(transcript_name)  # same as transcript_ID

    esp_df = df.pivot_table(index=['transcript_ID', 'transcript_name', 'gene_ID'],
                            columns=columns,
                            values=values,
                            aggfunc=aggfunc,
                            fill_value=0)

    # Swap a possible MultiIndex for a flat index of its raw labels (tuples),
    # so that reset_index adds the id columns under plain string names.
    esp_df.columns = esp_df.columns.values

    return esp_df.reset_index()



# Pivot the filtered per-allele intermediate counts to one row per feature
df = interm_counts_to_espresso_format(
    file_path='../data/splicing_order_stringent_version/LCLs_merged_interm_counts_per_allele_filtered_min_2_sig_cell_lines.txt',
    gene_ID=['gene', 'analyzed_introns', 'level'],
    transcript_ID=['gene', 'pattern2'],
    transcript_name=['gene', 'pattern2'],
    columns=['cell_line', 'allele'],
    values='count_pattern2',
)

# Tuple column labels are collapsed to 'first_second'; plain labels stay as is
df.columns = [
    col if not isinstance(col, tuple) else '_'.join(col)
    for col in df.columns
]

# transcript_name is not needed; the id columns get their output names
df = (
    df
    .drop(columns='transcript_name')
    .rename(columns={'transcript_ID': 'feature_id', 'gene_ID': 'gene_id'})
)

df.to_csv(
    '../data/drimseq_tuqtl_input/LCLs_merged_interm_counts_per_allele_filtered_min_2_sig_cell_lines_3_isoforms.tsv',
    sep='\t',
    index=None,
)