# Extract splice event features
Given a set of rMATS junction count files, aggregate the splice events across samples and cell lines, then remove duplicate events. Write the aggregated events to file.

**Runtime**: about 5-6 minutes (I/O limited; roughly 1 minute to copy the files and 4.5 minutes to aggregate them)

In [2]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

import os
import sys
sys.path.append('..')

from splintr import splice
from splintr.splice import SpliceData

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from shutil import copyfile
from tqdm.autonotebook import tqdm

import multiprocessing
from multiprocessing import Pool

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 41.2 ms


In [3]:
def remove_duplicate_events(df):
    """Drop rows that repeat an earlier row within a positional column window.

    Rows are compared only on the columns from position 5 up to (but not
    including) the last 14 columns; every other column is ignored when
    deciding whether two rows are the same event. The first occurrence of
    each distinct combination is kept.
    NOTE(review): the window presumably spans the event coordinate columns
    of the junction-count table - confirm against the input file layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of events to de-duplicate.

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``df``, with their original index labels and
        all original columns.
    """
    comparison_columns = df.iloc[:, 5:-14]
    is_first_occurrence = ~comparison_columns.duplicated()
    return df.loc[is_first_occurrence]

time: 1.33 ms


In [15]:
# Directory layout used by the rest of the notebook.
rmats_dir = '../data/encore_rmats'   # copies, organised as {cell_line}/{event}/{rbp}.MATS...
output_dir = '../data/features'      # aggregated event tables are written here

events = ['A3SS', 'A5SS', 'MXE', 'RI', 'SE']

raw_dir = '../data/encore_raw'

# The aggregation step writes into output_dir; create it up front so a fresh
# checkout survives Restart & Run All.
os.makedirs(output_dir, exist_ok=True)

# Copy every experiment's junction-count tables from raw_dir into rmats_dir.
# The experiment directory name is split on '-': the first tag is used as the
# RBP name and the last tag as the cell line. Only HepG2 and K562 are kept.
for experiment in tqdm(os.listdir(raw_dir)):
    experiment_tags = experiment.split('-')
    cell_line = experiment_tags[-1]
    if cell_line not in ('HepG2', 'K562'):
        continue

    rbp_name = experiment_tags[0]
    for event in events:
        orig_file = f'{raw_dir}/{experiment}/MATS_Norm_output/{event}.MATS.JunctionCountOnly.txt'

        # copyfile does not create missing parent directories, so make sure
        # the destination folder exists before copying.
        dest_dir = f'{rmats_dir}/{cell_line}/{event}'
        os.makedirs(dest_dir, exist_ok=True)

        copy_file = f'{dest_dir}/{rbp_name}.MATS.JunctionCountOnly.txt'
        copyfile(orig_file, copy_file)

HBox(children=(IntProgress(value=0, max=2202), HTML(value='')))

time: 1min 1s


In [20]:
# Iterate over event -> cell line -> RBP
def generate_features(event):
    """Aggregate and de-duplicate the junction-count tables of one event type.

    For every cell-line directory found under ``rmats_dir``, each file in
    ``{rmats_dir}/{cell_line}/{event}`` is read as a tab-separated table.
    Per file, one randomly chosen row with ``FDR > 0.1`` is kept as a
    background control (labelled ``sample='bg'``) together with all rows
    with ``FDR < 0.1`` (labelled with the sample name parsed from the file
    name). The rows are concatenated per cell line, de-duplicated with
    ``remove_duplicate_events`` and written to
    ``{output_dir}/{cell_line}_{event}.txt``. The per-cell-line tables are
    then concatenated, de-duplicated again and written to
    ``{output_dir}/{event}.txt``.

    Parameters
    ----------
    event : str
        Name of the event-type sub-directory to process (e.g. ``'SE'``).

    Returns
    -------
    None
        All results are written to tab-separated files in ``output_dir``.

    Notes
    -----
    The background row is drawn with ``np.random.randint`` and no seed is
    set here, so the selected row can differ between runs.
    """
    cell_lines = os.listdir(f'{rmats_dir}')

    # cell line
    event_all_cell_dataset = []
    for cell_line in tqdm(cell_lines, position=1, desc='Cell lines', leave=False):
        jc_files = os.listdir(f'{rmats_dir}/{cell_line}/{event}')

        # sample
        cell_datasets = []
        for jc_file in tqdm(jc_files, position=2, desc='Samples', leave=False):
            jc_filepath = f'{rmats_dir}/{cell_line}/{event}/{jc_file}'
            sample_name = jc_file.split('.')[0] # parse sample name

            # Load junction counts data
            jc = pd.read_csv(jc_filepath, sep='\t')
            jc['event'] = event

            sample_parts = []

            # Set aside one non-alternatively spliced event as control.
            # Guard against files without any such row: randint(0) raises
            # ValueError, so the control is simply omitted in that case.
            bg_jc = jc.loc[jc['FDR'] > 0.1]
            if bg_jc.shape[0] > 0:
                bg_jc = bg_jc.iloc[np.random.randint(bg_jc.shape[0], size=1)]
                # .assign returns a new frame, avoiding assignment into a slice.
                sample_parts.append(bg_jc.assign(sample='bg'))

            # Alternatively spliced events. Both comparisons are strict, so
            # rows with FDR exactly 0.1 (or a missing FDR) land in neither group.
            sample_parts.append(jc.loc[jc['FDR'] < 0.1].assign(sample=sample_name))

            cell_datasets.append(pd.concat(sample_parts))

        # Combine all events for event type
        cell_datasets = pd.concat(cell_datasets, ignore_index=True)

        # Remove duplicates
        cell_datasets = remove_duplicate_events(cell_datasets)
        event_all_cell_dataset.append(cell_datasets)

        # Write to file
        file_prefix = f'{output_dir}/{cell_line}_{event}'
        cell_datasets.to_csv(f'{file_prefix}.txt', sep='\t', index=False)

    # Combine all events for given event type across all cell lines
    event_all_cell_dataset = pd.concat(event_all_cell_dataset, ignore_index=True)

    # remove duplicates
    event_all_cell_dataset = remove_duplicate_events(event_all_cell_dataset)
    event_all_cell_dataset.to_csv(f'{output_dir}/{event}.txt', sep='\t', index=False)
    
# Process the event types in parallel, one task per event type.
# The context manager shuts the worker processes down when the block exits,
# so re-running this cell does not leave idle workers behind. Wrapping the
# iterator in list() makes sure every task has finished before that happens.
with Pool(4) as p:
    list(tqdm(p.imap_unordered(generate_features, events), total=len(events)))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

[None, None, None, None, None]

time: 4min 26s
