In [1]:
"""
Author: Wen-Jou Chang
Baylor College of Medicine

This script pre- and post-processes CoRSIVs and control regions.
"""

import ast
import os
import random

import matplotlib.pyplot as plt
import pandas as pd
from pybedtools import BedTool



In [None]:
# Count how many CoRSIVs share each distinct combination of metrics. The resulting table
# (most frequent combination first) is the input used to sample from the control lookup table.
# The annotated list of CoRSIVs can be obtained from CoRSIV_annotation.ipynb.
df = pd.read_csv("annotated_corsiv_all.csv")

columns_to_group_by = [
    'chr',
    'CpG_count',
    'block_size',
    'tss_count',
    'Gene_body_count',
    'tes_count',
    'Union Count',
]

# One row per observed combination; 'count' holds the number of CoRSIVs having it.
unique_combinations = (
    df.groupby(columns_to_group_by)
    .size()
    .reset_index(name='count')
)
sorted_combinations = unique_combinations.sort_values(by='count', ascending=False)
sorted_combinations.to_csv("corsiv_table_unique.csv", index=False)

In [None]:
# Match every control candidate to a CoRSIV and issue it a "<CoRSIV_ID>_<n>" Control_ID.
# A candidate is paired with the CoRSIVs that share its chromosome, block size and metric
# tuple ("ID"); the result is written to ../control/corsiv_control_matching.csv.

# Number of control regions issued per CoRSIV (Control_ID suffixes _1 ... _10).
CONTROLS_PER_CORSIV = 10
# Seed of the private shuffler below, so the matching is identical on every run.
CONTROL_SHUFFLE_SEED = 0

# read and cleanup corsivs
corsivs = pd.read_csv("annotated_corsiv_all.csv")
# NOTE(review): the fifth column is dropped and the remaining ones are renamed purely by
# position -- confirm the CSV still has the layout this relies on.
corsivs = corsivs.drop(corsivs.columns[4], axis=1)
corsivs.columns = ['CoRSIV_ID', 'CoRSIV_chr', 'CoRSIV_start', 'CoRSIV_end', 'CoRSIV_CpG_count',
       'CoRSIV_block_size', 'CoRSIV_tss_count', 'CoRSIV_gb_count', 'CoRSIV_tes_count',
       'CoRSIV_probe_count']

# read and cleanup controls candidates
control_candidates = pd.read_csv("../control/control_candidates.csv")
# NOTE(review): positional drop/rename as well -- confirm the column layout of the lookup output.
control_candidates = control_candidates.drop(control_candidates.columns[[9, 10, 12, 13, 15, 16]], axis=1)
# "ID" is expected to hold the text of a tuple. literal_eval only parses Python literals,
# so (unlike eval) it cannot execute code embedded in the CSV file.
control_candidates["ID"] = control_candidates["ID"].apply(ast.literal_eval)
control_candidates.columns = ['Region ID', 'control_chr', 'control_start', 'control_end', 'control_block_size', 'control_CpG_count',
       'control_tss_count', 'control_gb_count', 'control_tes_count', 'control_probe_count', 'ID']
control_candidates["Control_ID"] = pd.NA

# make ID column for annotated_corsiv_all: the tuple of metrics compared against the controls' "ID"
corsivs["ID"] = corsivs.apply(lambda row: (row['CoRSIV_CpG_count'], row['CoRSIV_tss_count'], row['CoRSIV_gb_count'], row['CoRSIV_tes_count'], row['CoRSIV_probe_count']), axis=1)

# A dedicated generator keeps the shuffle reproducible without touching the global random state.
shuffler = random.Random(CONTROL_SHUFFLE_SEED)

# groupby chr, blocksize, ID
grouped = corsivs.groupby(by=['CoRSIV_chr', "CoRSIV_block_size", "ID"])
for (group_chr, group_bp, group_id), group in grouped:
    # expand each group's corsiv id list into 10 times _1, _2, ..., _10
    num_corsiv = len(group)
    curr_corsiv_id = list(group["CoRSIV_ID"])
    control_ids = [f"{item}_{i}" for item in curr_corsiv_id for i in range(1, CONTROLS_PER_CORSIV + 1)]
    shuffler.shuffle(control_ids)
    # find controls with corresponding chr, blocksize, ID and issue permuted control id to it
    matching_idx = control_candidates[(control_candidates["control_chr"] == group_chr) &
                                      (control_candidates["control_block_size"] == group_bp) &
                                      (control_candidates["ID"] == group_id)].index
    # The assignment below needs exactly one candidate per generated id; fail with a message
    # that names the offending group instead of pandas' generic length-mismatch error.
    if len(matching_idx) != len(control_ids):
        raise ValueError(
            f"Expected {len(control_ids)} control candidates "
            f"({CONTROLS_PER_CORSIV} for each of {num_corsiv} CoRSIVs) for "
            f"chr={group_chr}, block_size={group_bp}, ID={group_id}, "
            f"but found {len(matching_idx)}"
        )
    control_candidates.loc[matching_idx, 'Control_ID'] = control_ids

# Candidates that matched no CoRSIV group still hold pd.NA, which has no string methods.
unassigned = control_candidates["Control_ID"].isna()
if unassigned.any():
    raise ValueError(
        f"{int(unassigned.sum())} control candidates match no CoRSIV group "
        "(chr, block size, ID) and received no Control_ID"
    )

# Recover the CoRSIV id by stripping the trailing "_<n>" from the Control_ID.
control_candidates["CoRSIV_ID"] = control_candidates["Control_ID"].apply(lambda control_id: control_id.rsplit("_", 1)[0])
# The first column ('Region ID') is not carried into the matching table.
control_candidates1 = control_candidates.drop(control_candidates.columns[0], axis=1)
final_df = pd.merge(control_candidates1, corsivs, on=["CoRSIV_ID","ID"], how="outer")
# Move Control_ID to the front of the table.
target = final_df.pop('Control_ID')
final_df.insert(0, 'Control_ID', target)
final_df.to_csv("../control/corsiv_control_matching.csv", index=False) # matches control regions to CoRSIVs


In [20]:
# Tidy the CoRSIV/control matching table (drop the helper "ID" column, sort by Control_ID)
# and export every control region to a single BED file.

def _control_id_sort_key(control_id):
    """Return the first, second and last '_'-separated fields of a Control_ID as ints.

    Comparing the fields as integers rather than text orders e.g. a "_10" suffix after "_2".
    """
    fields = control_id.split('_')
    return (int(fields[0]), int(fields[1]), int(fields[-1]))


df = pd.read_csv("../control/corsiv_control_matching.csv")
# This cell overwrites its own input file, so on a re-run the "ID" column is already gone;
# errors="ignore" keeps the cell re-runnable instead of raising KeyError.
df = df.drop(columns=['ID'], errors='ignore')
df['sort_key'] = df['Control_ID'].apply(_control_id_sort_key)
df = df.sort_values(by='sort_key', ignore_index=True).drop(columns='sort_key')
df.to_csv("../control/corsiv_control_matching.csv", index=False)

# Export all control regions (every set together) as one headerless, tab-separated BED file.
# The frame in memory is exactly what was just written, so it is not read back from disk.
df[["control_chr", "control_start", "control_end", "Control_ID"]].to_csv("../control/control_candidates.bed", sep="\t", header=False, index=False)

In [None]:
# Split the combined control BED file into 10 per-set BED files,
# control_candidates_1.bed ... control_candidates_10.bed.
df = pd.read_csv("../control/control_candidates.bed", sep="\t", names=["chr", "start", "end", "id"])

# The set a region belongs to is the text after the last underscore of its id.
df['suffix'] = df['id'].map(lambda region_id: region_id.rsplit('_', 1)[-1])

# Map each set number (kept as a string, like the suffix) to the rows carrying that suffix.
dfs = {
    str(set_no): df[df['suffix'] == str(set_no)]
    for set_no in range(1, 11)
}

# Write every set as a headerless, tab-separated BED file without the helper column.
for suffix, data in dfs.items():
    bed_columns = data[["chr", "start", "end", "id"]]
    bed_columns.to_csv(f"../control/control_candidates_{suffix}.bed", index=False, sep="\t", header=False)
