In [None]:
import pandas as pd
import numpy as np
import os
import subprocess
# Ask git for the top-level directory of the work tree this notebook lives in.
root_path = (
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        stdout=subprocess.PIPE,
        check=True,
    )
    .stdout.decode("utf-8")
    .strip()
)
# Make the repository root the working directory so relative paths used later
# resolve from there.
os.chdir(root_path)
# Echo the directory we ended up in.
print(os.getcwd())

In [None]:
def combine_df(
    deg_file_path,
    rmats_file_path,
    padj_threshold=0.05,
    output_dir="./data/08.intersection",
):
    """Join a DESeq2 table with an rMATS table on GeneID and save the result.

    Both inputs are read as tab-separated files. Every DESeq2 column gets a
    ``_deseq`` suffix and every rMATS column a ``_rmats`` suffix. rMATS rows
    with a missing ``IncLevelDifference`` are dropped, the two tables are
    inner-joined on ``GeneID``, and only rows with ``padj < padj_threshold``
    are kept. The result is written as a tab-separated file (no index column)
    into ``output_dir``.

    The output file name is ``<deg>_<rmats>``, where ``<deg>`` is the DESeq2
    file name with ``DEG_DESeq2_`` and ``.tsv`` removed and ``<rmats>`` is the
    rMATS file name with ``MATS.JCEC.`` removed.

    Parameters
    ----------
    deg_file_path : str
        Path to the DESeq2 result table. It must contain the columns GeneID,
        GeneName, log2FoldChange, pvalue and padj.
    rmats_file_path : str
        Path to the rMATS table. It must contain the columns GeneID,
        IncLevel1, IncLevel2 and IncLevelDifference.
    padj_threshold : float, optional
        Rows are kept only when the DESeq2 ``padj`` is strictly below this
        value. Defaults to 0.05.
    output_dir : str, optional
        Directory the result is written to; created if it does not exist.
        Defaults to ``./data/08.intersection``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from an input table.
    """
    # Columns written to the output, in output order.
    keep_columns = [
        "GeneID_deseq",
        "GeneName_deseq",
        "log2FoldChange_deseq",
        "pvalue_deseq",
        "padj_deseq",
        "IncLevel1_rmats",
        "IncLevel2_rmats",
        "IncLevelDifference_rmats",
    ]

    # read in the DEG and rMATS files
    deseq_df = pd.read_csv(deg_file_path, delimiter="\t")
    rmats_df = pd.read_csv(rmats_file_path, delimiter="\t")

    # tag every column with its origin so the two tables cannot collide
    deseq_df.columns = [str(col) + "_deseq" for col in deseq_df.columns]
    rmats_df.columns = [str(col) + "_rmats" for col in rmats_df.columns]

    # drop rMATS rows without an inclusion-level difference; notna() also
    # works when the column was parsed as object dtype, where np.isnan raises
    rmats_df = rmats_df[rmats_df["IncLevelDifference_rmats"].notna()]

    # inner join on GeneID: only genes present in both tables survive
    df = pd.merge(
        deseq_df, rmats_df, left_on="GeneID_deseq", right_on="GeneID_rmats", how="inner"
    )
    # keep rows below the adjusted p-value threshold (NaN padj compares False
    # and is therefore dropped as well)
    df = df[df["padj_deseq"] < padj_threshold]
    df = df[keep_columns]

    # build the output file name from the two input file names
    deg_filename = (
        os.path.basename(deg_file_path).replace("DEG_DESeq2_", "").replace(".tsv", "")
    )
    rmats_filename = os.path.basename(rmats_file_path).replace("MATS.JCEC.", "")
    output_filename = f"{deg_filename}_{rmats_filename}"

    # make sure the destination exists so the function also works when it is
    # called before any other cell has created the directory
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_filename)
    df.to_csv(output_path, sep="\t", index=False)


# Resolve the repository root through git and make it the working directory so
# the relative ./data paths below resolve from there. Repeating this is
# harmless if an earlier cell already did it.
root_path = (
    subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    .decode("utf-8")
    .strip()
)
os.chdir(root_path)

deg_dir = "./data/07.DEG"
rmats_dir = "./data/06.rMATs"
intersection_dir = "./data/08.intersection"

# DESeq2 tables located directly in deg_dir (os.listdir does not descend into
# sub-directories) whose name matches DEG_DESeq2*.tsv
deg_files = [
    os.path.join(deg_dir, f)
    for f in os.listdir(deg_dir)
    if f.startswith("DEG_DESeq2") and f.endswith(".tsv")
]

# every *JCEC.txt file anywhere below rmats_dir
rmats_files = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(rmats_dir)
    for f in files
    if f.endswith("JCEC.txt")
]

os.makedirs(intersection_dir, exist_ok=True)

# Label of each rMATS file: the first path component below rmats_dir with
# "_vs_" removed (e.g. "A_vs_B" -> "AB"). Computed once, and via relpath so it
# does not depend on "/" separators or on the depth of rmats_dir.
rmats_labels = [
    (
        rmats_file,
        os.path.relpath(rmats_file, rmats_dir).split(os.sep)[0].replace("_vs_", ""),
    )
    for rmats_file in rmats_files
]

# Pair each DESeq2 table with the rMATS files of the same comparison.
# The DESeq2 key is the third "_"-separated field of the file name with ".tsv"
# and "vs" removed and then REVERSED (e.g. "DEG_DESeq2_BvsA.tsv" -> "BA" ->
# "AB"); a pair is processed when that key is a substring of the rMATS label.
for deg_file in deg_files:
    name_parts = os.path.basename(deg_file).split("_")
    if len(name_parts) < 3:
        # e.g. "DEG_DESeq2.tsv" passes the filter above but carries no key
        print(
            f"Skipping {deg_file}: expected at least three '_'-separated "
            "parts in the file name"
        )
        continue
    deg_key = name_parts[2].replace(".tsv", "").replace("vs", "")[::-1]
    if not deg_key:
        # an empty key is a substring of every label and would match everything
        print(f"Skipping {deg_file}: empty comparison key")
        continue
    for rmats_file, rmats_label in rmats_labels:
        if deg_key in rmats_label:
            combine_df(deg_file, rmats_file)

In [None]:
# Sort the intersection tables into one sub-folder per comparison prefix:
# files whose name starts with "AB" go to AB/, "BC" to BC/, "AC" to AC/.
folders = ["AB", "BC", "AC"]
for folder in folders:
    folder_path = os.path.join("./data/08.intersection", folder)
    os.makedirs(folder_path, exist_ok=True)

# move files to corresponding folders
for root, dirs, files in os.walk("./data/08.intersection"):
    for f in files:
        for folder in folders:
            if f.startswith(folder):
                src = os.path.join(root, f)
                dst = os.path.join(f"./data/08.intersection/{folder}", f)
                # os.walk also visits the destination folders; leave files
                # that already sit in the right place untouched
                if os.path.normpath(src) != os.path.normpath(dst):
                    # os.replace overwrites a stale copy from an earlier run
                    os.replace(src, dst)
                # the file has been handled; do not test further prefixes
                break

# Export to ./results/: a zip archive plus a plain copy of the directory.
os.makedirs("./results", exist_ok=True)
# Delete an old archive first, because zip updates an existing archive in
# place and entries from a previous run would otherwise survive.
if os.path.exists("./results/08.intersection.zip"):
    os.remove("./results/08.intersection.zip")
# check=True turns a failing zip/cp into an exception instead of letting the
# cell finish as if the export had succeeded
subprocess.run(
    ["zip", "-r", "./results/08.intersection.zip", "./data/08.intersection"],
    check=True,
)
# NOTE(review): an existing ./results/08.intersection is copied into, not
# replaced, so files removed from ./data since the last run remain there.
subprocess.run(["cp", "-r", "./data/08.intersection", "./results/"], check=True)