This script loads the *_tads.csv files made by Kacper and combines them, per category, into .bedpe files to be fed to Juicer

Change folder_path if needed

In [25]:
import os
import glob
import pandas as pd

# Define the path to the folder containing the CSV files
folder_path = "intermediates/"


def category_from_path(path):
    """Return the category of a TAD file: its base name up to the first '_'.

    os.path.basename is used instead of splitting on '/' so the result is
    correct whatever path separator the platform (and glob) produces.
    """
    return os.path.basename(path).split('_')[0]


def tads_to_bedpe(df):
    """Build the six-column BEDPE-style frame for one loaded TAD table.

    The text of 'community' before its first '_' fills both CHR1 and CHR2,
    and 'start'/'end' fill both the X and the Y intervals, so each row
    describes the same interval on both axes.
    """
    chromosome = df['community'].str.split('_').str[0]
    return pd.DataFrame({
        'CHR1': chromosome,
        'X1': df['start'],
        'X2': df['end'],
        'CHR2': chromosome,
        'Y1': df['start'],
        'Y2': df['end'],
    })


def load_category_frames(paths):
    """Load every CSV in *paths* and merge the converted rows per category.

    Files without data rows (header only, or completely empty) are skipped.
    Returns a dict mapping category name -> DataFrame with a fresh 0..n-1
    index; categories appear in the order they are first met in *paths*.
    """
    frames_by_category = {}
    for path in paths:
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; treat it like any
            # other empty table instead of aborting the whole run.
            continue

        # Skip if DataFrame is empty
        if df.empty:
            continue

        frames_by_category.setdefault(category_from_path(path), []).append(
            tads_to_bedpe(df)
        )

    # Concatenate once per category rather than after every file, which
    # would copy the accumulated rows again and again.
    return {
        category: pd.concat(frames, ignore_index=True)
        for category, frames in frames_by_category.items()
    }


# Retrieve all files ending with '_tads.csv' in the specified folder.
# os.path.join tolerates a folder_path with or without a trailing separator,
# and sorting makes the row order of the merged output reproducible
# (glob itself returns matches in arbitrary order).
files = sorted(glob.glob(os.path.join(folder_path, "*_tads.csv")))

# Dictionary of merged DataFrames, one per category
category_dfs = load_category_frames(files)


# Create the output folder if it doesn't exist.
# exist_ok=True makes this a no-op when the folder is already there, without
# a separate existence check that could race with another process.
output_folder = "output"
os.makedirs(output_folder, exist_ok=True)

# Save each category DataFrame to a separate tab-separated file, without the
# row index. The path is built from output_folder so that changing the
# variable above changes where the files are written as well.
for category, df in category_dfs.items():
    output_path = os.path.join(output_folder, f"{category}_merged_output.bedpe")
    df.to_csv(output_path, sep='\t', index=False)