In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
def seq2kmer(seq, k=6):
    """
    Split a sequence into overlapping k-mers and join them with spaces.

    Arguments:
    seq -- str, the input sequence.
    k -- int, length of each k-mer (default 6).

    Returns:
    kmers -- str, every length-k window of seq (stride 1, left to right)
             separated by single spaces; "" when seq is shorter than k.
    """
    # Number of windows that fit; zero or negative yields an empty range.
    n_windows = len(seq) + 1 - k
    pieces = []
    for start in range(n_windows):
        pieces.append(seq[start:start + k])
    return " ".join(pieces)

In [3]:
def process_bed_file(file_path, max_per_class=50000, k=6, test_size=0.15, random_state=42):
    """
    Down-sample a labelled, tab-separated BED-like table, convert its
    sequences to k-mers and write train/dev splits next to the input file.

    Arguments:
    file_path -- str, path to a tab-separated file with a header row that
                 contains a 'sequence' column and a 'label' (or 'Label') column.
    max_per_class -- int, upper bound on the number of rows kept per label.
                     Classes with fewer rows are kept whole, so the output is
                     only strictly balanced when every class reaches the cap.
    k -- int, k-mer length passed to seq2kmer.
    test_size -- float or int, forwarded to train_test_split.
    random_state -- int or None, seed used for sampling, shuffling and
                    splitting so that repeated runs produce identical files.

    Side effects:
    * writes '<file stem>_balanced<ext>' beside the input file;
    * writes 'train.tsv' and 'dev.tsv' into a '300bp_balanced' directory
      beside the input file (created if missing).

    Returns:
    None

    Raises:
    KeyError -- if the 'sequence' or 'Label'/'label' column is missing.
    ValueError -- if the file has no rows with a label.
    """
    df = pd.read_csv(file_path, sep="\t").rename(columns={'label': 'Label'})

    missing = [col for col in ('Label', 'sequence') if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Sample each label group separately and concatenate. Iterating the groups
    # (instead of groupby.apply) keeps the 'Label' column in every piece
    # regardless of how the installed pandas treats grouping columns in apply.
    sampled = [
        group.sample(n=min(len(group), max_per_class), random_state=random_state)
        for _, group in df.groupby('Label')
    ]
    if not sampled:
        raise ValueError(f"{file_path} contains no labelled rows to process")
    df_balanced = pd.concat(sampled, ignore_index=True)

    # Save the balanced DataFrame to a new BED file. Only the final extension
    # is touched, so directory names containing '.bed' are left alone and the
    # output path can never equal the input path.
    root, ext = os.path.splitext(file_path)
    balanced_bed_path = f"{root}_balanced{ext}"
    df_balanced.to_csv(balanced_bed_path, sep='\t', index=False)
    print(f"Balanced BED saved to {balanced_bed_path}")

    # Extract k-mers, then shuffle and split with a fixed seed for reproducibility.
    df_balanced['Sequence'] = df_balanced['sequence'].apply(lambda s: seq2kmer(s, k))
    df_shuffle = shuffle(
        df_balanced[['Sequence', 'Label']], random_state=random_state
    ).reset_index(drop=True)
    train, test = train_test_split(
        df_shuffle, test_size=test_size, random_state=random_state
    )

    # Output directory lives beside the input file; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    dir_path = os.path.join(os.path.dirname(file_path), '300bp_balanced')
    os.makedirs(dir_path, exist_ok=True)

    # Save the train and dev files
    train.to_csv(os.path.join(dir_path, 'train.tsv'), sep='\t', index=False)
    test.to_csv(os.path.join(dir_path, 'dev.tsv'), sep='\t', index=False)

In [4]:
folder_path = "/home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg"

# Names of the immediate subdirectories, skipping Jupyter's checkpoint folder.
subfolders = [
    entry.name
    for entry in os.scandir(folder_path)
    if entry.is_dir() and entry.name != '.ipynb_checkpoints'
]

# Process the 300bp_unique.bed file of every subfolder that already has one.
for subfolder in subfolders:
    file_path = os.path.join(folder_path, subfolder, '300bp_unique.bed')

    # Guard clause: report and move on when the input has not been produced.
    if not os.path.isfile(file_path):
        print(f'300bp.bed file is not still generated in the folder {subfolder}.')
        continue

    print(f"300bp.bed of the folder {subfolder} is processing.")
    # `dataset` holds whatever process_bed_file returns.
    dataset = process_bed_file(file_path)

300bp.bed of the folder MEF2D is processing.
Balanced BED saved to /home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg/MEF2D/300bp_unique_balanced.bed
300bp.bed of the folder ZNF132 is processing.
Balanced BED saved to /home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg/ZNF132/300bp_unique_balanced.bed
300bp.bed of the folder NR3C1 is processing.
Balanced BED saved to /home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg/NR3C1/300bp_unique_balanced.bed
300bp.bed of the folder SMARCA5 is processing.
Balanced BED saved to /home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg/SMARCA5/300bp_unique_balanced.bed
300bp.bed of the folder H2AK9ac is processing.
Balanced BED saved to /home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg/H2AK9ac/300bp_unique_balanced.bed
300bp.bed of the folder KDM5B is processing.
Balanced BED saved to /home/shared/rdavuluri/TFBS_Pallavi/ramana.cewit/1_1_pos_neg/KDM5B/300bp_unique_balanced.bed
300bp.bed of the folder IRF5 i