In [31]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [7]:
# Load the important TFBS files list
with open('/home/campus.stonybrook.edu/pdutta/Github/Postdoc/DNABERT_data_processing/TFBS/all_selected_tfbs.txt', 'r') as file:
    important_tfbs= file.read().splitlines()

In [8]:
important_tfbs

['CEBPA ',
 'CTCF ',
 'FOS ',
 'H2AFZ ',
 'H2AK5ac ',
 'H2BK12ac ',
 'H2BK5ac ',
 'H3F3A ',
 'H3K14ac ',
 'H3K27ac ',
 'H3K27me3 ',
 'H3K36me3 ',
 'H3K4me1 ',
 'H3K4me2 ',
 'H3K79me1 ',
 'H3K79me2 ',
 'H3K9ac ',
 'H3K9me1 ',
 'H3K9me3 ',
 'H4K20me1 ',
 'H4K5ac ',
 'H4K8ac ',
 'IKZF1 ',
 'RAD21 ',
 'RBM22 ',
 'RUNX3 ',
 'TRIM22 ',
 'ZBTB33 ',
 'ZNF143 ',
 'CEBPB ',
 'EGR2 ',
 'NFE2 ',
 'PRDM1 ',
 'SCRT2 ',
 'ZFHX2 ',
 'ZNF121 ',
 'ZNF366 ',
 'ZNF770 ']

In [32]:
def seq2kmer(seq, k=6):
    """
    Convert original sequence to kmers.
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [37]:
def process_bed_file(file_path):
    df = pd.read_csv(file_path, sep="\t")
    df.rename(columns={'label': 'Label'}, inplace=True)
    print(df.groupby(['Label']).size())
    g = df.groupby('Label')
    df_balanced = g.apply(lambda x: x.sample(n=min(len(x), 50000), random_state=42)).reset_index(drop=True)
    
    # Save the balanced DataFrame to a new BED file
    balanced_bed_path = file_path.replace('.bed', '_balanced.bed')
    df_balanced.to_csv(balanced_bed_path, sep='\t', index=False)
    print(f"Balanced BED saved to {balanced_bed_path}")
    
    # Extract k-mers and save to a separate file
    df_balanced['Sequence'] = df_balanced['sequence'].apply(seq2kmer) 
    df_shuffle = shuffle(df_balanced[['Sequence','Label']]).reset_index(drop=True)
    
    train, test = train_test_split(df_shuffle, test_size=0.15)
    print (train.shape, test.shape)
    print(train.groupby(['Label']).size(), test.groupby(['Label']).size() )
    
    
    # Create a directory named after the bed file
    base_path = os.path.dirname(file_path)
    dir_path = os.path.join(base_path, '300bp_balanced')
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    
    # Save the train and dev files
    train.to_csv(os.path.join(dir_path, 'train.tsv'), sep='\t', index=False)
    test.to_csv(os.path.join(dir_path, 'dev.tsv'), sep='\t', index=False)

In [38]:
# def split_and_save(dataset, base_path, bed_file_name):
#     df = pd.DataFrame(dataset, columns=['Sequence', 'Label'])
#     print(df.groupby(['Label']).size())
#     df = df.groupby('Label').apply(lambda x: x.sample(n=min(len(x), 50000), random_state=42)).reset_index(drop=True)
#     print(df.groupby(['Label']).size())

#     # Split the dataset
#     X_train, X_test, y_train, y_test = train_test_split(df['Sequence'], df['Label'], test_size=0.2, random_state=42)
    
#     # Prepare train and dev (test) DataFrames
#     train_df = pd.DataFrame({'Sequence': X_train, 'Label': y_train})
#     dev_df = pd.DataFrame({'Sequence': X_test, 'Label': y_test})
    
#     # Create a directory named after the bed file
#     base_path = os.path.dirname(file_path)
#     dir_path = os.path.join(base_path, '300bp_balanced')
#     if not os.path.exists(dir_path):
#         os.makedirs(dir_path)
    
#     # Save the train and dev files
#     train_df.to_csv(os.path.join(dir_path, 'train.tsv'), sep='\t', index=False)
#     dev_df.to_csv(os.path.join(dir_path, 'dev.tsv'), sep='\t', index=False)

In [39]:
base_folder_path= "/data/projects/DNABERT_snv/Manuscript_11_2023/TFBS_fine_tune_data/Data_Jan_2024/cleaned_names_0524/1_1_pos_neg"
for tfbs in important_tfbs:
    tfbs= tfbs.strip()
    file_path = f"{base_folder_path}/{tfbs.strip()}/300bp_unique.bed"
    if os.path.isfile(file_path):
        print(f"300bp.bed of the folder {tfbs} is processing.")
        dataset = process_bed_file(file_path)
        #split_and_save(dataset, f"{base_folder_path}/{tfbs}", "300bp_unique.bed")
    else:
        print(f'300bp.bed file is not still generated in the folder {tfbs}.')

# for root, dirs, files in os.walk(folder_path):
#     # Skip .ipynb_checkpoints directories
#     if '.ipynb_checkpoints' in root:
#         continue
#     for file in files:
#         if file.endswith('.bed'):
#             file_path = os.path.join(root, file)
#             dataset = process_bed_file(file_path)
#             split_and_save(dataset, root, file)

300bp.bed of the folder CEBPA is processing.
Label
0    58877
1    57842
dtype: int64
Balanced BED saved to /data/projects/DNABERT_snv/Manuscript_11_2023/TFBS_fine_tune_data/Data_Jan_2024/cleaned_names_0524/1_1_pos_neg/CEBPA/300bp_unique_balanced.bed
(85000, 2) (15000, 2)
Label
0    42415
1    42585
dtype: int64 Label
0    7585
1    7415
dtype: int64
300bp.bed of the folder CTCF is processing.
Label
0    9123823
1     249010
dtype: int64
Balanced BED saved to /data/projects/DNABERT_snv/Manuscript_11_2023/TFBS_fine_tune_data/Data_Jan_2024/cleaned_names_0524/1_1_pos_neg/CTCF/300bp_unique_balanced.bed
(85000, 2) (15000, 2)
Label
0    42527
1    42473
dtype: int64 Label
0    7473
1    7527
dtype: int64
300bp.bed of the folder FOS is processing.
Label
0    503537
1    169317
dtype: int64
Balanced BED saved to /data/projects/DNABERT_snv/Manuscript_11_2023/TFBS_fine_tune_data/Data_Jan_2024/cleaned_names_0524/1_1_pos_neg/FOS/300bp_unique_balanced.bed
(85000, 2) (15000, 2)
Label
0    42478
1   