In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def seq2kmer(seq, k=6):
    """
    Tokenize a sequence into overlapping k-mers.

    Arguments:
    seq -- str, the sequence to tokenize.
    k -- int, length of each k-mer (default 6).

    Returns:
    kmers -- str, every length-k window of seq taken one position apart,
             in order, joined by single spaces. The result is an empty
             string when seq is shorter than k.
    """
    # Number of start positions at which a full window of length k fits.
    window_count = len(seq) + 1 - k
    return " ".join(seq[start:start + k] for start in range(window_count))

In [3]:
def process_bed_file(file_path):
    """
    Read a tab-separated file and turn each usable line into a (kmers, label) pair.

    Arguments:
    file_path -- path of the file to read.

    Returns:
    dataset -- list of (kmers, label) tuples in file order. For each line with
               at least six tab-separated fields, kmers is seq2kmer applied to
               the fourth field and label is the last field. Lines with fewer
               fields are skipped.
    """
    min_columns = 6  # lines with fewer fields are ignored
    with open(file_path, 'r') as handle:
        # Surrounding whitespace is stripped from each line before it is split on tabs.
        rows = (raw_line.strip().split('\t') for raw_line in handle)
        return [
            (seq2kmer(fields[3]), fields[-1])
            for fields in rows
            if len(fields) >= min_columns
        ]

In [4]:
def split_and_save(dataset, base_path, bed_file_name):
    """
    Split a dataset 80/20 into train and dev sets and write both as TSV files.

    Arguments:
    dataset -- sequence of (sequence, label) pairs.
    base_path -- directory under which the output directory is created.
    bed_file_name -- name of the source bed file. The output directory is this
                     name with a trailing '.bed' removed.

    Returns:
    None. Writes 'train.tsv' and 'dev.tsv' (tab-separated, with a
    'Sequence'/'Label' header row and no index column) into
    <base_path>/<bed file name without '.bed'>.

    Raises:
    ValueError -- if dataset has fewer than two rows, since a non-empty
                  train and dev set cannot both be produced.
    """
    df = pd.DataFrame(dataset, columns=['Sequence', 'Label'])

    # Fail early, before anything is written, with a message that names the file.
    if len(df) < 2:
        raise ValueError(
            f"Cannot split {bed_file_name!r}: need at least 2 rows, got {len(df)}"
        )

    # Splitting the frame directly selects the same rows as splitting its
    # columns and reassembling them; the fixed random_state keeps it reproducible.
    train_df, dev_df = train_test_split(df, test_size=0.2, random_state=42)

    # Strip only a trailing '.bed'; str.replace would also remove '.bed'
    # occurring elsewhere in the name (e.g. 'a.bedtools.bed' -> 'atools').
    suffix = '.bed'
    if bed_file_name.endswith(suffix):
        dir_name = bed_file_name[:-len(suffix)]
    else:
        dir_name = bed_file_name
    dir_path = os.path.join(base_path, dir_name)

    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(dir_path, exist_ok=True)

    # Save the train and dev files
    train_df.to_csv(os.path.join(dir_path, 'train.tsv'), sep='\t', index=False)
    dev_df.to_csv(os.path.join(dir_path, 'dev.tsv'), sep='\t', index=False)

In [5]:
# Root directory that is searched recursively for .bed files.
# NOTE: absolute, machine-specific path; adjust it for your environment.
folder_path = "/data/projects/DNABERT_snv/Manuscript_11_2023/TFBS_fine_tune_data/Data_Jan_2024/1_1_pos_neg"

for root, dirs, files in os.walk(folder_path):
    # Prune checkpoint directories in place so os.walk never descends into them.
    dirs[:] = [d for d in dirs if '.ipynb_checkpoints' not in d]
    for file in files:
        if not file.endswith('.bed'):
            continue
        # Each .bed file gets its own train/dev directory next to it.
        file_path = os.path.join(root, file)
        dataset = process_bed_file(file_path)
        split_and_save(dataset, root, file)