In [2]:
import os, glob
import pandas as pd
import pysam
import numpy as np
import pickle

In [3]:
cancer_type = "Brain"
intersected_base_path = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/Intersected_Data/TFBS_remaining".format(cancer_type)
reference_genome_path="/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"

In [4]:
output_path = "/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/TFBS_remaining"
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [5]:
top_models_df= pd.read_csv("/home/campus.stonybrook.edu/pdutta/Github/Postdoc/DNABERT_data_processing/TFBS/top_TFBS_models.tsv", sep="\t") 
top_models_df

Unnamed: 0,tags,config,eval_acc
0,RBM14,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",0.968908
1,SAFB,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",0.968090
2,TAF15,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",0.959936
3,SPI1,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",0.957358
4,USF1,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",0.951848
...,...,...,...
134,ESRRA,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",0.851964
135,ZNF652,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",0.851404
136,ATF3,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 4...",0.851240
137,RBM25,"{'epochs': 10, 'Dropout': 0.1, 'batch_size': 1...",0.850664


In [6]:
def seq2kmer(seq):
    """
    Convert original sequence to kmers
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    k=6
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [7]:
def get_sequences(df, reference_fasta):
    data = []
    for idx, row in df.iterrows():
        #print(row)
        chrom = row[0]
        ref_start = row[1]
        ref_end = row[2]
        variant_start = row['START_POS']
        variant_end = row['END_POS']
        ref_nucleotide = row['REF']
        alt = row["ALT"]
        
        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start
        #print(ref_nucleotide , alt, variant_pos_start, variant_pos_end)
        
        
        # Get reference sequence
        #print(chrom)
        ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)
        #print(ref_seq)
        
        # Handle insertion and deletion to get the correct alt sequence
        # Identify if the variant is an insertion or deletion
        if len(ref_nucleotide) < len(alt):  # Insertion
            delete_size =  len(alt) - len(ref_nucleotide)
            #print(variant_pos_start, variant_pos_end ,delete_size)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:len(ref_seq) - delete_size]

        elif len(ref_nucleotide) > len(alt):  # Deletion
            insert_size = len(ref_nucleotide) - len(alt)
            #print(insert_size)
            extra_bases = reference_fasta.fetch(chrom, ref_end, ref_end + insert_size)
            #print(extra_bases)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        else:  # SNV
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:]
        # print(ref_seq)
        # print(alt_seq)
        # input()


        data.append({
            'chr': chrom,
            #'strand': row['strand'],
            #'Transcript_ID': row['transcript_id'],
            'Chip_Seq_start': row[0],
            'Chip_Seq_end': row[1],
            'varinat_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_nucleotide,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })
            
    
    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(data)
    # print(new_df.shape)
    # print(new_df)
    new_df = new_df.drop_duplicates().reset_index()
    #print(new_df.shape)
    #print("&*")
    data = []
    merged_list = list(zip(new_df['reference_seq'], new_df['alt_seq']))
    merged_list = [item.upper() for tup in merged_list for item in tup]
    #print(merged_list)
    kmer_lst = list(map(seq2kmer, merged_list))
    df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
    df_kmer['Label'] = np.random.choice([0, 1], size=len(df_kmer))
    #print(df_kmer.shape)
    return new_df, df_kmer

In [8]:
## Load the reference file
reference_fasta = pysam.FastaFile(reference_genome_path)
print("#####The reference file is loaded######")

#####The reference file is loaded######


In [9]:
missing_files = []



for index, row in top_models_df.iterrows():
    intersected_data = f"{intersected_base_path}/{row['tags']}/intersected_vcf_data.pkl"
    print(intersected_data)
    
    try:
        with open(intersected_data, "rb") as file:
            loaded_dictionary = pickle.load(file)
        print(f"****All the VCF files are loaded for {row['tags']}***") 

        dnabert_raw_data = {}
        for key, value in loaded_dictionary.items():
            new_df, df_kmer = get_sequences(value, reference_fasta)
            dnabert_raw_data[key] = new_df

            out_folder_path = f"{output_path}/{row['tags']}/Patient_wise/{key}"
            if not os.path.exists(out_folder_path):
                os.makedirs(out_folder_path)
            df_kmer.to_csv(out_folder_path + "/dev.tsv", sep="\t", index=False)
        
        with open(f"{output_path}/{row['tags']}/raw_vcf_data.pkl", "wb") as file:
            pickle.dump(dnabert_raw_data, file)

    except FileNotFoundError:
        print(f"File not found for {row['tags']}, skipping...")
        missing_files.append(row['tags'])

# Optionally, save the list of missing files/tags to a text file
missing_files_path = f"{output_path}/missing_files.txt"
with open(missing_files_path, "w") as f:
    for tag in missing_files:
        f.write(f"{tag}\n")

print(f"Missing files recorded at {missing_files_path}")

/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/Intersected_Data/TFBS_remaining/RBM14/intersected_vcf_data.pkl
File not found for RBM14, skipping...
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/Intersected_Data/TFBS_remaining/SAFB/intersected_vcf_data.pkl
File not found for SAFB, skipping...
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/Intersected_Data/TFBS_remaining/TAF15/intersected_vcf_data.pkl
****All the VCF files are loaded for TAF15***
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/Intersected_Data/TFBS_remaining/SPI1/intersected_vcf_data.pkl
****All the VCF files are loaded for SPI1***
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/Intersected_Data/TFBS_remaining/USF1/intersected_vcf_data.pkl
****All the VCF files are loaded for USF1***
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/Intersected_Data/TFBS_remaining/ZNF426/intersected_vcf_data.pkl
File not found for ZNF426, skipping...
/d

In [42]:
len(missing_files)

100

In [None]:
6