In [1]:
import os, glob
import pandas as pd
import pysam
import numpy as np
import pickle

In [2]:
cancer_type = "Brain"
intersected_base_path = "/data/private/pdutta_new/GDC_cancer_wise/{}/Generated_files/Intersected_files/100bp_TFBS/TFBS_test".format(cancer_type)
reference_genome_path="/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"

In [3]:
output_path = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/DNABERT_Data/Somatic/TFBS_300bp_test".format(cancer_type)
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [4]:
top_models_df= pd.read_csv("/home/campus.stonybrook.edu/pdutta/Github/Postdoc/DNABERT_data_processing/TFBS/important_tfbs.txt", sep="\t", header=None)
top_models_df

Unnamed: 0,0
0,FOS
1,H2AFZ
2,H2AK5ac
3,H2BK12ac
4,H2BK5ac
5,H3F3A
6,H3K14ac
7,H3K27ac
8,H3K27me3
9,H3K36me3


In [5]:
top_models_df = pd.DataFrame({0: ['CBFA2T3', 'POLR2A']})

top_models_df

Unnamed: 0,0
0,CBFA2T3
1,POLR2A


In [6]:
def seq2kmer(seq):
    """
    Convert original sequence to kmers
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    k=6
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [16]:
def apply_mutations_absolute(ref_seq, chip_start, starts, ends, refs, alts):
    mutations = sorted(zip(starts, ends, refs, alts), key=lambda x: x[0])  # Sort by start position
    seq_list = list(ref_seq)
    offset = chip_start  # This is the absolute starting position of the sequence on the chromosome
    #print(mutations, seq_list, offset)
    #input()

    # Apply each mutation
    for start, end, ref, alt in mutations:
        rel_pos = start - offset  # Calculate the relative position in the sequence
        #print(rel_pos)
        if ''.join(seq_list[rel_pos:rel_pos+len(ref)]) == ref:  # Check if the reference matches
            seq_list[rel_pos:rel_pos+len(ref)] = list(alt)  # Replace with the alternative
        #else:
            # If the reference doesn't match, log an error or handle it accordingly
            #print(f"Expected ref '{ref}' at position {rel_pos}, but found '{''.join(seq_list[rel_pos:rel_pos+len(ref)])}'")
    #print(''.join(seq_list))
    #input()

    return ''.join(seq_list)  # Convert list back to string

In [17]:
def get_sequences(df, reference_fasta):
    data = []
    for idx, row in df.iterrows():
        #print(row)
        chrom = row[0]
        ref_start = row[1]
        ref_end = row[2]
        variant_start = row['START_POS']
        variant_end = row['END_POS']
        ref_nucleotide = row['REF']
        alt = row["ALT"]
        
        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start
        #print(ref_nucleotide , alt, variant_pos_start, variant_pos_end)
        
        
        # Get reference sequence
        #print(chrom)
        ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)
        #print(ref_seq)
        
        # Handle insertion and deletion to get the correct alt sequence
        # Identify if the variant is an insertion or deletion
        if len(ref_nucleotide) < len(alt):  # Insertion
            delete_size =  len(alt) - len(ref_nucleotide)
            #print(variant_pos_start, variant_pos_end ,delete_size)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:len(ref_seq) - delete_size]

        elif len(ref_nucleotide) > len(alt):  # Deletion
            insert_size = len(ref_nucleotide) - len(alt)
            #print(insert_size)
            extra_bases = reference_fasta.fetch(chrom, ref_end, ref_end + insert_size)
            #print(extra_bases)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        else:  # SNV
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:]
        # print(ref_seq)
        # print(alt_seq)
        # input()


        data.append({
            'chr': chrom,
            #'strand': row['strand'],
            #'Transcript_ID': row['transcript_id'],
            'Chip_Seq_start': row[1],
            'Chip_Seq_end': row[2],
            'varinat_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_nucleotide,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })
            
    
    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(data)
    # print(new_df.shape)
    # print(new_df)
    new_df = new_df.drop_duplicates().reset_index()
    #print(new_df.shape)
    #print("&*")
    data = []
    merged_list = list(zip(new_df['reference_seq'], new_df['alt_seq']))
    merged_list = [item.upper() for tup in merged_list for item in tup]
    #print(merged_list)
    kmer_lst = list(map(seq2kmer, merged_list))
    df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
    df_kmer['Label'] = np.random.choice([0, 1], size=len(df_kmer))
    #print(df_kmer.shape)
    
    
    grouped_df = new_df.groupby(['chr', 'Chip_Seq_start', 'Chip_Seq_end']).agg({
                                'varinat_start': lambda x: list(x),  # Get unique starts
                                'variant_end': lambda x: list(x),    # Get unique ends
                                'ref_neucleotide': lambda x: list(x),  # Unique reference nucleotides
                                'alternative_neucleotide': lambda x: list(x),  # Unique alternative nucleotides
                                'reference_seq': 'first',  # Assuming you might want just the first sequence
                                'alt_seq':  lambda x: list(set(x))  # Assuming you might want all the alt sequence
                            }).reset_index()
    grouped_df['mutated_sequence'] = grouped_df.apply(lambda row: apply_mutations_absolute(
                row['reference_seq'], row['Chip_Seq_start'], row['varinat_start'],
                row['variant_end'], row['ref_neucleotide'], row['alternative_neucleotide']), axis=1)
    
    
    merged_list_regionwise = list(zip(grouped_df['reference_seq'], grouped_df['mutated_sequence']))
    merged_list_regionwise = [item.upper() for tup in merged_list_regionwise for item in tup]
    regionwise_kmer_list = list(map(seq2kmer, merged_list_regionwise))
    df_kmer_region = pd.DataFrame(regionwise_kmer_list, columns=['Sequence'])
    df_kmer_region['Label'] = np.random.choice([0, 1], size=len(df_kmer_region))
    return new_df, df_kmer, grouped_df, df_kmer_region

In [18]:
## Load the reference file
reference_fasta = pysam.FastaFile(reference_genome_path)
print("#####The reference file is loaded######")

#####The reference file is loaded######


In [19]:
missing_files = []



for index, row in top_models_df.iterrows():
    intersected_data = f"{intersected_base_path}/{row[0]}/intersected_vcf_data.pkl"
    print(intersected_data)
    
    try:
        with open(intersected_data, "rb") as file:
            loaded_dictionary = pickle.load(file)
        print(f"****All the VCF files are loaded for {row[0]}***") 

        dnabert_raw_data = {}
        dnabert_raw_data_regionwise = {}
        for key, value in loaded_dictionary.items():
            new_df, df_kmer, grouped_df, df_kmer_region = get_sequences(value, reference_fasta)
            dnabert_raw_data[key] = new_df
            dnabert_raw_data_regionwise[key] = grouped_df
            # display(df_kmer)
            # display(new_df)
            
            out_folder_path = f"{output_path}/{row[0]}/Patient_wise/variant_wise/{key}"
            if not os.path.exists(out_folder_path):
                os.makedirs(out_folder_path)
            df_kmer.to_csv(out_folder_path + "/dev.tsv", sep="\t", index=False)

            region_out_folder_path = f"{output_path}/{row[0]}/Patient_wise/region_wise/{key}"
            if not os.path.exists(region_out_folder_path):
                os.makedirs(region_out_folder_path)
            df_kmer_region.to_csv(region_out_folder_path + "/dev.tsv", sep="\t", index=False)
        
        with open(f"{output_path}/{row[0]}/variantwise_raw_vcf_data.pkl", "wb") as file:
            pickle.dump(dnabert_raw_data, file)
        with open(f"{output_path}/{row[0]}/regionwise_raw_vcf_data.pkl", "wb") as file:
            pickle.dump(dnabert_raw_data_regionwise, file)

    except FileNotFoundError:
        print(f"File not found for {row[0]}, skipping...")
        missing_files.append(row[0])

# Optionally, save the list of missing files/tags to a text file
missing_files_path = f"{output_path}/missing_files.txt"
with open(missing_files_path, "w") as f:
    for tag in missing_files:
        f.write(f"{tag}\n")

print(f"Missing files recorded at {missing_files_path}")

/data/private/pdutta_new/GDC_cancer_wise/Brain/Generated_files/Intersected_files/100bp_TFBS/TFBS_test/CBFA2T3/intersected_vcf_data.pkl
****All the VCF files are loaded for CBFA2T3***
/data/private/pdutta_new/GDC_cancer_wise/Brain/Generated_files/Intersected_files/100bp_TFBS/TFBS_test/POLR2A/intersected_vcf_data.pkl
****All the VCF files are loaded for POLR2A***
Missing files recorded at /data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Somatic/TFBS_300bp_test/missing_files.txt


In [20]:
len(missing_files)

0

In [21]:
grouped_df

Unnamed: 0,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq,mutated_sequence
0,chr1,19190,19291,[19189],[19191],[GC],[G],CCCCTGTAGCTCCCCTACCTCCAAGAGCCCAGCCCTTGCCCACAGG...,[CCCCTGTAGCTCCCCTACCTCCAAGAGCCCAGCCCTTGCCCACAG...,CCCCTGTAGCTCCCCTACCTCCAAGAGCCCAGCCCTTGCCCACAGG...
1,chr1,38986,39087,[39023],[39025],[TC],[T],AGTGCAAAATGAAAGAAGACTGTCAGAGACCCCAAACTCTGCTGTC...,[AGTGCAAAATGAAAGAAGACTGTCAGAGACCCCAAACTTGCTGTC...,AGTGCAAAATGAAAGAAGACTGTCAGAGACCCCAAACTTGCTGTCA...
2,chr1,109484,109585,[109574],[109579],[CGTGT],[C],TACGTAAGAGTGCTTGAGGGCTAATTTTATGAAAGCTTTGGGAAGT...,[TACGTAAGAGTGCTTGAGGGCTAATTTTATGAAAGCTTTGGGAAG...,TACGTAAGAGTGCTTGAGGGCTAATTTTATGAAAGCTTTGGGAAGT...
3,chr1,122513,122614,[122585],[122588],[CAT],[C],TCATTCTTTTGTTTATATAGTCAATATCTCTATCTCAATTGGATCT...,[TCATTCTTTTGTTTATATAGTCAATATCTCTATCTCAATTGGATC...,TCATTCTTTTGTTTATATAGTCAATATCTCTATCTCAATTGGATCT...
4,chr1,189375,189476,[189391],[189394],[ACC],[A],ACGGCTCACTGAGCAAACCCCGAGTCCCGACCACCGCCTCAGTGTG...,[ACGGCTCACTGAGCAAACCGAGTCCCGACCACCGCCTCAGTGTGG...,ACGGCTCACTGAGCAAACCGAGTCCCGACCACCGCCTCAGTGTGGT...
...,...,...,...,...,...,...,...,...,...,...
59387,chrY,56856031,56856132,"[56856102, 56856116]","[56856105, 56856117]","[CAA, A]","[C, AG]",CGTCTCTCACATCTGAACTGCATGCTGAGTGGGCAAGTTGGTTGTA...,[CGTCTCTCACATCTGAACTGCATGCTGAGTGGGCAAGTTGGTTGT...,CGTCTCTCACATCTGAACTGCATGCTGAGTGGGCAAGTTGGTTGTA...
59388,chrY,56862596,56862697,"[56862598, 56862620, 56862620]","[56862599, 56862621, 56862624]","[C, C, CTTT]","[CT, CT, C]",GACTTTAAAGCATAGATGAAAAATCTTTTTTTTTTTTTTAGACAAC...,[GACTTTAAAGCATAGATGAAAAATCTTTTTTTTTTTTTTTAGACA...,GACTTTTAAAGCATAGATGAAAAATCTTTTTTTTTTTTTTAGACAA...
59389,chrY,56865020,56865121,[56865118],[56865119],[C],[CA],CCCAGTGGAAATTCCAATCAGTTTTGTGAAGGCTGCTAAGTCAACT...,[CCCAGTGGAAATTCCAATCAGTTTTGTGAAGGCTGCTAAGTCAAC...,CCCAGTGGAAATTCCAATCAGTTTTGTGAAGGCTGCTAAGTCAACT...
59390,chrY,56880069,56880170,[56880149],[56880152],[CAA],[C],TTTAGCAGACACGGGGTTTTACCGTGTTAGCCAGGATGGTCTCGAT...,[TTTAGCAGACACGGGGTTTTACCGTGTTAGCCAGGATGGTCTCGA...,TTTAGCAGACACGGGGTTTTACCGTGTTAGCCAGGATGGTCTCGAT...


In [33]:
grouped_df.iloc[4]

chr                                                                     chr1
Chip_Seq_start                                                     123086175
Chip_Seq_end                                                       123086276
varinat_start                              [123086218, 123086246, 123086260]
variant_end                                [123086219, 123086247, 123086261]
ref_neucleotide                                                    [T, A, T]
alternative_neucleotide                                            [A, G, C]
reference_seq              ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...
alt_seq                    [ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTG...
mutated_sequence           ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGAGG...
Name: 4, dtype: object

In [34]:
grouped_df.iloc[4]['alt_seq']

['ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGGAGATTTCAGCCGCTTTGAGGTCAATGGTAGAAAAGGAAATAACTTCGTAGAAAAA',
 'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAACAACTTCGTAGAAAAA',
 'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGAGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAATAACTTCGTAGAAAAA']

In [35]:
grouped_df.iloc[4]['mutated_sequence']

'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGAGGAGATTTCAGCCGCTTTGAGGTCAATGGTAGAAAAGGAAACAACTTCGTAGAAAAA'

In [36]:
grouped_df.iloc[4]['reference_seq']

'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAATAACTTCGTAGAAAAA'

In [20]:
new_df

Unnamed: 0,index,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,0,chr1,54707858,54707958,54707863,54707864,C,T,TGCATCCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...,TGCATTCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...
1,1,chr5,415490,415590,415508,415509,G,T,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...,GGCCGAGTCTCCCTGGTCTGGTGGGAGGCCTAGGGGCCGAGTCTGC...
2,2,chr5,415490,415590,415510,415511,G,C,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...,GGCCGAGTCTCCCTGGTCGGCTGGGAGGCCTAGGGGCCGAGTCTGC...
3,3,chr5,415490,415590,415564,415565,A,G,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...
4,4,chr16,89068861,89068961,89068902,89068903,A,G,TGGTGATGATGATGGTGGTGATGATGATGGTGGTGATGGTGATGAT...,TGGTGATGATGATGGTGGTGATGATGATGGTGGTGATGGTGGTGAT...
...,...,...,...,...,...,...,...,...,...,...
105,105,chr19,41789861,41789962,41789887,41789888,G,A,AGAAAGAAAGAAAGAAAGAAAGAAAAGAAAGAAAGAAAGAAAGAAA...,AGAAAGAAAGAAAGAAAGAAAGAAAAAAAAGAAAGAAAGAAAGAAA...
106,106,chr13,57431125,57431226,57431173,57431174,C,T,CTGAAGGAACATACCGTAAGATAAGAAGAATCATATATGTCAGACC...,CTGAAGGAACATACCGTAAGATAAGAAGAATCATATATGTCAGACC...
107,107,chr20,29508463,29508564,29508498,29508499,G,A,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTGCCTTGGGCTC...,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTACCTTGGGCTC...
108,108,chr20,29508463,29508564,29508500,29508501,C,G,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTGCCTTGGGCTC...,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTGCGTTGGGCTC...


In [25]:
new_df[new_df['alternative_neucleotide'].apply(len) == 0]

Unnamed: 0,index,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq


In [27]:
new_df[
    (new_df['chr'] == 'chr1') & 
    (new_df['Chip_Seq_start'] == 123086175) & 
    (new_df['Chip_Seq_end'] == 123086276)
]

Unnamed: 0,index,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
62,62,chr1,123086175,123086276,123086218,123086219,T,A,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGAGG...
63,63,chr1,123086175,123086276,123086246,123086247,A,G,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...
64,64,chr1,123086175,123086276,123086260,123086261,T,C,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...
