In [1]:
import os, glob
import pandas as pd
import pysam
import numpy as np
import pickle

In [2]:
cancer_type = "Brain"
non_coding_region="acceptor"
intersected_data = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/Intersected_Data/Germline/non_coding_regions/{}/intersected_vcf_{}_data.pkl".format(cancer_type, non_coding_region, non_coding_region)
reference_path  = "/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"

In [3]:
output_path = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/DNABERT_Data/Germline/{}/".format(cancer_type, non_coding_region)
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [4]:
output_path

'/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Germline/acceptor/'

In [5]:
def seq2kmer(seq):
    """
    Convert original sequence to kmers
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    k=6
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [6]:
def get_sequences(df, reference_fasta):
    data = []
    for idx, row in df.iterrows():
        #print(row)
        chrom = row['chr_name']
        ref_start = row['start']
        ref_end = row['end']
        variant_start = row['START_POS']
        variant_end = row['END_POS']
        ref_nucleotide = row['REF']
        alt = row["ALT"]
        
        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start
        #print(ref_nucleotide , alt, variant_pos_start, variant_pos_end)
        
        
        # Get reference sequence
        #print(chrom)
        ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)
        #print(ref_seq)
        
        # Handle insertion and deletion to get the correct alt sequence
        # Identify if the variant is an insertion or deletion
        if len(ref_nucleotide) < len(alt):  # Insertion
            delete_size =  len(alt) - len(ref_nucleotide)
            #print(variant_pos_start, variant_pos_end ,delete_size)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:len(ref_seq) - delete_size]

        elif len(ref_nucleotide) > len(alt):  # Deletion
            insert_size = len(ref_nucleotide) - len(alt)
            #print(insert_size)
            extra_bases = reference_fasta.fetch(chrom, ref_end, ref_end + insert_size)
            #print(extra_bases)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        else:  # SNV
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:]
        # print(ref_seq)
        # print(alt_seq)
        # input()


        data.append({
            'chr': chrom,
            'strand': row['strand'],
            'Transcript_ID': row['transcript_id'],
            'Acceptor_start': row['start'],
            'Acceptor_end': row['end'],
            'varinat_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_nucleotide,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })
            
    
    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(data)
    # print(new_df.shape)
    # print(new_df)
    new_df = new_df.drop_duplicates().reset_index()
    print(new_df.shape)
    print("&*")
    data = []
    merged_list = list(zip(new_df['reference_seq'], new_df['alt_seq']))
    merged_list = [item.upper() for tup in merged_list for item in tup]
    #print(merged_list)
    kmer_lst = list(map(seq2kmer, merged_list))
    df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
    df_kmer['Label'] = np.random.choice([0, 1], size=len(df_kmer))
    print(df_kmer.shape)
    return new_df, df_kmer

In [7]:
print(pd.__version__)

1.5.3


In [8]:
## Load the VCF files from pickle
with open(intersected_data, "rb") as file:
    loaded_dictionary = pickle.load(file)
print("****All the VCF files are loaded***")   

****All the VCF files are loaded***


In [9]:
len(loaded_dictionary)

148

In [10]:
## Load the reference file
reference_fasta = pysam.FastaFile(reference_path)
print("#####The reference file is loaded######")

#####The reference file is loaded######


In [11]:
# List all sequence names
sequence_names = reference_fasta.references
print("Sequence names in the FASTA file:", sequence_names)

Sequence names in the FASTA file: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1', 'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1', 'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1', 'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1', 'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1', 'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1', 'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1', 'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1', 'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1', 'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1', 'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1', 'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1', 'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1', 'KI270384.1', 'KI270385.

In [12]:
dnabert_raw_data = {}
for key, value in loaded_dictionary.items():
    #display(value[['chr_name', 'start', 'end', 'strand', 'transcript_type', 'CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT']])
    print("CODE STARTED")
    print(key, value.shape)
    #ins_del_df = value[(value['ALT'].str.len() > value['REF'].str.len()) | (value['REF'].str.len() > value['ALT'].str.len())]
    new_df, df_kmer = get_sequences(value, reference_fasta)
    dnabert_raw_data[key] = new_df
    
    #input()
    out_folder_path = output_path+"Patient_wise/"+ key
    print(out_folder_path, "is created")
    if not os.path.exists(out_folder_path):
        os.makedirs(out_folder_path)
    df_kmer.to_csv(out_folder_path + "/dev.tsv", sep="\t", index= False)

CODE STARTED
TCGA-02-2483-01A-01D-1494-08 (24381, 12)
(24381, 12)
&*
(48762, 2)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Germline/acceptor/Patient_wise/TCGA-02-2483-01A-01D-1494-08 is created
CODE STARTED
TCGA-CS-5396-01A-02D-1465-02 (16366, 12)
(16366, 12)
&*
(32732, 2)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Germline/acceptor/Patient_wise/TCGA-CS-5396-01A-02D-1465-02 is created
CODE STARTED
TCGA-EZ-7264-01A-11D-2024-08 (23449, 12)
(23449, 12)
&*
(46898, 2)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Germline/acceptor/Patient_wise/TCGA-EZ-7264-01A-11D-2024-08 is created
CODE STARTED
TCGA-16-1460-01A-01D-0932-09 (24317, 12)
(24317, 12)
&*
(48634, 2)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Germline/acceptor/Patient_wise/TCGA-16-1460-01A-01D-0932-09 is created
CODE STARTED
TCGA-CS-5395-01A-01D-1468-08 (29392, 12)
(29392, 12)
&*
(58784, 2)
/data/projects/G

In [13]:
with open(output_path+"/raw_{}_vcf_data.pkl".format(non_coding_region), "wb") as file:
    pickle.dump(dnabert_raw_data, file)

In [14]:
len(dnabert_raw_data)

148

In [15]:
dnabert_raw_data['TCGA-02-2483-01A-01D-1494-08']

Unnamed: 0,index,chr,strand,Transcript_ID,Acceptor_start,Acceptor_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,0,chr1,+,ENST00000450305.2,13412,13491,13416,13421,CGAGA,CGAGAGAGA,ACCCCGAGATCACATTTCTCACTGCCTTTTGTCTGCCCAGTTTCAC...,ACCCCGAGAGAGATCACATTTCTCACTGCCTTTTGTCTGCCCAGTT...
1,1,chr1,-,ENST00000623083.4,187248,187327,187301,187302,A,G,GGGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTGAAAC...,GGGGCGGTGGGGGTGGTGTTAGTACCCCATCTTGTAGGTCTGAAAC...
2,2,chr1,-,ENST00000623083.4,188227,188306,188251,188252,G,T,CTTGGTGCTCACGCACACAGGAAAGTCCTTCAGCTTCTCCTGGGAG...,CTTGGTGCTCACGCACACAGGAAATTCCTTCAGCTTCTCCTGGGAG...
3,3,chr1,+,ENST00000669836.1,268081,268160,268129,268130,G,T,CTACAGCAGATTCACTCTGTTCTGTTTCATTGTTGTTTAGTTTGCG...,CTACAGCAGATTCACTCTGTTCTGTTTCATTGTTGTTTAGTTTGCG...
4,4,chr1,-,"ENST00000641063.1, ENST00000641303.1, ENST0000...",514384,514463,514445,514461,TAAAAAAAAAAAAAAA,TAAAAAAAAAAAAAA,TCTCTTCTTGGATTCAGAAGTCTTTCATGGTAGGTCCAGCTAGAAG...,TCTCTTCTTGGATTCAGAAGTCTTTCATGGTAGGTCCAGCTAGAAG...
...,...,...,...,...,...,...,...,...,...,...,...,...
24376,24376,chrY,+,ENST00000614446.1,11417356,11417435,11417356,11417357,A,C,ACTTCAGACACAGCTACTTATGTTTTTAATTCCCTCACAGGATGAA...,CCTTCAGACACAGCTACTTATGTTTTTAATTCCCTCACAGGATGAA...
24377,24377,chrY,+,ENST00000614446.1,11417356,11417435,11417431,11417432,G,C,ACTTCAGACACAGCTACTTATGTTTTTAATTCCCTCACAGGATGAA...,ACTTCAGACACAGCTACTTATGTTTTTAATTCCCTCACAGGATGAA...
24378,24378,chrY,-,ENST00000455273.1,18453224,18453303,18453245,18453246,G,T,TATTGCCAAAGACTGTGCTTTGACAGACATTAAGTCAACCTGAAAC...,TATTGCCAAAGACTGTGCTTTTACAGACATTAAGTCAACCTGAAAC...
24379,24379,chrY,+,ENST00000451061.5,18674176,18674255,18674222,18674223,A,G,CTCTGCCCCCCAGGTAACCTGTAGCCATTTTGTCTTCTAGTCCAAC...,CTCTGCCCCCCAGGTAACCTGTAGCCATTTTGTCTTCTAGTCCAAC...


In [16]:
dnabert_raw_data['TCGA-02-2483-01A-01D-1494-08'][['chr','Acceptor_start','Acceptor_end']]

Unnamed: 0,chr,Acceptor_start,Acceptor_end
0,chr1,13412,13491
1,chr1,187248,187327
2,chr1,188227,188306
3,chr1,268081,268160
4,chr1,514384,514463
...,...,...,...
24376,chrY,11417356,11417435
24377,chrY,11417356,11417435
24378,chrY,18453224,18453303
24379,chrY,18674176,18674255


In [19]:
for key, value in dnabert_raw_data.items():
    print(dnabert_raw_data[key].shape, dnabert_raw_data[key][['chr','Acceptor_start','Acceptor_end']].shape)
    if (dnabert_raw_data[key].shape[0]!=dnabert_raw_data[key][['chr','Acceptor_start','Acceptor_end']].shape[0]):
        print(key , " has instances where there are multiple variants")

(24381, 12) (24381, 3)
(16366, 12) (16366, 3)
(23449, 12) (23449, 3)
(24317, 12) (24317, 3)
(29392, 12) (29392, 3)
(24582, 12) (24582, 3)
(22812, 12) (22812, 3)
(23475, 12) (23475, 3)
(14762, 12) (14762, 3)
(23825, 12) (23825, 3)
(25110, 12) (25110, 3)
(23906, 12) (23906, 3)
(17734, 12) (17734, 3)
(24823, 12) (24823, 3)
(24513, 12) (24513, 3)
(24059, 12) (24059, 3)
(24300, 12) (24300, 3)
(24855, 12) (24855, 3)
(22598, 12) (22598, 3)
(22588, 12) (22588, 3)
(23543, 12) (23543, 3)
(18796, 12) (18796, 3)
(18123, 12) (18123, 3)
(24776, 12) (24776, 3)
(24891, 12) (24891, 3)
(24483, 12) (24483, 3)
(24210, 12) (24210, 3)
(18016, 12) (18016, 3)
(24005, 12) (24005, 3)
(25066, 12) (25066, 3)
(25184, 12) (25184, 3)
(24920, 12) (24920, 3)
(23761, 12) (23761, 3)
(22967, 12) (22967, 3)
(15500, 12) (15500, 3)
(14495, 12) (14495, 3)
(14539, 12) (14539, 3)
(18058, 12) (18058, 3)
(16768, 12) (16768, 3)
(16113, 12) (16113, 3)
(24035, 12) (24035, 3)
(24120, 12) (24120, 3)
(17521, 12) (17521, 3)
(18135, 12)