In [1]:
import os, glob
import pandas as pd
import pysam
import numpy as np
import pickle

In [2]:
cancer_type = "Brain"
non_coding_region="acceptor"
intersected_data = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/Intersected_Data/intersected_vcf_{}_data.pkl".format(cancer_type, non_coding_region)
reference_path="/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"

In [3]:
output_path = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/DNABERT_Data/Somatic/{}/".format(cancer_type, non_coding_region)
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [4]:
def seq2kmer(seq):
    """
    Convert original sequence to kmers
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    k=6
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [5]:
# Function to aggregate data for unique acceptor regions
def aggregate_variants(group):
    group_data = {
        'varinat_start': group['varinat_start'].tolist(),
        'variant_end': group['variant_end'].tolist(),
        'ref_neucleotide': group['ref_neucleotide'].tolist(),
        'alternative_neucleotide': group['alternative_neucleotide'].tolist(),
        'alt_seq': modify_sequence(group)
    }
    return pd.Series(group_data, index=['varinat_start', 'variant_end', 'ref_neucleotide', 'alternative_neucleotide', 'alt_seq'])

In [6]:
# Function to modify the sequence based on variants
def modify_sequence(group):
    reference_sequence = list(group['reference_seq'].iloc[0])  # Assuming same reference for each group
    #print(reference_sequence)
    adjustments = 0  # To account for the shifts due to insertions/deletions
    for _, variant in group.sort_values('varinat_start').iterrows():
        # Calculate the relative positions in the reference
        start_index = variant['varinat_start'] - group['Acceptor_start'].iloc[0] + adjustments
        end_index = variant['variant_end'] - group['Acceptor_start'].iloc[0] + adjustments
        # Calculate adjustment
        ref_len = end_index - start_index
        alt_len = len(variant['alternative_neucleotide'])
        # Perform the insertion, deletion, or substitution
        reference_sequence[start_index:end_index] = list(variant['alternative_neucleotide'])
        #print(reference_sequence)
        # Adjust subsequent indices
        adjustments += (alt_len - ref_len)
    
    return ''.join(reference_sequence)

In [21]:
def get_sequences(df, reference_fasta):
    data = []
    for idx, row in df.iterrows():
        #print(row)
        chrom = row['chr_name']
        ref_start = row['start']
        ref_end = row['end']
        variant_start = row['START_POS']
        variant_end = row['END_POS']
        ref_nucleotide = row['REF']
        alt = row["ALT"]
        
        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start
        #print(ref_nucleotide , alt, variant_pos_start, variant_pos_end)
        
        
        # Get reference sequence
        #print(chrom)
        ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)
        #print(ref_seq)
        
        # Handle insertion and deletion to get the correct alt sequence
        # Identify if the variant is an insertion or deletion
        if len(ref_nucleotide) < len(alt):  # Insertion
            delete_size =  len(alt) - len(ref_nucleotide)
            #print(variant_pos_start, variant_pos_end ,delete_size)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:len(ref_seq) - delete_size]

        elif len(ref_nucleotide) > len(alt):  # Deletion
            insert_size = len(ref_nucleotide) - len(alt)
            #print(insert_size)
            extra_bases = reference_fasta.fetch(chrom, ref_end, ref_end + insert_size)
            #print(extra_bases)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        else:  # SNV
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:]
        # print(ref_seq)
        # print(alt_seq)
        # input()


        data.append({
            'chr': chrom,
            'strand': row['strand'],
            'Transcript_ID': row['transcript_id'],
            'Acceptor_start': row['start'],
            'Acceptor_end': row['end'],
            'varinat_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_nucleotide,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })
            
    
    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(data)
    print(new_df.shape)
    #display(new_df)
    new_df = new_df.drop_duplicates().reset_index(drop=True)
    #display(new_df)
    print(new_df.shape)
    print("&*")
    data = []
    merged_list = list(zip(new_df['reference_seq'], new_df['alt_seq']))
    merged_list = [item.upper() for tup in merged_list for item in tup]
    #print(merged_list)
    kmer_lst = list(map(seq2kmer, merged_list))
    df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
    df_kmer['Label'] = np.random.choice([0, 1], size=len(df_kmer))
    print(df_kmer.shape)
    
    grouped_df = new_df.groupby(['chr', 'strand','Transcript_ID', 'Acceptor_start', 'Acceptor_end', 'reference_seq']).apply(aggregate_variants).reset_index()
    print("Region wise number", grouped_df.shape)
    merged_list_regionwise = list(zip(grouped_df['reference_seq'], grouped_df['alt_seq']))
    merged_list_regionwise = [item.upper() for tup in merged_list_regionwise for item in tup]
    regionwise_kmer_list = list(map(seq2kmer, merged_list_regionwise))
    df_kmer_region = pd.DataFrame(regionwise_kmer_list, columns=['Sequence'])
    df_kmer_region['Label'] = np.random.choice([0, 1], size=len(df_kmer_region))
    
    return new_df, df_kmer, grouped_df, df_kmer_region

In [22]:
print(pd.__version__)

1.5.3


In [23]:
## Load the VCF files from pickle
with open(intersected_data, "rb") as file:
    loaded_dictionary = pickle.load(file)
print("****All the VCF files are loaded***")   

****All the VCF files are loaded***


In [24]:
len(loaded_dictionary)

432

In [25]:
## Load the reference file
reference_fasta = pysam.FastaFile(reference_path)
print("#####The reference file is loaded######")

#####The reference file is loaded######


In [26]:
# List all sequence names
sequence_names = reference_fasta.references
print("Sequence names in the FASTA file:", sequence_names)

Sequence names in the FASTA file: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1', 'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1', 'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1', 'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1', 'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1', 'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1', 'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1', 'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1', 'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1', 'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1', 'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1', 'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1', 'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1', 'KI270384.1', 'KI270385.

In [27]:
dnabert_raw_data = {}
dnabert_raw_data_regionwise = {}
for key, value in loaded_dictionary.items():
    #display(value[['chr_name', 'start', 'end', 'strand', 'transcript_type', 'CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT']])
    print("CODE STARTED")
    print(key, value.shape)
    #ins_del_df = value[(value['ALT'].str.len() > value['REF'].str.len()) | (value['REF'].str.len() > value['ALT'].str.len())]
    new_df, df_kmer, grouped_df, df_kmer_region= get_sequences(value, reference_fasta)
    dnabert_raw_data[key] = new_df
    dnabert_raw_data_regionwise[key] = grouped_df
    
    #input()
    out_folder_path = output_path+"Patient_wise/variant_wise/"+ key
    print(out_folder_path, "is created")
    if not os.path.exists(out_folder_path):
        os.makedirs(out_folder_path)
    df_kmer.to_csv(out_folder_path + "/dev.tsv", sep="\t", index= False)
    
    
    region_out_folder_path = f"{output_path}/Patient_wise/region_wise/{key}"
    if not os.path.exists(region_out_folder_path):
        os.makedirs(region_out_folder_path)
    df_kmer_region.to_csv(region_out_folder_path + "/dev.tsv", sep="\t", index=False)

CODE STARTED
9e35e7c1-b8e9-441b-9520-3195359f8e43_sanger_raw_pindel (4581, 24)
(4581, 11)
(4580, 11)
&*
(9160, 2)
Region wise number (4333, 11)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Somatic/acceptor/Patient_wise/variant_wise/9e35e7c1-b8e9-441b-9520-3195359f8e43_sanger_raw_pindel is created
CODE STARTED
2c5b021f-f9e1-4a42-8755-8704f006016b_sanger_raw_pindel (5199, 24)
(5199, 11)
(5197, 11)
&*
(10394, 2)
Region wise number (4677, 11)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Somatic/acceptor/Patient_wise/variant_wise/2c5b021f-f9e1-4a42-8755-8704f006016b_sanger_raw_pindel is created
CODE STARTED
1d3daf9d-7740-484e-850b-827a6b7d451d_CaVEMan (100, 27)
(100, 11)
(100, 11)
&*
(200, 2)
Region wise number (79, 11)
/data/projects/GDC_Cancer_Wise/New_data/Brain/Generated_files/DNABERT_Data/Somatic/acceptor/Patient_wise/variant_wise/1d3daf9d-7740-484e-850b-827a6b7d451d_CaVEMan is created
CODE STARTED
54d621a5-8e45-4bbc-92c4-1c7f

In [28]:
with open(output_path+"/raw_{}_vcf_data.pkl".format(non_coding_region), "wb") as file:
    pickle.dump(dnabert_raw_data, file)

In [None]:
len(dnabert_raw_data)

In [29]:
with open(output_path+"/regionwise_raw_{}_vcf_data.pkl".format(non_coding_region), "wb") as file:
    pickle.dump(dnabert_raw_data_regionwise, file)

In [75]:
grouped_df

Unnamed: 0,chr,strand,Transcript_ID,Acceptor_start,Acceptor_end,reference_seq,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,alt_seq
0,chr1,+,"ENST00000236914.7, ENST00000367362.8, ENST0000...",200173922,200174001,TATTGAAATGTTGCTTTTTTTTTTTTTTTTTTTAATGCAGATGTCA...,[200173935],[200173940],[CTTTT],[C],TATTGAAATGTTGCTTTTTTTTTTTTTTTAATGCAGATGTCAAAAA...
1,chr1,+,"ENST00000237247.10, ENST00000371037.9, ENST000...",66690149,66690228,CTGTGTATCTAACTTTTCATCTCTTTTCTTCTCCTTACAGTCCAGA...,[66690178],[66690182],[TCTC],[T],CTGTGTATCTAACTTTTCATCTCTTTTCTTCTTACAGTCCAGACCT...
2,chr1,+,"ENST00000237247.10, ENST00000371037.9, ENST000...",66729223,66729302,GATTTTCTCTCTCTTTCCTCTCTGTGTTTTGATATGCCAGATGTAT...,[66729227],[66729230],[TTC],[T],GATTTTCTCTCTTTCCTCTCTGTGTTTTGATATGCCAGATGTATCG...
3,chr1,+,"ENST00000238181.11, ENST00000323938.10, ENST00...",236544709,236544788,TGGTTAATTAAGGTTTTTTTTTTTCTTTCTTTCTCAAAAGCTTTAA...,[236544721],[236544723],[GT],[G],TGGTTAATTAAGGTTTTTTTTTTCTTTCTTTCTCAAAAGCTTTAAT...
4,chr1,+,"ENST00000239462.9, ENST00000621086.1, ENST0000...",175146890,175146969,GAGCCTCTTCTTGATGTGGCTTTTTTTTTTTTTTTGGTAGGGGGTG...,[175146909],[175146910],[C],[CT],GAGCCTCTTCTTGATGTGGCTTTTTTTTTTTTTTTTGGTAGGGGGT...
...,...,...,...,...,...,...,...,...,...,...,...
4328,chrY,+,"ENST00000425031.1, ENST00000444263.6, ENST0000...",3096954,3097033,GATTTTTAAAAGTTCACTGCATTTATTTTCTTTTCTGTAGTAGAAA...,[3096955],[3096957],[AT],[A],GATTTTAAAAGTTCACTGCATTTATTTTCTTTTCTGTAGTAGAAAT...
4329,chrY,-,ENST00000331172.7,11340755,11340834,GACTCTTGCTGTTTCTAGAGCCAGAGGACCTAACACAGCCTACATT...,[11340786],[11340789],[AAC],[A],GACTCTTGCTGTTTCTAGAGCCAGAGGACCTAACAGCCTACATTTA...
4330,chrY,-,ENST00000331172.7,11350944,11351023,CGTTCCTTAGAGAGGCTATGGCGTTATTAATCATACCACCTGTAGG...,"[11350999, 11350999]","[11351000, 11351000]","[T, T]","[TAA, TTAAA]",CGTTCCTTAGAGAGGCTATGGCGTTATTAATCATACCACCTGTAGG...
4331,chrY,-,ENST00000418016.5,9523395,9523474,CCACTTGACTGCAGCCTGGATGACACAGTGAGACTCCAACTGAAAA...,[9523447],[9523448],[A],[AACAC],CCACTTGACTGCAGCCTGGATGACACAGTGAGACTCCAACTGAAAA...


In [77]:
grouped_df.iloc[4330]['reference_seq']

'CGTTCCTTAGAGAGGCTATGGCGTTATTAATCATACCACCTGTAGGGGCACACAATAAAAAAATTCACAAAGAAATATT'

In [78]:
grouped_df.iloc[4330]['alt_seq']

'CGTTCCTTAGAGAGGCTATGGCGTTATTAATCATACCACCTGTAGGGGCACACAATATTAAAAAAAAAATTCACAAAGAAATATT'