In [1]:
import os, glob
import pandas as pd
import pysam
import numpy as np
import pickle

In [2]:
# --- Run configuration -------------------------------------------------------
# Cohort and non-coding region to process; both are substituted into the
# paths below.
cancer_type = "Brain"
non_coding_region = "acceptor"

# Pickle with the intersected variant data for this cohort/region.
intersected_data = (
    "/home/pdutta/Data/Cancer_wiseGDC/Data/{}/Generated_files/"
    "Intersected_Data/intersected_vcf_{}_data.pkl"
).format(cancer_type, non_coding_region)

# Reference genome FASTA.
reference_path = "/home/pdutta/Data/Human_Genome_Data/GRCh38.primary_assembly.genome.fa"

# Output directory for the generated DNABERT files (keeps its trailing "/").
output_path = (
    "/home/pdutta/Data/Cancer_wiseGDC/Data/{}/Generated_files/"
    "DNABERT_Data/{}/"
).format(cancer_type, non_coding_region)

In [3]:
def seq2kmer(seq, k=6):
    """
    Convert original sequence to overlapping kmers (window slides by one base).

    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified (default 6, the value that used to be
         hard-coded inside the function).

    Returns:
    kmers -- str, kmers separated by space; an empty string when seq is
             shorter than k.

    Raises:
    ValueError -- if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))
    # len(seq) - k + 1 windows fit in the sequence; range() is empty when the
    # sequence is shorter than k, which yields "".
    kmers = " ".join(seq[pos:pos + k] for pos in range(len(seq) - k + 1))
    return kmers

In [4]:
def get_sequences(df, reference_fasta, seed=None):
    """
    Build reference/alternative sequence pairs for every variant row and
    tokenise them into k-mers.

    Arguments:
    df -- pandas.DataFrame; the columns read are 'chr_name', 'start', 'end',
          'strand', 'transcript_id', 'START_POS', 'END_POS', 'REF' and 'ALT'.
    reference_fasta -- object with fetch(chrom, start, end) returning the
          sequence of that interval (the notebook passes a pysam.FastaFile).
    seed -- optional int. When given, the 0/1 labels are drawn from a private
          RandomState seeded with it, so the output is reproducible. When None
          (default) the global NumPy random state is used, as before.

    Returns:
    new_df -- DataFrame with one row per distinct record: an 'index' column
          (added by reset_index), the metadata columns, 'reference_seq' and
          'alt_seq'.
    df_kmer -- DataFrame with 'Sequence' (output of seq2kmer on the
          upper-cased sequence) and 'Label' (random 0/1); rows alternate
          reference, alternative for each row of new_df.

    An empty df yields two empty frames with the usual columns instead of
    raising KeyError.
    """
    # Column order matches the key order of the record dicts below; naming the
    # columns explicitly keeps them present when there are no records.
    # The misspelled names are kept because they are part of the output schema.
    columns = [
        'chr', 'strand', 'Transcript_ID', 'Acceptor_start', 'Acceptor_end',
        'varinat_start', 'variant_end', 'ref_neucleotide',
        'alternative_neucleotide', 'reference_seq', 'alt_seq',
    ]

    records = []
    for _, row in df.iterrows():
        chrom = row['chr_name']
        ref_start = row['start']
        ref_end = row['end']
        variant_start = row['START_POS']
        variant_end = row['END_POS']
        ref_nucleotide = row['REF']
        alt = row["ALT"]

        # Position of the variant relative to the start of the fetched window.
        # NOTE(review): if a variant starts before the window these offsets are
        # negative and the slices below count from the end of ref_seq; confirm
        # the input never contains such rows.
        offset_start = variant_start - ref_start
        offset_end = variant_end - ref_start

        ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)

        # Positive when ALT is longer than REF, negative when it is shorter.
        length_change = len(alt) - len(ref_nucleotide)

        if length_change > 0:
            # ALT adds bases: drop the same number of bases from the tail so
            # alt_seq keeps the window length.
            alt_seq = (ref_seq[:offset_start] + alt
                       + ref_seq[offset_end:len(ref_seq) - length_change])
        elif length_change < 0:
            # ALT removes bases: pad with the bases that follow the window so
            # alt_seq keeps the window length.
            padding = reference_fasta.fetch(chrom, ref_end, ref_end - length_change)
            alt_seq = ref_seq[:offset_start] + alt + ref_seq[offset_end:] + padding
        else:
            # Same-length substitution.
            alt_seq = ref_seq[:offset_start] + alt + ref_seq[offset_end:]

        records.append({
            'chr': chrom,
            'strand': row['strand'],
            'Transcript_ID': row['transcript_id'],
            'Acceptor_start': row['start'],
            'Acceptor_end': row['end'],
            'varinat_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_nucleotide,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })

    new_df = pd.DataFrame(records, columns=columns)
    # reset_index() (without drop=True) keeps the pre-deduplication row number
    # in an 'index' column; left as is so the saved tables keep their layout.
    new_df = new_df.drop_duplicates().reset_index()
    print(new_df.shape)
    print("&*")

    # Interleave reference and alternative sequences: ref0, alt0, ref1, alt1, ...
    interleaved = [
        seq.upper()
        for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
        for seq in pair
    ]
    df_kmer = pd.DataFrame([seq2kmer(seq) for seq in interleaved], columns=['Sequence'])

    # Labels are random 0/1 values (presumably placeholders for a label column
    # the downstream tool expects -- confirm).
    if seed is None:
        labels = np.random.choice([0, 1], size=len(df_kmer))
    else:
        labels = np.random.RandomState(seed).choice([0, 1], size=len(df_kmer))
    df_kmer['Label'] = labels
    print(df_kmer.shape)
    return new_df, df_kmer

In [5]:
## Load the VCF files from pickle
## The object stored at `intersected_data` is unpickled into `loaded_dictionary`;
## a later cell iterates it with .items() and uses .shape on each value.
## NOTE(review): pickle.load can execute arbitrary code -- only open files you trust.
with open(intersected_data, "rb") as file:
    loaded_dictionary = pickle.load(file)
print("****All the VCF files are loaded***")   

****All the VCF files are loaded***


In [6]:
## Load the reference file
## Opens the FASTA at `reference_path` with pysam; get_sequences later reads
## intervals from this handle through reference_fasta.fetch(chrom, start, end).
## NOTE(review): presumably requires a FASTA index next to the file -- confirm.
reference_fasta = pysam.FastaFile(reference_path)
print("#####The reference file is loaded######")

#####The reference file is loaded######


In [7]:
# List all sequence names
# Prints every sequence name in the opened FASTA, e.g. to check that they use
# the same naming as the 'chr_name' values passed to fetch() in get_sequences.
sequence_names = reference_fasta.references
print("Sequence names in the FASTA file:", sequence_names)

Sequence names in the FASTA file: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'GL000008.2', 'GL000009.2', 'GL000194.1', 'GL000195.1', 'GL000205.2', 'GL000208.1', 'GL000213.1', 'GL000214.1', 'GL000216.2', 'GL000218.1', 'GL000219.1', 'GL000220.1', 'GL000221.1', 'GL000224.1', 'GL000225.1', 'GL000226.1', 'KI270302.1', 'KI270303.1', 'KI270304.1', 'KI270305.1', 'KI270310.1', 'KI270311.1', 'KI270312.1', 'KI270315.1', 'KI270316.1', 'KI270317.1', 'KI270320.1', 'KI270322.1', 'KI270329.1', 'KI270330.1', 'KI270333.1', 'KI270334.1', 'KI270335.1', 'KI270336.1', 'KI270337.1', 'KI270338.1', 'KI270340.1', 'KI270362.1', 'KI270363.1', 'KI270364.1', 'KI270366.1', 'KI270371.1', 'KI270372.1', 'KI270373.1', 'KI270374.1', 'KI270375.1', 'KI270376.1', 'KI270378.1', 'KI270379.1', 'KI270381.1', 'KI270382.1', 'KI270383.1', 'KI270384.1', 'KI270385.

In [8]:
# For every entry of loaded_dictionary: build the sequence tables, keep the raw
# table in dnabert_raw_data (pickled in a later cell) and write the k-mer table
# to <output_path>/Patient_wise/<patient id>/dev.tsv.
dnabert_raw_data = {}
for key, value in loaded_dictionary.items():
    print("CODE STARTED")
    print(key, value.shape)

    # Patient id = part of the key before the first "_", then before the first ".".
    patient_id = key.split("_")[0].split(".")[0]
    out_folder_path = os.path.join(output_path, "Patient_wise", patient_id)

    # Create the folder before announcing it; exist_ok avoids the separate
    # exists() check and makes re-runs safe.
    os.makedirs(out_folder_path, exist_ok=True)
    print(out_folder_path, "is created")

    new_df, df_kmer = get_sequences(value, reference_fasta)
    dnabert_raw_data[patient_id] = new_df

    df_kmer.to_csv(os.path.join(out_folder_path, "dev.tsv"), sep="\t", index=False)

CODE STARTED
54d621a5-8e45-4bbc-92c4-1c7f36212361 (5236, 24)
/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/Generated_files/DNABERT_Data/acceptor/Patient_wise/54d621a5-8e45-4bbc-92c4-1c7f36212361 is created
(5235, 12)
&*
(10470, 2)
CODE STARTED
ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1 (7679, 24)
/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/Generated_files/DNABERT_Data/acceptor/Patient_wise/ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1 is created
(7677, 12)
&*
(15354, 2)
CODE STARTED
0f158d5f-16c7-4fb7-b48c-0ff392f1706b (5290, 24)
/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/Generated_files/DNABERT_Data/acceptor/Patient_wise/0f158d5f-16c7-4fb7-b48c-0ff392f1706b is created
(5290, 12)
&*
(10580, 2)
CODE STARTED
da5fbf8f-862d-49a6-8119-86f508063e29 (5240, 24)
/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/Generated_files/DNABERT_Data/acceptor/Patient_wise/da5fbf8f-862d-49a6-8119-86f508063e29 is created
(5239, 12)
&*
(10478, 2)
CODE STARTED
24abc004-6712-4df9-83f2-bd98dcc6ea5e (5228, 24)
/home/pdutta/Data/Cance

In [9]:
# Save the per-patient raw sequence tables collected in dnabert_raw_data.
# output_path already ends with "/", so the file name is joined rather than
# prefixed with another "/" (which produced ".../acceptor//raw_...").
# The directory is created first in case no patient folder was written above.
os.makedirs(output_path, exist_ok=True)
with open(os.path.join(output_path, "raw_{}_vcf_data.pkl".format(non_coding_region)), "wb") as file:
    pickle.dump(dnabert_raw_data, file)