In [1]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
from Bio.Seq import Seq
# Point pybedtools at the project temp directory for its temporary files.
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [2]:
# Name of the region type processed by this notebook. It is substituted into the
# input/output paths and into the '<region>_start' / '<region>_end' column names
# read from the input table.
non_coding_region = "acceptor"

In [8]:
# Input table for the selected region (read with pandas further down).
data_path = (
    '/data/projects/DNABERT_snv/Manuscript_11_2023/DBSNP_output/Intersected_data/'
    '{region}/all_{region}_intesected_data.tsv'
).format(region=non_coding_region)

# Directory that receives the TSV files produced by this notebook.
output_path = (
    "/data/projects/DNABERT_snv/Manuscript_11_2023/DBSNP_output/DNABERT_data/{region}"
).format(region=non_coding_region)

In [9]:
# Create the output directory (including missing parents). exist_ok=True replaces
# the separate existence check, which could race with another process creating the
# directory, and keeps the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

In [10]:
def seq2kmer(seq, k=6):
    """
    Convert a sequence into its overlapping k-mers (stride 1).

    Arguments:
    seq -- str, original sequence.
    k -- int, k-mer length; defaults to 6, the value previously hard-coded here.

    Returns:
    kmers -- str, the k-mers joined by single spaces. Empty string when the
             sequence is shorter than k.

    Raises:
    ValueError -- if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))
    # A sequence of length n yields n - k + 1 windows; range() is empty when n < k.
    return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))

In [11]:
# Parse the genome FASTA into a dict of sequence records held in memory; the
# dict is indexed later with the table's 'chr_name' values to slice out region sequences.
genome = SeqIO.to_dict(SeqIO.parse("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa", "fasta"))

In [21]:
# Load the tab-separated input table for the selected region and report its
# (rows, columns) shape.
df = pd.read_csv(data_path, sep= '\t')
print(df.shape)

(37854437, 20)


In [None]:
# df = pd.read_csv(data_path, sep= '\t')
# print(df.shape)

# df = df[df['No_of_variations']<3].reset_index(drop=True)
# print(df.shape)
# df = df.loc[df['Alternative_Allele']== '.'].reset_index(drop=True)
# df
#df= df[df['strand']=="-"]


# Build one record per (variant, alternative allele) holding the reference region
# sequence and the same region with the alternative allele substituted in, then
# write the per-variant table and a k-mer tokenised file.

# Region-dependent column names, formatted once instead of on every row.
start_col = '{}_start'.format(non_coding_region)
end_col = '{}_end'.format(non_coding_region)
coord_col = '{}_coordinates'.format(non_coding_region)

# Seed for the placeholder labels so that dev.tsv is reproducible across runs.
LABEL_SEED = 0

data = []
missing_chromosomes = 0  # rows whose chromosome is not a key of `genome`
out_of_region = 0        # rows whose variant starts before the region start

# NOTE(review): sequences are taken as stored in `genome`; reverse-complementing
# rows with strand '-' was commented out in the original cell -- confirm intended.
for idx, row in df.iterrows():
    chrom = row['chr_name']
    if chrom not in genome:
        # Previously a blanket `except KeyError` printed "NA" per row; count instead
        # and report once after the loop.
        missing_chromosomes += 1
        continue
    chrom_seq = genome[chrom].seq

    region_start = row[start_col]
    region_end = row[end_col]
    ref_allele = row['Reference_Nucleotide']
    # Comma-separated alternatives; an empty entry denotes a deletion.
    alts = row['Alternative_Nucleotides'].rstrip(',').split(',')
    ref_seq = str(chrom_seq[region_start:region_end])

    # Variant position relative to the fetched region sequence.
    variant_pos_start = row['Variant_start'] - region_start
    variant_pos_end = row['Variant_end'] - region_start

    # A negative offset would make ref_seq[:variant_pos_start] slice from the END
    # of the string and produce a corrupt alt sequence. The original only guarded
    # the non-deletion branch; skip such rows for every allele type.
    if variant_pos_start < 0:
        out_of_region += 1
        continue

    for alt in alts:
        # Pad with downstream genome bases to make up for bases removed by the
        # variant (all of the reference allele for a pure deletion). Clamp at 0 so
        # that insertions never produce a negative slice stop that could wrap around.
        # NOTE(review): for insertions alt_seq is therefore longer than ref_seq, and
        # variants extending past the region end are not specially handled -- confirm.
        n_extra = max(len(ref_allele) - len(alt), 0)
        extra_bases = str(chrom_seq[region_end:region_end + n_extra])
        alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        data.append({
            'chr': chrom,
            'strand': row['strand'],
            coord_col: str(region_start) + '-' + str(region_end),
            'Ensemble_Transcript_ID': row['ENSEMBL_Transcript_ID'],
            'RS_ID': row['RS_ID'],
            'variant_start': row['Variant_start'],
            'variant_end': row['Variant_end'],
            'ref_neucleotide': ref_allele,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })

if missing_chromosomes:
    print("Skipped {} rows: chromosome not found in genome".format(missing_chromosomes))
if out_of_region:
    print("Skipped {} rows: variant starts before the region start".format(out_of_region))

new_df = pd.DataFrame(data)
print(new_df.shape)
new_df = new_df.drop_duplicates().reset_index(drop=True)
print(new_df.shape)

# Interleave the sequences as ref_0, alt_0, ref_1, alt_1, ... in upper case.
merged_list = [
    seq.upper()
    for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in pair
]
kmer_lst = list(map(seq2kmer, merged_list))
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])

# Labels are random 0/1 placeholders; draw them from a seeded generator so the
# written file is identical on every run.
rng = np.random.default_rng(LABEL_SEED)
df_kmer['Label'] = rng.choice([0, 1], size=len(df_kmer))

new_df.to_csv(output_path + "/all_data_new.tsv", sep="\t", index=False)
df_kmer.to_csv(output_path + "/dev.tsv", sep="\t", index=False)

In [None]:
# Display the de-duplicated per-variant table that was written to all_data_new.tsv.
new_df

In [None]:
# Display the k-mer / label table that was written to dev.tsv.
df_kmer