In [2]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
from Bio.Seq import Seq
# Point pybedtools at the project temp directory for its intermediate files.
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [3]:
# Region type processed by this notebook; it is interpolated into the input/output
# paths and into the '<region>_start' / '<region>_end' column names used below.
non_coding_region = "donor"

In [4]:
# Input: intersected variant table for the selected region type.
data_path = (
    '/data/projects/DNABERT_snv/Manuscript_11_2023/DBSNP_output/Intersected_data/'
    '{region}/all_{region}_intesected_data.tsv'
).format(region=non_coding_region)

# Output: directory that receives the files written by this notebook.
output_path = (
    "/data/projects/DNABERT_snv/Manuscript_11_2023/DBSNP_output/DNABERT_data/{region}"
).format(region=non_coding_region)

In [5]:
# Create the output directory (and any missing parents). `exist_ok=True` avoids the
# check-then-create race of `os.path.exists` + `os.makedirs` and makes the cell
# safe to re-run.
os.makedirs(output_path, exist_ok=True)

In [6]:
def seq2kmer(seq, k=6):
    """
    Convert a sequence into its overlapping k-mers.

    Arguments:
    seq -- str, original sequence.
    k -- int, k-mer length (default 6, the value previously hard-coded here).

    Returns:
    kmers -- str, all overlapping k-mers (stride 1) separated by a single space;
             an empty string when `seq` is shorter than `k`.

    Raises:
    ValueError -- if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # range() is empty when len(seq) < k, so short inputs yield "".
    kmer = [seq[x:x + k] for x in range(len(seq) + 1 - k)]
    kmers = " ".join(kmer)
    return kmers

In [7]:
# Parse the reference FASTA and keep every record in memory as a dict so that
# per-row sequence slicing below is a plain dictionary lookup.
genome_fasta_path = "/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"
genome = SeqIO.to_dict(SeqIO.parse(genome_fasta_path, "fasta"))

In [8]:
# Load the tab-separated intersected variant table and report its (rows, columns).
df = pd.read_table(data_path)
print(df.shape)

(37045469, 20)


In [9]:
# Build one (reference sequence, alternate sequence) pair per variant allele,
# then write the per-variant table and the k-mer tokenised file.
#
# For every row of `df` the region [<region>_start, <region>_end) is sliced from
# the genome, and each comma-separated alternative allele is substituted for the
# reference allele at its position inside that window.
#
# NOTE(review): sequences are kept in forward-strand orientation even for rows
# whose strand is '-' (a reverse-complement step was previously commented out
# here) — confirm this is intended.

# Column names depend only on the region type, so build them once.
start_col = '{}_start'.format(non_coding_region)
end_col = '{}_end'.format(non_coding_region)
coord_col = '{}_coordinates'.format(non_coding_region)

# Seed for the random 0/1 labels written to dev.tsv, so re-runs give the same file.
LABEL_SEED = 42

data = []
skipped_unknown_chrom = 0      # rows whose chromosome is absent from `genome`
skipped_upstream_overhang = 0  # rows whose variant starts before the region

for idx, row in df.iterrows():
    chrom = row['chr_name']
    # Only a missing chromosome is treated as skippable; a missing column now
    # raises immediately instead of being reported as "NA" for every row.
    if chrom not in genome:
        skipped_unknown_chrom += 1
        continue
    chrom_seq = genome[chrom].seq

    region_start = row[start_col]
    region_end = row[end_col]
    variant_start = row['Variant_start']
    variant_end = row['Variant_end']
    ref_allele = row['Reference_Nucleotide']
    alts = row['Alternative_Nucleotides'].rstrip(',').split(',')

    ref_seq = str(chrom_seq[region_start:region_end])

    # Variant position relative to the fetched window.
    variant_pos_start = variant_start - region_start
    variant_pos_end = variant_end - region_start

    # A negative start would be interpreted by Python slicing as an offset from
    # the END of the window and silently produce a wrong sequence. Skip such
    # rows for every allele type (previously only non-deletions were skipped).
    if variant_pos_start < 0:
        skipped_upstream_overhang += 1
        continue

    # Number of reference-allele bases lying beyond the region end. Padding
    # must start after them; otherwise bases that the variant replaces or
    # deletes would be appended again.
    overhang = max(0, variant_end - region_end)
    extra_start = region_end + overhang

    prefix = ref_seq[:variant_pos_start]
    suffix = ref_seq[variant_pos_end:]

    for alt in alts:
        # Pad with downstream genome bases so deletions / shortening variants
        # keep the window length. An empty `alt` is a full deletion. Alleles
        # longer than the reference get no padding and are not trimmed.
        n_extra = max(0, len(ref_allele) - len(alt) - overhang)
        extra_bases = str(chrom_seq[extra_start:extra_start + n_extra])
        alt_seq = prefix + alt + suffix + extra_bases

        data.append({
            'chr': chrom,
            'strand': row['strand'],
            coord_col: str(region_start) + '-' + str(region_end),
            'Ensemble_Transcript_ID': row['ENSEMBL_Transcript_ID'],
            'RS_ID': row['RS_ID'],
            'variant_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_allele,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })

# Report skipped rows once, instead of printing a line per row.
if skipped_unknown_chrom:
    print("Skipped {} rows: chromosome not found in genome".format(skipped_unknown_chrom))
if skipped_upstream_overhang:
    print("Skipped {} rows: variant starts before the {} region".format(
        skipped_upstream_overhang, non_coding_region))

new_df = pd.DataFrame(data)
print(new_df.shape)
new_df = new_df.drop_duplicates().reset_index(drop=True)
print(new_df.shape)

# Interleave the sequences as ref, alt, ref, alt, ... (upper-cased) so that each
# variant contributes two consecutive rows to the k-mer file.
merged_list = [
    seq.upper()
    for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in pair
]
kmer_lst = list(map(seq2kmer, merged_list))
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])

# Labels are random 0/1 values (presumably placeholders required by the
# downstream file format — confirm); seeded so the output is reproducible.
label_rng = np.random.default_rng(LABEL_SEED)
df_kmer['Label'] = label_rng.integers(0, 2, size=len(df_kmer))

new_df.to_csv(os.path.join(output_path, "all_data_new.tsv"), sep="\t", index=False)
df_kmer.to_csv(os.path.join(output_path, "dev.tsv"), sep="\t", index=False)

(43537490, 11)
(43504528, 11)


In [10]:
# Display the de-duplicated per-variant table built in the previous cell.
new_df

Unnamed: 0,chr,strand,donor_coordinates,Ensemble_Transcript_ID,RS_ID,variant_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,chr11,+,64315231-64315310,ENST00000000442,rs1484286606,64315233,64315245,TATGTTCTACTA,TA,AGTATGTTCTACTAAAGGCCTTGGCCCTTGCCAATTCAGGTGAGTC...,AGTAAAGGCCTTGGCCCTTGCCAATTCAGGTGAGTCTGGGGCAGGC...
1,chr11,+,64315231-64315310,ENST00000000442,rs750270682,64315250,64315251,C,T,AGTATGTTCTACTAAAGGCCTTGGCCCTTGCCAATTCAGGTGAGTC...,AGTATGTTCTACTAAAGGCTTTGGCCCTTGCCAATTCAGGTGAGTC...
2,chr11,+,64315231-64315310,ENST00000000442,rs2035223711,64315259,64315260,T,C,AGTATGTTCTACTAAAGGCCTTGGCCCTTGCCAATTCAGGTGAGTC...,AGTATGTTCTACTAAAGGCCTTGGCCCTCGCCAATTCAGGTGAGTC...
3,chr11,+,64315231-64315310,ENST00000000442,rs2035223790,64315260,64315261,G,A,AGTATGTTCTACTAAAGGCCTTGGCCCTTGCCAATTCAGGTGAGTC...,AGTATGTTCTACTAAAGGCCTTGGCCCTTACCAATTCAGGTGAGTC...
4,chr11,+,64315231-64315310,ENST00000000442,rs1361791105,64315268,64315269,A,G,AGTATGTTCTACTAAAGGCCTTGGCCCTTGCCAATTCAGGTGAGTC...,AGTATGTTCTACTAAAGGCCTTGGCCCTTGCCAATTCGGGTGAGTC...
...,...,...,...,...,...,...,...,...,...,...,...
43504523,chr2,-,85839156-85839235,ENST00000640992,rs1487482349,85839224,85839225,A,G,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...
43504524,chr2,-,85839156-85839235,ENST00000640992,rs3202465,85839225,85839226,G,A,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...
43504525,chr2,-,85839156-85839235,ENST00000640992,rs115591304,85839226,85839227,A,G,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...
43504526,chr2,-,85839156-85839235,ENST00000640992,rs1267804946,85839227,85839228,A,G,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...,AAGTTTTTATTCTTTTTCATCTTTTTAAACTGGCACACTGCCTGGT...


In [11]:
# Display the k-mer table; rows alternate reference / alternate sequence per variant.
df_kmer

Unnamed: 0,Sequence,Label
0,AGTATG GTATGT TATGTT ATGTTC TGTTCT GTTCTA TTCT...,1
1,AGTAAA GTAAAG TAAAGG AAAGGC AAGGCC AGGCCT GGCC...,1
2,AGTATG GTATGT TATGTT ATGTTC TGTTCT GTTCTA TTCT...,1
3,AGTATG GTATGT TATGTT ATGTTC TGTTCT GTTCTA TTCT...,1
4,AGTATG GTATGT TATGTT ATGTTC TGTTCT GTTCTA TTCT...,0
...,...,...
87009051,AAGTTT AGTTTT GTTTTT TTTTTA TTTTAT TTTATT TTAT...,1
87009052,AAGTTT AGTTTT GTTTTT TTTTTA TTTTAT TTTATT TTAT...,0
87009053,AAGTTT AGTTTT GTTTTT TTTTTA TTTTAT TTTATT TTAT...,1
87009054,AAGTTT AGTTTT GTTTTT TTTTTA TTTTAT TTTATT TTAT...,1
