In [1]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
from Bio.Seq import Seq
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [2]:
# Name of the non-coding region class processed by this notebook. It is
# substituted into the input/output paths and into the region column names
# ('<region>_start' / '<region>_end') read from the input table.
non_coding_region = "core_prom"

In [3]:
# Input: table of variants intersected with the selected region class.
data_path = (
    '/data/projects/DNABERT_snv/Manuscript_11_2023/DBSNP_output/Intersected_data/'
    '{region}/all_{region}_intesected_data.tsv'
).format(region=non_coding_region)

# Output: directory that receives the tables written by this notebook.
output_path = (
    "/data/projects/DNABERT_snv/Manuscript_11_2023/DBSNP_output/DNABERT_data/{region}"
).format(region=non_coding_region)

In [4]:
# Create the output directory (including missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the race between checking and creating.
os.makedirs(output_path, exist_ok=True)

In [5]:
def seq2kmer(seq, k=6):
    """
    Convert a sequence into its overlapping k-mers (sliding window, step 1).

    Arguments:
    seq -- str, original sequence.
    k -- int, k-mer length (default 6, the value this notebook always used).

    Returns:
    kmers -- str, the len(seq) - k + 1 k-mers separated by single spaces;
             an empty string when seq is shorter than k.

    Raises:
    ValueError -- if k is not a positive integer.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # range() is empty when len(seq) < k, so short inputs yield "".
    return " ".join(seq[start:start + k] for start in range(len(seq) - k + 1))

In [6]:
# Parse the GRCh38 primary-assembly FASTA and keep every record in a dict so
# later cells can look a chromosome up by name and slice its `.seq`.
# NOTE(review): this holds the whole assembly in memory for the kernel's lifetime.
genome = SeqIO.to_dict(SeqIO.parse("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa", "fasta"))

In [7]:
# Load the tab-separated variant/region table and report its (rows, columns) shape.
df = pd.read_csv(data_path, sep= '\t')
print(df.shape)

(7801436, 20)


In [8]:
# Display the loaded table (pandas truncates the rendering of large frames).
df

Unnamed: 0,chr_name,core_prom_start,core_prom_end,ENSEMBL_Transcript_ID,ENSEMBL_Gene ID,strand,TSS,Transcript_length,Transcript_type,Gene_CG_content,GENE_symbol,Category,Tissue,chromosome,Variant_start,Variant_end,RS_ID,Reference_Nucleotide,No_of_variations,Alternative_Nucleotides
0,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,ESRRA,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr11,64305478,64305482,rs1565373574,CCCC,2,"CCC,CCCCC,"
1,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,ESRRA,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr11,64305479,64305480,rs2035011417,C,1,"A,"
2,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,ESRRA,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr11,64305481,64305482,rs1279195484,C,1,"A,"
3,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,ESRRA,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr11,64305482,64305483,rs1414580223,G,1,"A,"
4,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,ESRRA,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr11,64305483,64305484,rs2035011529,C,2,"A,T,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7801431,chr2,85888732,85888821,ENST00000640992,ENSG00000115525,-,85888776,2292,protein_coding,45.07,ST3GAL5,NullTrans,.,chr2,85888809,85888810,rs1430924124,G,1,"A,"
7801432,chr2,85888732,85888821,ENST00000640992,ENSG00000115525,-,85888776,2292,protein_coding,45.07,ST3GAL5,NullTrans,.,chr2,85888812,85888813,rs1573747314,G,1,"A,"
7801433,chr2,85888732,85888821,ENST00000640992,ENSG00000115525,-,85888776,2292,protein_coding,45.07,ST3GAL5,NullTrans,.,chr2,85888813,85888814,rs1350089366,C,2,"A,T,"
7801434,chr2,85888732,85888821,ENST00000640992,ENSG00000115525,-,85888776,2292,protein_coding,45.07,ST3GAL5,NullTrans,.,chr2,85888816,85888817,rs1438222811,C,2,"A,T,"


In [9]:
# Build one (reference sequence, alternate sequence) pair per variant allele,
# then write the per-allele table and the 6-mer tokenised sequences.
#
# NOTE: sequences are always read from the loaded genome as-is; the `strand`
# column is recorded in the output but is not used to reverse-complement.

# Seed for the random 0/1 labels so that dev.tsv is identical across re-runs.
# NOTE(review): the labels are drawn at random, so they are presumably
# placeholders required by the downstream file format - confirm.
LABEL_SEED = 0

# Region column names depend only on `non_coding_region`; compute them once
# instead of re-formatting them for every row.
region_start_col = '{}_start'.format(non_coding_region)
region_end_col = '{}_end'.format(non_coding_region)
region_coord_key = '{}_coordinates'.format(non_coding_region)

needed_cols = [
    'chr_name', region_start_col, region_end_col, 'strand',
    'ENSEMBL_Transcript_ID', 'RS_ID', 'Variant_start', 'Variant_end',
    'Reference_Nucleotide', 'Alternative_Nucleotides',
]


def build_alt_seq(chrom_seq, ref_seq, region_end, rel_start, variant_end, ref_allele, alt):
    """
    Return the region sequence with the reference allele replaced by `alt`.

    Arguments:
    chrom_seq -- sliceable chromosome sequence, indexed with the same
                 coordinates as the region/variant columns.
    ref_seq -- str, reference sequence of the region.
    region_end -- int, region end coordinate (used as an exclusive slice bound).
    rel_start -- int, variant start relative to the region start (must be >= 0).
    variant_end -- int, variant end coordinate (used as a slice start).
    ref_allele -- str, reference allele.
    alt -- str, alternative allele ("" for a complete deletion).

    Returns:
    str -- region bases upstream of the variant + alt + bases downstream of
           the variant. When alt is shorter than ref_allele the downstream
           part is extended past the region end by the difference, keeping the
           reference length; when alt is longer the result is longer.
    """
    pad = max(len(ref_allele) - len(alt), 0)
    # Read the downstream part starting at the variant end rather than padding
    # from the region end: for a variant that runs past the region end this
    # avoids re-inserting bases that belong to the replaced reference allele.
    downstream = str(chrom_seq[variant_end:region_end + pad])
    return ref_seq[:rel_start] + alt + downstream


data = []
missing_chrom_rows = 0      # rows whose chromosome is absent from `genome`
upstream_overhang_rows = 0  # rows whose variant starts before the region

# itertuples over just the needed columns avoids building a Series per row.
for (chrom, region_start, region_end, strand, transcript_id, rs_id,
     variant_start, variant_end, ref_allele, alt_field) in df[needed_cols].itertuples(index=False, name=None):

    chrom_record = genome.get(chrom)
    if chrom_record is None:
        missing_chrom_rows += 1
        continue

    # Variant position relative to the fetched region sequence.
    rel_start = variant_start - region_start
    if rel_start < 0:
        # A negative offset would slice from the wrong end of ref_seq, so the
        # row is skipped for every allele (including complete deletions).
        upstream_overhang_rows += 1
        continue

    chrom_seq = chrom_record.seq
    ref_seq = str(chrom_seq[region_start:region_end])

    # The field is comma-terminated (e.g. "A,T,"); drop the trailing comma.
    for alt in alt_field.rstrip(',').split(','):
        data.append({
            'chr': chrom,
            'strand': strand,
            region_coord_key: str(region_start) + '-' + str(region_end),
            'Ensemble_Transcript_ID': transcript_id,
            'RS_ID': rs_id,
            'variant_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_allele,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': build_alt_seq(chrom_seq, ref_seq, region_end, rel_start,
                                     variant_end, ref_allele, alt),
        })

if missing_chrom_rows:
    print("Skipped {} rows: chromosome not found in genome".format(missing_chrom_rows))
if upstream_overhang_rows:
    print("Skipped {} rows: variant starts before the region".format(upstream_overhang_rows))

new_df = pd.DataFrame(data)
print(new_df.shape)
new_df = new_df.drop_duplicates().reset_index(drop=True)
print(new_df.shape)

# Interleave the sequences as ref, alt, ref, alt, ... (upper-cased) so that
# rows 2*i and 2*i + 1 of the k-mer table belong to row i of new_df.
merged_list = [
    seq.upper()
    for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in pair
]
kmer_lst = list(map(seq2kmer, merged_list))
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
df_kmer['Label'] = np.random.default_rng(LABEL_SEED).integers(0, 2, size=len(df_kmer))

new_df.to_csv(os.path.join(output_path, "all_data_new.tsv"), sep="\t", index=False)
df_kmer.to_csv(os.path.join(output_path, "dev.tsv"), sep="\t", index=False)

(9697819, 11)
(9697819, 11)


In [10]:
# Display the per-allele table of reference and alternate sequences.
new_df

Unnamed: 0,chr,strand,core_prom_coordinates,Ensemble_Transcript_ID,RS_ID,variant_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,chr11,+,64305479-64305568,ENST00000000442,rs2035011417,64305479,64305480,C,A,CCCGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...,ACCGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...
1,chr11,+,64305479-64305568,ENST00000000442,rs1279195484,64305481,64305482,C,A,CCCGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...,CCAGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...
2,chr11,+,64305479-64305568,ENST00000000442,rs1414580223,64305482,64305483,G,A,CCCGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...,CCCACCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...
3,chr11,+,64305479-64305568,ENST00000000442,rs2035011529,64305483,64305484,C,A,CCCGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...,CCCGACCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...
4,chr11,+,64305479-64305568,ENST00000000442,rs2035011529,64305483,64305484,C,T,CCCGCCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...,CCCGTCCCCTGCTTTGCATGCGCACGGCCGGCCCCACCCCCGCTGT...
...,...,...,...,...,...,...,...,...,...,...,...
9697814,chr2,-,85888732-85888821,ENST00000640992,rs1350089366,85888813,85888814,C,A,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...
9697815,chr2,-,85888732-85888821,ENST00000640992,rs1350089366,85888813,85888814,C,T,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...
9697816,chr2,-,85888732-85888821,ENST00000640992,rs1438222811,85888816,85888817,C,A,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...
9697817,chr2,-,85888732-85888821,ENST00000640992,rs1438222811,85888816,85888817,C,T,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...,GCGCCCAGCCGGCCCGGGAAGAGACAAGTCGCCGCCGCAGCCCCCA...


In [11]:
# Display the k-mer tokenised sequences together with their labels.
df_kmer

Unnamed: 0,Sequence,Label
0,CCCGCC CCGCCC CGCCCC GCCCCC CCCCCT CCCCTG CCCT...,0
1,ACCGCC CCGCCC CGCCCC GCCCCC CCCCCT CCCCTG CCCT...,1
2,CCCGCC CCGCCC CGCCCC GCCCCC CCCCCT CCCCTG CCCT...,1
3,CCAGCC CAGCCC AGCCCC GCCCCC CCCCCT CCCCTG CCCT...,0
4,CCCGCC CCGCCC CGCCCC GCCCCC CCCCCT CCCCTG CCCT...,1
...,...,...
19395633,GCGCCC CGCCCA GCCCAG CCCAGC CCAGCC CAGCCG AGCC...,0
19395634,GCGCCC CGCCCA GCCCAG CCCAGC CCAGCC CAGCCG AGCC...,1
19395635,GCGCCC CGCCCA GCCCAG CCCAGC CCAGCC CAGCCG AGCC...,1
19395636,GCGCCC CGCCCA GCCCAG CCCAGC CCAGCC CAGCCG AGCC...,0
