In [2]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
# Set the directory pybedtools uses for its temporary files.
# NOTE(review): pybedtools is not used by any cell visible in this notebook — confirm this is still needed.
pybedtools.helpers.set_tempdir('/home/pdutta/temp')

In [3]:
# Directory that receives the generated TSV files (created on demand if absent).
output_path = "/home/pdutta/Data/EnhancerPoster/ZBTB33/DNABERT_data/"

# Tab-separated input table of enhancer/variant records read by pandas below.
data_path = '/home/pdutta/Data/EnhancerPoster/ZBTB33/Intersected_files/all_Enhancer_data_Consensus.tsv'

In [4]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory already exists and avoids the race between
# a separate existence check and the creation call.
os.makedirs(output_path, exist_ok=True)

In [5]:
def seq2kmer(seq, k=6):
    """
    Convert a sequence into its overlapping k-mers.

    Arguments:
    seq -- str, original sequence.
    k -- int, length of each k-mer (default 6, the value that was previously
         hard-coded in the body).

    Returns:
    kmers -- str, the len(seq) - k + 1 overlapping k-mers separated by single
             spaces; an empty string when seq is shorter than k.

    Raises:
    ValueError -- if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # A window starting at i covers seq[i:i+k]; the last valid start is len(seq) - k.
    return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))

In [7]:
# Parse the reference FASTA and keep every record in a dict so that later
# cells can look sequences up by record id with genome[<id>].
genome_fasta = "/home/pdutta/Data/Human_Genome_Data/GRCh38.primary_assembly.genome.fa"
genome_records = SeqIO.parse(genome_fasta, "fasta")
genome = SeqIO.to_dict(genome_records)

In [8]:
# Read the tab-separated variant table; the shape is printed after each step
# so the effect of every filter is visible in the cell output.
df = pd.read_csv(data_path, sep='\t')
print(df.shape)

# Keep only rows reporting fewer than three variations.
few_variations = df['No_of_variations'] < 3
df = df.loc[few_variations].reset_index(drop=True)
print(df.shape)

# Discard rows whose alternative-allele field is the placeholder '.'.
has_alt = df['Alternative_Nucleotides'].ne('.')
df = df.loc[has_alt].reset_index(drop=True)
print(df.shape)


# One output record is produced per (variant row, alternative allele) pair.
data = []
# Chromosome names present in df but absent from the genome dict; their rows
# are skipped and reported once after the loop.
missing_chroms = set()

for idx, row in df.iterrows():
    chrom = row['chr_name']
    if chrom not in genome:
        # The previous `except KeyError` handler appended to lists that were
        # never defined (ref_sequences / alt_sequences), so a missing
        # chromosome raised NameError. Skip such rows explicitly instead; a
        # missing DataFrame column now raises KeyError rather than being masked.
        missing_chroms.add(chrom)
        continue

    chrom_seq = genome[chrom].seq
    enh_start = row['Enhancer_start']
    enh_end = row['Enhancer_end']
    ref_allele = row['Reference_Nucleotide']

    # The field is a comma-separated list with a trailing comma, e.g. "G,T,".
    alts = row['Alternative_Nucleotides'].rstrip(',').split(',')
    ref_seq = str(chrom_seq[enh_start:enh_end])

    # Variant position relative to the fetched enhancer sequence.
    variant_pos_start = row['Variant_start'] - enh_start
    variant_pos_end = row['Variant_end'] - enh_start

    if variant_pos_start < 0:
        # The variant begins before the enhancer window. A negative offset
        # would be read by slicing as "count from the end" and splice the
        # allele into the wrong place, so skip the row. Previously only the
        # non-deletion branch had this guard.
        continue

    for alt in alts:
        # When alt is shorter than the reference allele (including a full
        # deletion, alt == ""), pad with the bases immediately after the
        # enhancer so the alternate sequence keeps the reference length.
        # NOTE(review): for insertions pad_len is negative, the slice is empty
        # and alt_seq ends up longer than ref_seq — confirm this is intended.
        pad_len = len(ref_allele) - len(alt)
        extra_bases = str(chrom_seq[enh_end:enh_end + pad_len])
        alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        data.append({
            'chr': chrom,
            'Enhancer_coordinates': str(enh_start)+'-'+str(enh_end),
            'dbsnp_id': row['RS_ID'],
            'varinat_start':  row['Variant_start'],
            'variant_end':  row['Variant_end'],
            'ref_neucleotide': ref_allele,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })

if missing_chroms:
    print("Skipped rows on chromosomes missing from the genome:", sorted(missing_chroms))

        
        
# Assemble the per-allele records and drop exact duplicate rows.
new_df = pd.DataFrame(data)
print(new_df.shape)
# NOTE(review): reset_index() without drop=True keeps the pre-deduplication
# index as an 'index' column in all_data_new.tsv. It is left as-is so the
# saved schema does not change — confirm whether that column is wanted.
new_df = new_df.drop_duplicates().reset_index()
print(new_df.shape)

# Interleave the sequences so that rows 2*i and 2*i + 1 of dev.tsv hold the
# upper-cased reference and alternate sequence of new_df row i.
merged_list = [
    seq.upper()
    for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in pair
]
kmer_lst = list(map(seq2kmer, merged_list))
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])

# Labels are drawn at random (presumably placeholders required by the
# downstream file format — TODO confirm). A seeded generator makes dev.tsv
# reproducible across kernel restarts; previously the draw was unseeded.
label_rng = np.random.default_rng(42)
df_kmer['Label'] = label_rng.choice([0, 1], size=len(df_kmer))

new_df.to_csv(os.path.join(output_path, "all_data_new.tsv"), sep="\t", index=False)
df_kmer.to_csv(os.path.join(output_path, "dev.tsv"), sep="\t", index=False)

(439792, 9)
(430596, 9)
(430596, 9)
(497648, 9)
(497648, 10)


In [9]:
# Display the filtered input table (notebook shows a truncated preview).
df

Unnamed: 0,chr_name,Enhancer_start,Enhancer_end,Variant_start,Variant_end,RS_ID,Reference_Nucleotide,No_of_variations,Alternative_Nucleotides
0,chr19,561375,561574,561375,561376,rs1345678776,C,1,"T,"
1,chr19,561375,561574,561377,561378,rs570139519,C,2,"G,T,"
2,chr19,561375,561574,561378,561379,rs1022722361,G,1,"A,"
3,chr19,561375,561574,561380,561381,rs1980794075,C,1,"T,"
4,chr19,561375,561574,561382,561383,rs1980794126,G,2,"A,C,"
...,...,...,...,...,...,...,...,...,...
430591,chr18,2944504,2944703,2944683,2944687,rs2077422449,TTTT,1,"TTTTT,"
430592,chr18,2944504,2944703,2944688,2944689,rs2077422509,C,1,"T,"
430593,chr18,2944504,2944703,2944689,2944690,rs1271925579,A,1,"T,"
430594,chr18,2944504,2944703,2944695,2944696,rs1334793346,G,1,"A,"


In [10]:
# Display the per-allele table of reference and alternate sequences.
new_df

Unnamed: 0,index,chr,Enhancer_coordinates,dbsnp_id,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,0,chr19,561375-561574,rs1345678776,561375,561376,C,T,CACGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...,TACGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...
1,1,chr19,561375-561574,rs570139519,561377,561378,C,G,CACGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...,CAGGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...
2,2,chr19,561375-561574,rs570139519,561377,561378,C,T,CACGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...,CATGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...
3,3,chr19,561375-561574,rs1022722361,561378,561379,G,A,CACGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...,CACACCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...
4,4,chr19,561375-561574,rs1980794075,561380,561381,C,T,CACGCCTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...,CACGCTTGTCATCCCAGCACTTCGGGAGGCCAAGGCGGGTGGATCA...
...,...,...,...,...,...,...,...,...,...,...
497643,497643,chr18,2944504-2944703,rs2077422449,2944683,2944687,TTTT,TTTTT,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...
497644,497644,chr18,2944504-2944703,rs2077422509,2944688,2944689,C,T,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...
497645,497645,chr18,2944504-2944703,rs1271925579,2944689,2944690,A,T,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...
497646,497646,chr18,2944504-2944703,rs1334793346,2944695,2944696,G,A,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...,CTACGCCCAGCTAATTTTTGTATTTCTAGTAGAGACGGGGTTTCAC...


In [11]:
# Display the k-mer sequences and their labels as written to dev.tsv.
df_kmer

Unnamed: 0,Sequence,Label
0,CACGCC ACGCCT CGCCTG GCCTGT CCTGTC CTGTCA TGTC...,1
1,TACGCC ACGCCT CGCCTG GCCTGT CCTGTC CTGTCA TGTC...,0
2,CACGCC ACGCCT CGCCTG GCCTGT CCTGTC CTGTCA TGTC...,0
3,CAGGCC AGGCCT GGCCTG GCCTGT CCTGTC CTGTCA TGTC...,1
4,CACGCC ACGCCT CGCCTG GCCTGT CCTGTC CTGTCA TGTC...,1
...,...,...
995291,CTACGC TACGCC ACGCCC CGCCCA GCCCAG CCCAGC CCAG...,0
995292,CTACGC TACGCC ACGCCC CGCCCA GCCCAG CCCAGC CCAG...,0
995293,CTACGC TACGCC ACGCCC CGCCCA GCCCAG CCCAGC CCAG...,1
995294,CTACGC TACGCC ACGCCC CGCCCA GCCCAG CCCAGC CCAG...,0
