In [30]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
# Direct pybedtools to this directory for its temporary files.
# NOTE(review): hard-coded absolute path; presumably the directory must
# already exist on the machine running the notebook — confirm.
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [2]:
# Region type handled by this notebook; it is substituted into the input and
# output directory paths built in the next cell.
# NOTE: the processing cell reads the 'Acceptor_start' / 'Acceptor_end' columns
# by literal name, so changing this value alone does not retarget those lookups.
non_coding_region = "Acceptor"

In [3]:
# Input table of intersected variants for the selected region type.
data_path = (
    '/data/projects/Enhancer/RECOMB_2024/Intersected_data/{}/all_data.tsv'
    .format(non_coding_region)
)
# Directory that receives the files written by this notebook.
output_path = (
    "/data/projects/Enhancer/RECOMB_2024/SNP_data/{}"
    .format(non_coding_region)
)

In [4]:
# Create the output directory (and any missing parents). exist_ok=True removes
# the check-then-create race and keeps the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

In [28]:
def seq2kmer(seq, k=6):
    """
    Convert a sequence into its overlapping k-mers.

    Arguments:
    seq -- str, original sequence.
    k -- int, length of each k-mer (default 6, the value that used to be
         hard-coded inside the function).

    Returns:
    kmers -- str, the len(seq) - k + 1 overlapping k-mers separated by single
             spaces; an empty string when seq is shorter than k.

    Raises:
    ValueError -- if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # range() with a non-positive bound is empty, so a sequence shorter than k
    # yields no k-mers and the join returns "".
    return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))

In [13]:
# Parse the GRCh38 primary-assembly FASTA and keep every record in a dict so
# later cells can slice genome[<chr_name>].seq directly.
# NOTE(review): this holds the whole FASTA in memory — expect a large footprint
# and a slow load.
genome = SeqIO.to_dict(SeqIO.parse("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa", "fasta"))

In [34]:
# ---- Load and filter the intersected variant table ------------------------
df = pd.read_csv(data_path, sep='\t')
print(df.shape)
# Keep only rows that report fewer than three variations.
df = df[df['No_of_variations'] < 3].reset_index(drop=True)
print(df.shape)
# '.' is presumably the placeholder for "no alternative allele"; such rows
# have nothing to substitute, so drop them.
df = df.loc[df['Alternative_Nucleotides'] != '.'].reset_index(drop=True)
print(df.shape)

# ---- Build one reference/alternate sequence pair per alternative allele ----
data = []
skipped_missing_chrom = 0  # rows whose chromosome is not a key of `genome`

for idx, row in df.iterrows():
    chrom = row['chr_name']
    # Explicit membership test instead of a blanket `except KeyError`: the old
    # handler appended to `ref_sequences` / `alt_sequences`, which are never
    # defined, so a missing chromosome surfaced as a NameError.
    if chrom not in genome:
        skipped_missing_chrom += 1
        continue
    chrom_seq = genome[chrom].seq

    acceptor_start = row['Acceptor_start']
    acceptor_end = row['Acceptor_end']
    ref_nt = row['Reference_Nucleotide']
    ref_seq = str(chrom_seq[acceptor_start:acceptor_end])

    # Variant position relative to the fetched window.
    variant_pos_start = row['Variant_start'] - acceptor_start
    variant_pos_end = row['Variant_end'] - acceptor_start

    # A variant that starts upstream of the window gives a negative offset.
    # Used as a slice bound it would count from the END of ref_seq and build a
    # wrong sequence, so such rows are skipped for every allele type
    # (previously pure deletions were not guarded).
    if variant_pos_start < 0:
        continue

    # The allele list is comma-terminated (e.g. "A,T,"); an empty entry after
    # stripping the trailing comma denotes a pure deletion.
    alts = row['Alternative_Nucleotides'].rstrip(',').split(',')
    for alt in alts:
        # When the alternate allele is shorter than the reference, pull the
        # missing number of bases from just past the window so the alternate
        # sequence keeps the window length.
        # NOTE(review): for insertions nothing is trimmed, so alt_seq ends up
        # longer than ref_seq — confirm this is acceptable downstream.
        pad_len = max(len(ref_nt) - len(alt), 0)
        extra_bases = str(chrom_seq[acceptor_end:acceptor_end + pad_len])
        alt_seq = (
            ref_seq[:variant_pos_start]
            + alt
            + ref_seq[variant_pos_end:]
            + extra_bases
        )
        # Column names (including their spelling) define the schema of the
        # written file and are kept exactly as they were.
        data.append({
            'chr': chrom,
            'strand': row['Strand'],
            'Acceptor_coordinates': str(acceptor_start) + '-' + str(acceptor_end),
            'Ensemble_Transcript_ID': row['Transcript_id'],
            'dbsnp_id': row['RS_ID'],
            'varinat_start': row['Variant_start'],
            'variant_end': row['Variant_end'],
            'ref_neucleotide': ref_nt,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })

if skipped_missing_chrom:
    print("Skipped {} rows whose chromosome is missing from the genome".format(skipped_missing_chrom))

# ---- Assemble, de-duplicate and export -------------------------------------
new_df = pd.DataFrame(data)
print(new_df.shape)
# reset_index() without drop=True keeps the previous index as an 'index'
# column; retained so the schema of all_data_new.tsv does not change.
new_df = new_df.drop_duplicates().reset_index()
print(new_df.shape)

# Interleave the sequences as ref, alt, ref, alt, ... in upper case.
merged_list = [
    seq.upper()
    for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in pair
]
kmer_lst = list(map(seq2kmer, merged_list))
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
# Labels are random 0/1 values (presumably placeholders); a fixed seed makes
# dev.tsv reproducible across runs.
label_rng = np.random.default_rng(42)
df_kmer['Label'] = label_rng.choice([0, 1], size=len(df_kmer))

new_df.to_csv(os.path.join(output_path, "all_data_new.tsv"), sep="\t", index=False)
df_kmer.to_csv(os.path.join(output_path, "dev.tsv"), sep="\t", index=False)

(38758, 13)
(38184, 13)
(38184, 13)
(43563, 11)
(42854, 12)


In [36]:
# Show the variant table `df` (rich display; pandas elides the middle rows).
df

Unnamed: 0,chr_name,Acceptor_start,Acceptor_end,Transcript_id,Strand,Enhancer_start,Enhancer_end,Variant_start,Variant_end,RS_ID,Reference_Nucleotide,No_of_variations,Alternative_Nucleotides
0,chr2,108897190,108897269,ENST00000258443.7,-,108896998,108897197,108897187,108897193,rs1574362124,TCTTCT,1,"TCT,"
1,chr2,108897190,108897269,ENST00000258443.7,-,108896998,108897197,108897191,108897192,rs566530415,C,1,"A,"
2,chr2,108897190,108897269,ENST00000258443.7,-,108896998,108897197,108897193,108897194,rs121908456,C,2,"A,T,"
3,chr2,108897190,108897269,ENST00000258443.7,-,108896998,108897197,108897194,108897195,rs777416175,G,1,"A,"
4,chr2,108897190,108897269,ENST00000258443.7,-,108896998,108897197,108897206,108897207,rs770460028,T,1,"C,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38179,chr5,179708198,179708277,ENST00000681903.1,+,179708271,179708470,179708265,179708266,rs774523334,A,1,"C,"
38180,chr5,179708198,179708277,ENST00000681903.1,+,179708271,179708470,179708268,179708269,rs1581856354,C,1,"T,"
38181,chr5,179708198,179708277,ENST00000681903.1,+,179708271,179708470,179708269,179708270,rs759471723,A,1,"G,"
38182,chr5,179708198,179708277,ENST00000681903.1,+,179708271,179708470,179708270,179708271,rs1175755473,A,1,"C,"
