In [4]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
# Point pybedtools at a project-level directory for its temporary files.
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [5]:
# Name of the non-coding region type to process; it is substituted into the
# input and output directory paths built in the following cell.
non_coding_region = "CoreProm"

In [6]:
# Input: intersected variant table for the selected region type.
data_path = f'/data/projects/Enhancer/RECOMB_2024/Intersected_data/{non_coding_region}/all_data.tsv'
# Output: directory that receives the generated SNP tables for that region type.
output_path = f"/data/projects/Enhancer/RECOMB_2024/SNP_data/{non_coding_region}"

In [7]:
# Create the output directory (and any missing parents). exist_ok=True makes
# the cell idempotent and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(output_path, exist_ok=True)

In [8]:
def seq2kmer(seq, k=6):
    """
    Convert original sequence to overlapping kmers

    Arguments:
    seq -- str, original sequence.
    k -- int, kmer length; defaults to 6, the value that used to be hard-coded.

    Returns:
    kmers -- str, all overlapping kmers (stride 1) separated by single spaces;
             an empty string when seq is shorter than k.

    Raises:
    ValueError -- if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # A sequence of length n has n - k + 1 windows; range() is empty when n < k.
    kmer = [seq[x:x + k] for x in range(len(seq) + 1 - k)]
    kmers = " ".join(kmer)
    return kmers

In [9]:
# Parse the reference FASTA and keep every record in an in-memory dict so that
# sequences can later be looked up by chromosome name and sliced.
genome = SeqIO.to_dict(
    SeqIO.parse(
        "/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa",
        "fasta",
    )
)

In [10]:
# ---------------------------------------------------------------------------
# Build reference/alternate sequences for every variant and export them,
# together with a k-mer tokenised table, to `output_path`.
# ---------------------------------------------------------------------------

# Seed for the random labels written to dev.tsv, so re-runs give the same file.
LABEL_SEED = 0

# --- Load and filter the intersected variant table -------------------------
df = pd.read_csv(data_path, sep='\t')
print(df.shape)
# Keep only rows annotated with fewer than three variations.
df = df[df['No_of_variations'] < 3].reset_index(drop=True)
print(df.shape)
# Drop rows whose alternative allele is the '.' placeholder.
df = df.loc[df['Alternative_Nucleotides'] != '.'].reset_index(drop=True)
print(df.shape)

# --- One output record per (variant, alternative allele) -------------------
data = []
n_missing_chrom = 0  # rows whose chromosome is not a key of `genome`
n_before_region = 0  # rows whose variant starts before the region start

for _, row in df.iterrows():
    chrom = row['chr_name']
    region_start = row['CoreProm_start']
    region_end = row['CoreProm_end']

    # Explicit membership test instead of a broad `except KeyError`: the old
    # handler appended to lists that were never defined and therefore raised
    # NameError. Rows without a matching sequence are skipped and counted.
    if chrom not in genome:
        n_missing_chrom += 1
        continue

    # Variant position relative to the fetched region sequence.
    variant_pos_start = row['Variant_start'] - region_start
    variant_pos_end = row['Variant_end'] - region_start

    # A negative start would make ref_seq[:variant_pos_start] slice from the
    # END of the sequence. Previously only non-empty alternatives were guarded;
    # the check now covers every alternative of the row, deletions included.
    if variant_pos_start < 0:
        n_before_region += 1
        continue

    chrom_seq = genome[chrom].seq
    ref_seq = str(chrom_seq[region_start:region_end])
    ref_allele = row['Reference_Nucleotide']
    alts = row['Alternative_Nucleotides'].rstrip(',').split(',')

    for alt in alts:
        # When the alternative is shorter than the reference allele (an empty
        # string means a full deletion), pad with the bases that follow the
        # region so the alternate sequence keeps the reference length.
        # NOTE(review): for insertions (alt longer than ref) nothing is
        # trimmed, so alt_seq ends up longer than ref_seq - confirm intended.
        pad_len = max(len(ref_allele) - len(alt), 0)
        extra_bases = str(chrom_seq[region_end:region_end + pad_len])
        alt_seq = (
            ref_seq[:variant_pos_start]
            + alt
            + ref_seq[variant_pos_end:]
            + extra_bases
        )

        # Column names (including their historical spellings) are part of the
        # output file schema and are therefore left untouched.
        data.append({
            'chr': chrom,
            'strand': row['Strand'],
            'CoreProm_coordinates': f"{region_start}-{region_end}",
            'Ensemble_Transcript_ID': row['Transcript_id'],
            'dbsnp_id': row['RS_ID'],
            'varinat_start': row['Variant_start'],
            'variant_end': row['Variant_end'],
            'ref_neucleotide': ref_allele,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq,
        })

if n_missing_chrom or n_before_region:
    print(
        f"skipped rows: {n_missing_chrom} with a chromosome missing from the genome, "
        f"{n_before_region} with a variant starting before the region"
    )

# --- De-duplicate and tokenise ---------------------------------------------
new_df = pd.DataFrame(data)
print(new_df.shape)
# NOTE: reset_index() without drop=True keeps the pre-deduplication row
# positions as an extra `index` column in the saved TSV; retained so the
# output schema stays the same as before.
new_df = new_df.drop_duplicates().reset_index()
print(new_df.shape)

# Interleave the sequences as ref_0, alt_0, ref_1, alt_1, ... in upper case.
merged_list = [
    seq.upper()
    for pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in pair
]
kmer_lst = [seq2kmer(seq) for seq in merged_list]
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])

# Labels are drawn at random - presumably placeholders required by the
# downstream file format (TODO confirm). The fixed seed makes them repeatable.
label_rng = np.random.default_rng(LABEL_SEED)
df_kmer['Label'] = label_rng.integers(0, 2, size=len(df_kmer))

# --- Export ----------------------------------------------------------------
new_df.to_csv(os.path.join(output_path, "all_data_new.tsv"), sep="\t", index=False)
df_kmer.to_csv(os.path.join(output_path, "dev.tsv"), sep="\t", index=False)

(5169, 13)
(5011, 13)
(5011, 13)
(5668, 11)
(5668, 12)


In [11]:
# Display the k-mer table built in the previous cell as a visual sanity check.
df_kmer

Unnamed: 0,Sequence,Label
0,TGGATG GGATGG GATGGT ATGGTC TGGTCA GGTCAT GTCA...,0
1,TGAATG GAATGG AATGGT ATGGTC TGGTCA GGTCAT GTCA...,0
2,TGGATG GGATGG GATGGT ATGGTC TGGTCA GGTCAT GTCA...,0
3,TGTATG GTATGG TATGGT ATGGTC TGGTCA GGTCAT GTCA...,1
4,TGGATG GGATGG GATGGT ATGGTC TGGTCA GGTCAT GTCA...,1
...,...,...
11331,TGGTCA GGTCAG GTCAGA TCAGAG CAGAGA AGAGAG GAGA...,0
11332,TGGTCA GGTCAG GTCAGA TCAGAG CAGAGA AGAGAG GAGA...,1
11333,TGGTCA GGTCAG GTCAGA TCAGAG CAGAGA AGAGAG GAGA...,0
11334,TGGTCA GGTCAG GTCAGA TCAGAG CAGAGA AGAGAG GAGA...,0
