In [7]:
import os, glob
import pandas as pd
from io import StringIO
import numpy as np
import pysam
import pybedtools 
from Bio import SeqIO
# Tell pybedtools to place its temporary files under /data/projects/temp.
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [8]:
# Tab-separated input table; it is loaded with pd.read_csv(..., sep='\t') further down.
data_path = '/data/projects/Enhancer/RECOMB_2024/Intersected_data/all_Enhancer_data_Consensus.tsv'
# Directory that receives the TSV files written at the end of the notebook.
output_path = "/data/projects/Enhancer/RECOMB_2024/SNP_data/Consensus_data/"

In [9]:
# Create the output directory together with any missing parents.
# exist_ok=True makes the call a no-op when the directory is already there and
# avoids the race inherent in a separate os.path.exists() check.
os.makedirs(output_path, exist_ok=True)

In [10]:
def seq2kmer(seq, k=6):
    """
    Convert a sequence into its overlapping k-mers.

    Arguments:
    seq -- str, original sequence.
    k -- int, length of each k-mer (default 6, the value previously hard-coded).

    Returns:
    kmers -- str, the len(seq) - k + 1 overlapping k-mers separated by a single
             space; an empty string when seq is shorter than k.

    Raises:
    ValueError -- if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    # A window starting at i covers seq[i:i+k]; the last valid start is len(seq) - k.
    return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))

In [11]:
# Parse every record of the FASTA file and hold them in a dict, so that later cells
# can look a record up by chromosome name (genome[name]) and slice its .seq.
genome = SeqIO.to_dict(SeqIO.parse("/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa", "fasta"))

In [12]:
# Read the tab-separated input table and report its size.
df = pd.read_csv(data_path, sep='\t')
print(df.shape)

# Keep only rows that report fewer than three variations.
few_variations_mask = df['No_of_variations'].lt(3)
df = df.loc[few_variations_mask].reset_index(drop=True)
print(df.shape)

# Discard rows whose alternative-nucleotide field is exactly '.'.
has_alt_allele_mask = df['Alternative_Nucleotides'].ne('.')
df = df[has_alt_allele_mask].reset_index(drop=True)
print(df.shape)


# Build one record per (variant, alternative allele): the enhancer reference sequence
# and the same sequence with the alternative allele substituted in.
data = []
skipped_missing_chrom = 0  # rows whose chr_name has no record in `genome`
skipped_upstream = 0       # rows whose variant starts before the enhancer window

# itertuples() avoids building a Series per row, which matters on a frame with
# millions of rows; columns are read as attributes.
for row in df.itertuples(index=False):
    chrom = row.chr_name
    if chrom not in genome:
        # The previous handler for this case appended to lists that are never
        # defined (NameError); such rows are now skipped and counted instead.
        skipped_missing_chrom += 1
        continue
    chrom_seq = genome[chrom].seq

    enh_start = row.Enhancer_start
    enh_end = row.Enhancer_end
    ref_nt = row.Reference_Nucleotide

    # Variant position relative to the fetched enhancer sequence.
    # NOTE(review): coordinates are used directly as Python slice offsets, i.e. they
    # are assumed to be 0-based, end-exclusive -- confirm against the input table.
    variant_pos_start = row.Variant_start - enh_start
    variant_pos_end = row.Variant_end - enh_start
    if variant_pos_start < 0:
        # A negative offset would make ref_seq[:variant_pos_start] slice from the
        # end of the string. This guard used to cover only non-empty alternatives;
        # it now applies to every alternative of the row.
        skipped_upstream += 1
        continue

    ref_seq = str(chrom_seq[enh_start:enh_end])
    enhancer_coordinates = str(enh_start) + '-' + str(enh_end)

    # The field is a comma-separated list with a trailing comma, e.g. "A,T,".
    alts = row.Alternative_Nucleotides.rstrip(',').split(',')
    for alt in alts:
        # When the alternative is shorter than the reference allele (including an
        # empty alternative, i.e. a full deletion), take the missing number of bases
        # from just past the enhancer end to keep the length. When it is longer
        # (insertion) nothing is added and alt_seq ends up longer than ref_seq.
        pad_len = max(len(ref_nt) - len(alt), 0)
        extra_bases = str(chrom_seq[enh_end:enh_end + pad_len])
        alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        # Key spellings ('varinat_start', '*_neucleotide') are kept unchanged because
        # they become the column names of the written table.
        data.append({
            'chr': chrom,
            'Enhancer_coordinates': enhancer_coordinates,
            'dbsnp_id': row.RS_ID,
            'varinat_start': row.Variant_start,
            'variant_end': row.Variant_end,
            'ref_neucleotide': ref_nt,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })

# Report skipped rows only when there are any, so a clean run prints nothing extra.
if skipped_missing_chrom or skipped_upstream:
    print("skipped rows -- chromosome not in genome: %d, variant starts before enhancer: %d"
          % (skipped_missing_chrom, skipped_upstream))

        
        
# Seed for the random 0/1 labels below, so that re-running the notebook writes an
# identical dev.tsv.
LABEL_SEED = 42

new_df = pd.DataFrame(data)
print(new_df.shape)
# NOTE(review): reset_index() without drop=True stores the pre-deduplication row
# number in an extra 'index' column. It is kept because that column is part of the
# written all_data_new.tsv -- confirm whether downstream readers need it.
new_df = new_df.drop_duplicates().reset_index()
print(new_df.shape)

# Interleave the sequences as ref_0, alt_0, ref_1, alt_1, ... and upper-case them.
merged_list = [
    seq.upper()
    for seq_pair in zip(new_df['reference_seq'], new_df['alt_seq'])
    for seq in seq_pair
]
kmer_lst = [seq2kmer(seq) for seq in merged_list]
df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])

# Labels are drawn at random and do not depend on the sequences (presumably
# placeholders). A local seeded generator is used instead of the global
# np.random state so the draw is reproducible.
label_rng = np.random.default_rng(LABEL_SEED)
df_kmer['Label'] = label_rng.choice([0, 1], size=len(df_kmer))

# os.path.join copes with the trailing slash of output_path (no '//' in the path).
new_df.to_csv(os.path.join(output_path, "all_data_new.tsv"), sep="\t", index=False)
df_kmer.to_csv(os.path.join(output_path, "dev.tsv"), sep="\t", index=False)

(6949537, 9)
(6854167, 9)
(6854164, 9)
(7738792, 9)
(7738792, 10)


In [13]:
# Show the filtered input table; the notebook renders a truncated head/tail view.
df

Unnamed: 0,chr_name,Enhancer_start,Enhancer_end,Variant_start,Variant_end,RS_ID,Reference_Nucleotide,No_of_variations,Alternative_Nucleotides
0,chr2,236984140,236984339,236984140,236984141,rs1423429845,C,1,"A,"
1,chr2,236984140,236984339,236984152,236984153,rs1192060657,G,1,"C,"
2,chr2,236984140,236984339,236984153,236984154,rs1040350809,G,1,"A,"
3,chr2,236984140,236984339,236984154,236984155,rs1238087442,C,1,"T,"
4,chr2,236984140,236984339,236984158,236984159,rs1574804327,T,1,"A,"
...,...,...,...,...,...,...,...,...,...
6854159,chr5,172241923,172242122,172242111,172242112,rs1343067098,C,1,"T,"
6854160,chr5,172241923,172242122,172242112,172242113,rs1772142686,C,1,"A,"
6854161,chr5,172241923,172242122,172242113,172242114,rs1772142740,T,1,"A,"
6854162,chr5,172241923,172242122,172242118,172242119,rs140804318,C,2,"A,T,"


In [14]:
# Show the per-allele table of reference/alternative sequences (truncated view).
new_df

Unnamed: 0,index,chr,Enhancer_coordinates,dbsnp_id,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,0,chr2,236984140-236984339,rs1423429845,236984140,236984141,C,A,CTTCTATTCATGGGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...,ATTCTATTCATGGGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...
1,1,chr2,236984140-236984339,rs1192060657,236984152,236984153,G,C,CTTCTATTCATGGGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...,CTTCTATTCATGCGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...
2,2,chr2,236984140-236984339,rs1040350809,236984153,236984154,G,A,CTTCTATTCATGGGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...,CTTCTATTCATGGACTTTTGTCAAGCTTCCAATTTAGGACTATTAC...
3,3,chr2,236984140-236984339,rs1238087442,236984154,236984155,C,T,CTTCTATTCATGGGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...,CTTCTATTCATGGGTTTTTGTCAAGCTTCCAATTTAGGACTATTAC...
4,4,chr2,236984140-236984339,rs1574804327,236984158,236984159,T,A,CTTCTATTCATGGGCTTTTGTCAAGCTTCCAATTTAGGACTATTAC...,CTTCTATTCATGGGCTTTAGTCAAGCTTCCAATTTAGGACTATTAC...
...,...,...,...,...,...,...,...,...,...,...
7738787,7738787,chr5,172241923-172242122,rs1772142686,172242112,172242113,C,A,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...
7738788,7738788,chr5,172241923-172242122,rs1772142740,172242113,172242114,T,A,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...
7738789,7738789,chr5,172241923-172242122,rs140804318,172242118,172242119,C,A,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...
7738790,7738790,chr5,172241923-172242122,rs140804318,172242118,172242119,C,T,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...,GGGAGGATCACTAGAGCCTGGGAGATGTAGGCTGCAGTGAGCTGCG...
