In [32]:
# Example input: a list holding one DNA sequence string (uppercase A/T/G/C).
DNASeq= ["ATGCCCCCTTTTTGGGGGGGGGGTTTGGTGTGTGATGTGACGTGCCCCCCTGTAGACTGAT"]

In [2]:
import numpy as np
import itertools
from sklearn.preprocessing import StandardScaler

In [3]:
# Motif info database.
# Every 5-mer is paired with its reverse complement so that both strands of a
# motif share one row of mer5_list: column 0 is the member of the pair that
# comes first in itertools.product order over `b`, column 1 is its reverse
# complement.
complement = str.maketrans('ATGC', 'TACG')
b = ['A', 'T', 'G', 'C']
mers5 = []     # one representative per reverse-complement pair
rc_mers5 = []  # rc_mers5[i] is the reverse complement of mers5[i]
paired = set()  # reverse complements already assigned to an earlier row
for bases in itertools.product(b, repeat=5):
    kmer = ''.join(bases)
    if kmer in paired:
        # This k-mer is the partner of an earlier representative; skip it
        # rather than deleting from a list that is being iterated over.
        continue
    rc_kmer = kmer[::-1].translate(complement)
    paired.add(rc_kmer)
    mers5.append(kmer)
    rc_mers5.append(rc_kmer)
# No 5-mer is its own reverse complement (the middle base would have to be its
# own complement), so the 1024 k-mers collapse into 512 rows of 2 columns.
mer5_list = np.column_stack((mers5, rc_mers5))


In [4]:
def compute_gc_content(sequence):
    """Return the fraction of characters in `sequence` that are 'G' or 'C'.

    Matching is case-sensitive: only uppercase 'G' and 'C' are counted, and
    the denominator is the full length of `sequence`, whatever characters it
    contains.

    Parameters
    ----------
    sequence : str
        DNA sequence.

    Returns
    -------
    float
        GC fraction between 0 and 1; 0.0 for an empty sequence.
    """
    if len(sequence) == 0:
        # An empty sequence has no bases; report 0.0 instead of dividing by zero.
        return 0.0
    gc_count = sequence.count('G') + sequence.count('C')
    return gc_count / len(sequence)

In [5]:
def compute_5mer_counts(sequence):
    """Count the 5-mers of `sequence`, pooling each 5-mer with its reverse complement.

    Every length-5 window of `sequence` is looked up in the module-level
    `mer5_list` table; a window that matches any entry of a row increments
    that row's count.

    Parameters
    ----------
    sequence : str
        DNA sequence. A sequence shorter than 5 characters has no windows and
        yields an all-zero result.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len(mer5_list)``; entry ``i`` is the number
        of windows that matched row ``i`` of `mer5_list`.

    Raises
    ------
    ValueError
        If a window is not present in `mer5_list` (for example because it
        contains 'N' or lowercase letters).
    """
    # Build the k-mer -> row lookup once per call so each window costs a dict
    # lookup instead of a scan over the whole table.
    row_of = {}
    for row, pair in enumerate(mer5_list):
        for mer in pair:
            row_of[str(mer)] = row

    motif_counts = np.zeros(len(mer5_list), dtype=int)
    for i in range(len(sequence) - 4):
        kmer = sequence[i:i+5]
        idx = row_of.get(kmer)
        if idx is None:
            # Same exception type the array lookup used to raise, but with a
            # message that names the offending window.
            raise ValueError(
                f"5-mer {kmer!r} at position {i} was not found in mer5_list"
            )
        motif_counts[idx] += 1
    return motif_counts

In [23]:
def seqToX(DNASeq_list):
    """Build the feature matrix for a collection of DNA sequences.

    Each row holds the pooled 5-mer counts from `compute_5mer_counts`
    followed by the GC fraction from `compute_gc_content`.

    Parameters
    ----------
    DNASeq_list : sequence
        One entry per sample. Each entry is either a record whose sequence
        string is stored under the 'sequence' key, or the sequence string
        itself.

    Returns
    -------
    X : numpy.ndarray
        Float array of shape ``(len(DNASeq_list), len(mer5_list) + 1)``.
    feature_names : list of str
        Column labels for `X`: the first-column k-mer of every `mer5_list`
        row, then 'gc_content'.
    """
    n_samples = len(DNASeq_list)
    # Derive the width from the motif table instead of hard-coding 512, so the
    # matrix always matches what compute_5mer_counts returns.
    n_features = len(mer5_list) + 1
    X = np.zeros((n_samples, n_features))  # shape: (samples, features)

    for i, currSeq in enumerate(DNASeq_list):
        # Accept bare strings as well as {'sequence': ...} records.
        seq = currSeq if isinstance(currSeq, str) else currSeq['sequence']
        X[i, :] = np.concatenate((compute_5mer_counts(seq), [compute_gc_content(seq)]))

    # str() turns the numpy string scalars into plain Python strings.
    feature_names = [str(mer) for mer in mer5_list[:, 0]] + ['gc_content']
    return X, feature_names

In [31]:
import json

# Wrap every raw sequence in the record layout that seqToX reads.
DNASeq_dicts = list(map(lambda raw_seq: {'sequence': raw_seq}, DNASeq))
X, feature_names = seqToX(DNASeq_dicts)

# Pair each feature name with its value for the first sequence, converted to
# plain Python floats so the mapping is JSON-serialisable.
first_row_values = map(float, X[0])
input_dict = dict(zip(feature_names, first_row_values))
print(json.dumps(input_dict, indent=2))


Feature matrix shape: (1, 513)
{
  "AAAAA": 1.0,
  "AAAAT": 0.0,
  "AAAAG": 1.0,
  "AAAAC": 0.0,
  "AAATA": 0.0,
  "AAATT": 0.0,
  "AAATG": 0.0,
  "AAATC": 0.0,
  "AAAGA": 0.0,
  "AAAGT": 0.0,
  "AAAGG": 1.0,
  "AAAGC": 0.0,
  "AAACA": 0.0,
  "AAACT": 0.0,
  "AAACG": 0.0,
  "AAACC": 1.0,
  "AATAA": 0.0,
  "AATAT": 0.0,
  "AATAG": 0.0,
  "AATAC": 0.0,
  "AATTA": 0.0,
  "AATTG": 0.0,
  "AATTC": 0.0,
  "AATGA": 0.0,
  "AATGT": 0.0,
  "AATGG": 0.0,
  "AATGC": 0.0,
  "AATCA": 0.0,
  "AATCT": 0.0,
  "AATCG": 0.0,
  "AATCC": 0.0,
  "AAGAA": 0.0,
  "AAGAT": 0.0,
  "AAGAG": 0.0,
  "AAGAC": 0.0,
  "AAGTA": 0.0,
  "AAGTT": 0.0,
  "AAGTG": 0.0,
  "AAGTC": 0.0,
  "AAGGA": 0.0,
  "AAGGT": 0.0,
  "AAGGG": 1.0,
  "AAGGC": 0.0,
  "AAGCA": 0.0,
  "AAGCT": 0.0,
  "AAGCG": 0.0,
  "AAGCC": 0.0,
  "AACAA": 0.0,
  "AACAT": 0.0,
  "AACAG": 0.0,
  "AACAC": 0.0,
  "AACTA": 0.0,
  "AACTG": 0.0,
  "AACTC": 0.0,
  "AACGA": 0.0,
  "AACGT": 0.0,
  "AACGG": 0.0,
  "AACGC": 0.0,
  "AACCA": 0.0,
  "AACCT": 0.0,
  "AACC