In [126]:
import joblib
import pandas as pd
import numpy as np
import ast

from tensorflow.keras.models import load_model

In [127]:
# ------------------------------------------------------------------------------
# 1. Load the Trained Model and Preprocessing Artifacts
# ------------------------------------------------------------------------------
model = load_model('dnshield_rnn_timestamp.h5')
print("Model loaded successfully.")

# Load label encoder for the target labels
le_label = joblib.load('label_encoder.pkl')
# Load domain encoders for top-level and second-level domains
le_tld = joblib.load('le_dns_top_level_domain.pkl')
le_sld = joblib.load('le_dns_second_level_domain.pkl')
# Load the fitted scaler
scaler = joblib.load('scaler.pkl')
print("Encoders and scaler loaded successfully.")



Model loaded successfully.
Encoders and scaler loaded successfully.


In [128]:
# ------------------------------------------------------------------------------
# 2. Define the Expected Final Feature Order (27 numeric features)
# ------------------------------------------------------------------------------
FINAL_ORDER = [
    'dns_domain_name_length',
    'numerical_percentage',
    'character_entropy',
    'max_continuous_numeric_len',
    'max_continuous_alphabet_len',
    'vowels_consonant_ratio',
    'conv_freq_vowels_consonants',
    'packets_numbers',
    'receiving_packets_numbers',
    'sending_packets_numbers',
    'receiving_bytes',
    'sending_bytes',
    'distinct_ttl_values',
    'ttl_values_min',
    'ttl_values_max',
    'ttl_values_mean',
    'dns_top_level_domain_encoded',
    'dns_second_level_domain_encoded',
    'uni_gram_count',
    'bi_gram_count',
    'tri_gram_count',
    'query_resource_record_type_count',
    'ans_resource_record_type_count',
    'query_resource_record_class_count',
    'ans_resource_record_class_count',
    'vowel_count',
    'consonant_count'
]


In [175]:
# sample_queries = [
#     {
#         'timestamp': '2019-10-08 14:50:34.984911',
#         'dns_top_level_domain': 'mstanleysec-us',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 18,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.5724312513221195,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 11,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 78,
#         'sending_bytes': 173,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 's', 't', 'a', 'n', 'l', 'e', 'y', 's', 'e', 'c', '-', 'u', 's', '.', 'c', 'o', 'm']",
#         'bi_gram_domain_name': "['ms', 'st', 'ta', 'an', 'nl', 'le', 'ey', 'ys', 'se', 'ec', 'c-', '-u', 'us', 's.', '.c', 'co', 'om']",
#         'tri_gram_domain_name': "['mst', 'sta', 'tan', 'anl', 'nle', 'ley', 'eys', 'yse', 'sec', 'ec-', 'c-u', '-us', 'us.', 's.c', '.co', 'com']",
#         'character_distribution': "{'n': 1, 'e': 2, 'y': 1, 'l': 1, 't': 1, 'u': 1, 'c': 2, 'm': 2, '.': 1, 'o': 1, '-': 1, 'a': 1, 's': 3}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.098372',
#         'dns_top_level_domain': 'mstanleysec-us',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 18,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.5724312513221195,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 11,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 78,
#         'sending_bytes': 173,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 's', 't', 'a', 'n', 'l', 'e', 'y', 's', 'e', 'c', '-', 'u', 's', '.', 'c', 'o', 'm']",
#         'bi_gram_domain_name': "['ms', 'st', 'ta', 'an', 'nl', 'le', 'ey', 'ys', 'se', 'ec', 'c-', '-u', 'us', 's.', '.c', 'co', 'om']",
#         'tri_gram_domain_name': "['mst', 'sta', 'tan', 'anl', 'nle', 'ley', 'eys', 'yse', 'sec', 'ec-', 'c-u', '-us', 'us.', 's.c', '.co', 'com']",
#         'character_distribution': "{'n': 1, 'e': 2, 'y': 1, 'l': 1, 't': 1, 'u': 1, 'c': 2, 'm': 2, '.': 1, 'o': 1, '-': 1, 'a': 1, 's': 3}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.209019',
#         'dns_top_level_domain': 'mufgunion',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 13,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.085055102756477,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 9,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 73,
#         'sending_bytes': 161,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'u', 'f', 'g', 'u', 'n', 'i', 'o', 'n', '.', 'c', 'o', 'm']",
#         'bi_gram_domain_name': "['mu', 'uf', 'fg', 'gu', 'un', 'ni', 'io', 'on', 'n.', '.c', 'co', 'om']",
#         'tri_gram_domain_name': "['muf', 'ufg', 'fgu', 'gun', 'uni', 'nio', 'ion', 'on.', 'n.c', '.co', 'com']",
#         'character_distribution': "{'n': 2, 'g': 1, 'i': 1, 'u': 2, 'c': 1, 'm': 2, '.': 1, 'o': 2, 'f': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.260235',
#         'dns_top_level_domain': 'mufgunion',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 13,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.085055102756477,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 9,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 73,
#         'sending_bytes': 161,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'u', 'f', 'g', 'u', 'n', 'i', 'o', 'n', '.', 'c', 'o', 'm']",
#         'bi_gram_domain_name': "['mu', 'uf', 'fg', 'gu', 'un', 'ni', 'io', 'on', 'n.', '.c', 'co', 'om']",
#         'tri_gram_domain_name': "['muf', 'ufg', 'fgu', 'gun', 'uni', 'nio', 'ion', 'on.', 'n.c', '.co', 'com']",
#         'character_distribution': "{'n': 2, 'g': 1, 'i': 1, 'u': 2, 'c': 1, 'm': 2, '.': 1, 'o': 2, 'f': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.429857',
#         'dns_top_level_domain': 'mufj-financialgroup',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 23,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.8279097821439705,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 14,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 83,
#         'sending_bytes': 166,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'u', 'f', 'j', '-', 'f', 'i', 'n', 'a', 'n', 'c', 'i', 'a', 'l', 'g', 'r', 'o', 'u', 'p', '.', 'c', 'o', 'm']",
#         'bi_gram_domain_name': "['mu', 'uf', 'fj', 'j-', '-f', 'fi', 'in', 'na', 'an', 'nc', 'ci', 'ia', 'al', 'lg', 'gr', 'ro', 'ou', 'up', 'p.', '.c', 'co', 'om']",
#         'tri_gram_domain_name': "['muf', 'ufj', 'fj-', 'j-f', '-fi', 'fin', 'ina', 'nan', 'anc', 'nci', 'cia', 'ial', 'alg', 'lgr', 'gro', 'rou', 'oup', 'up.', 'p.c', '.co', 'com']",
#         'character_distribution': "{'n': 2, 'p': 1, 'j': 1, 'l': 1, 'g': 1, 'i': 2, 'u': 2, 'r': 1, 'c': 2, 'm': 2, '.': 1, 'o': 2, '-': 1, 'a': 2, 'f': 2}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.535907',
#         'dns_top_level_domain': 'mufj-financialgroup',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 23,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.8279097821439705,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 14,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 83,
#         'sending_bytes': 166,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'u', 'f', 'j', '-', 'f', 'i', 'n', 'a', 'n', 'c', 'i', 'a', 'l', 'g', 'r', 'o', 'u', 'p', '.', 'c', 'o', 'm']",
#         'bi_gram_domain_name': "['mu', 'uf', 'fj', 'j-', '-f', 'fi', 'in', 'na', 'an', 'nc', 'ci', 'ia', 'al', 'lg', 'gr', 'ro', 'ou', 'up', 'p.', '.c', 'co', 'om']",
#         'tri_gram_domain_name': "['muf', 'ufj', 'fj-', 'j-f', '-fi', 'fin', 'ina', 'nan', 'anc', 'nci', 'cia', 'ial', 'alg', 'lgr', 'gro', 'rou', 'oup', 'up.', 'p.c', '.co', 'com']",
#         'character_distribution': "{'n': 2, 'p': 1, 'j': 1, 'l': 1, 'g': 1, 'i': 2, 'u': 2, 'r': 1, 'c': 2, 'm': 2, '.': 1, 'o': 2, '-': 1, 'a': 2, 'f': 2}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.650655',
#         'dns_top_level_domain': 'mutualdeep',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 14,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.3248629576173574,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 10,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 74,
#         'sending_bytes': 139,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'u', 't', 'u', 'a', 'l', 'd', 'e', 'e', 'p', '.', 'i', 'c', 'u']",
#         'bi_gram_domain_name': "['mu', 'ut', 'tu', 'ua', 'al', 'ld', 'de', 'ee', 'ep', 'p.', '.i', 'ic', 'cu']",
#         'tri_gram_domain_name': "['mut', 'utu', 'tua', 'ual', 'ald', 'lde', 'dee', 'eep', 'ep.', 'p.i', '.ic', 'icu']",
#         'character_distribution': "{'n': 1, 'e': 2, 'y': 1, 'l': 1, 't': 1, 'i': 1, 'u': 3, 'c': 1, 'm': 1, '.': 1, 'a': 1, 'd': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.712807',
#         'dns_top_level_domain': 'mutualdeep',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 14,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.3248629576173574,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 10,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 74,
#         'sending_bytes': 139,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'u', 't', 'u', 'a', 'l', 'd', 'e', 'e', 'p', '.', 'i', 'c', 'u']",
#         'bi_gram_domain_name': "['mu', 'ut', 'tu', 'ua', 'al', 'ld', 'de', 'ee', 'ep', 'p.', '.i', 'ic', 'cu']",
#         'tri_gram_domain_name': "['mut', 'utu', 'tua', 'ual', 'ald', 'lde', 'dee', 'eep', 'ep.', 'p.i', '.ic', 'icu']",
#         'character_distribution': "{'n': 1, 'e': 2, 'y': 1, 'l': 1, 't': 1, 'i': 1, 'u': 3, 'c': 1, 'm': 1, '.': 1, 'a': 1, 'd': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.783108',
#         'dns_top_level_domain': 'my-id-telstra-com-au',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 25,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.923856189774724,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 7,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 85,
#         'sending_bytes': 145,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'y', '-', 'i', 'd', '-', 't', 'e', 'l', 's', 't', 'r', 'a', '-', 'c', 'o', 'm', '-', 'a', 'u', '.', 'i', 'n', 'f', 'o']",
#         'bi_gram_domain_name': "['my', 'y-', '-i', 'id', 'd-', '-t', 'te', 'el', 'ls', 'st', 'tr', 'ra', 'a-', '-c', 'co', 'om', 'm-', '-a', 'au', 'u.', '.i', 'in', 'nf', 'fo']",
#         'tri_gram_domain_name': "['my-', 'y-i', '-id', 'id-', 'd-t', '-te', 'tel', 'els', 'lst', 'str', 'tra', 'ra-', 'a-c', '-co', 'com', 'om-', 'm-a', '-au', 'au.', 'u.i', '.in', 'inf', 'nfo']",
#         'character_distribution': "{'n': 1, 'e': 1, 'y': 1, 'l': 1, 't': 2, 'i': 2, 'u': 1, 'r': 1, 'c': 1, 'm': 2, '.': 1, 'o': 2, '-': 4, 'a': 2, 's': 1, 'd': 1, 'f': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.860589',
#         'dns_top_level_domain': 'my-id-telstra-com-au',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 25,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.923856189774724,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 7,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 85,
#         'sending_bytes': 145,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'y', '-', 'i', 'd', '-', 't', 'e', 'l', 's', 't', 'r', 'a', '-', 'c', 'o', 'm', '-', 'a', 'u', '.', 'i', 'n', 'f', 'o']",
#         'bi_gram_domain_name': "['my', 'y-', '-i', 'id', 'd-', '-t', 'te', 'el', 'ls', 'st', 'tr', 'ra', 'a-', '-c', 'co', 'om', 'm-', '-a', 'au', 'u.', '.i', 'in', 'nf', 'fo']",
#         'tri_gram_domain_name': "['my-', 'y-i', '-id', 'id-', 'd-t', '-te', 'tel', 'els', 'lst', 'str', 'tra', 'ra-', 'a-c', '-co', 'com', 'om-', 'm-a', '-au', 'au.', 'u.i', '.in', 'inf', 'nfo']",
#         'character_distribution': "{'n': 1, 'e': 1, 'y': 1, 'l': 1, 't': 2, 'i': 2, 'u': 1, 'r': 1, 'c': 1, 'm': 2, '.': 1, 'o': 2, '-': 4, 'a': 2, 's': 1, 'd': 1, 'f': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:35.952993',
#         'dns_top_level_domain': 'mysecretcbds',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 17,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.734521664779752,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 12,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 77,
#         'sending_bytes': 137,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'y', 's', 'e', 'c', 'r', 'e', 't', 'c', 'b', 'd', 's', '.', 'i', 'n', 'f', 'o']",
#         'bi_gram_domain_name': "['my', 'ys', 'se', 'ec', 'cr', 're', 'et', 'tc', 'cb', 'bd', 'ds', 's.', '.i', 'in', 'nf', 'fo']",
#         'tri_gram_domain_name': "['mys', 'yse', 'sec', 'ecr', 'cre', 'ret', 'etc', 'tcb', 'cbd', 'bds', 'ds.', 's.i', '.in', 'inf', 'nfo']",
#         'character_distribution': "{'n': 1, 'e': 2, 'y': 1, 't': 1, 'i': 1, 'r': 1, 'c': 2, 'm': 1, '.': 1, 'o': 1, 's': 2, 'd': 1, 'f': 1, 'b': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:36.053868',
#         'dns_top_level_domain': 'mysecretcbds',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 17,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.734521664779752,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 12,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 77,
#         'sending_bytes': 137,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'y', 's', 'e', 'c', 'r', 'e', 't', 'c', 'b', 'd', 's', '.', 'i', 'n', 'f', 'o']",
#         'bi_gram_domain_name': "['my', 'ys', 'se', 'ec', 'cr', 're', 'et', 'tc', 'cb', 'bd', 'ds', 's.', '.i', 'in', 'nf', 'fo']",
#         'tri_gram_domain_name': "['mys', 'yse', 'sec', 'ecr', 'cre', 'ret', 'etc', 'tcb', 'cbd', 'bds', 'ds.', 's.i', '.in', 'inf', 'nfo']",
#         'character_distribution': "{'n': 1, 'e': 2, 'y': 1, 't': 1, 'i': 1, 'r': 1, 'c': 2, 'm': 1, '.': 1, 'o': 1, 's': 2, 'd': 1, 'f': 1, 'b': 1}"
#     },
#     {
#         'timestamp': '2019-10-08 14:50:41.103145',
#         'dns_top_level_domain': 'mystripe',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 15,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.640223928941851,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 8,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 75,
#         'sending_bytes': 75,
#         'distinct_ttl_values': 0,
#         'ttl_values_min': -1,
#         'ttl_values_max': -1,
#         'ttl_values_mean': -1.0,
#         'query_resource_record_type': '[]',
#         'ans_resource_record_type': '[]',
#         'query_resource_record_class': '[]',
#         'ans_resource_record_class': '[]',
#         'uni_gram_domain_name': "['m', 'y', 's', 't', 'r', 'i', 'p', 'e', '.', 'a', 'g', 'e', 'n', 'c', 'y']",
#         'bi_gram_domain_name': "['my', 'ys', 'st', 'tr', 'ri', 'ip', 'pe', 'e.', '.a', 'ag', 'ge', 'en', 'nc', 'cy']",
#         'tri_gram_domain_name': "['mys', 'yst', 'str', 'tri', 'rip', 'ipe', 'pe.', 'e.a', '.ag', 'age', 'gen', 'enc', 'ncy']",
#         'character_distribution': "{'n': 1, 'e': 2, 'p': 1, 'y': 2, 'g': 1, 't': 1, 'i': 1, 'r': 1, 'c': 1, 'm': 1, '.': 1, 'a': 1, 's': 1}"
#     }
# ]


# sample_queries = [
#     {
#         'timestamp': '2023-11-15 12:00:00',
#         'dns_top_level_domain': 'google',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 11,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.2,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 5,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 71,
#         'sending_bytes': 87,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 29,
#         'ttl_values_max': 29,
#         'ttl_values_mean': 29.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['g','o','o','g','l','e','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['go','oo','og','gl','le','e.','.c','co','om']",
#         'tri_gram_domain_name': "['goo','oog','ogl','gle','le.','e.c','.co','com']",
#         'character_distribution': "{'g':1,'o':2,'l':1,'e':1,'.':1,'c':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:05:00',
#         'dns_top_level_domain': 'apple',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 11,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.0,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 5,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 70,
#         'sending_bytes': 85,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['a','p','p','l','e','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['ap','pp','pl','le','e.','.c','co','om']",
#         'tri_gram_domain_name': "['app','ppl','ple','le.','e.c','.co','com']",
#         'character_distribution': "{'a':1,'p':2,'l':1,'e':1,'.':1,'c':1,'o':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:10:00',
#         'dns_top_level_domain': 'microsoft',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 15,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.5,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 8,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 72,
#         'sending_bytes': 90,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 28,
#         'ttl_values_max': 28,
#         'ttl_values_mean': 28.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['m','i','c','r','o','s','o','f','t','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['mi','ic','cr','ro','os','so','of','ft','t.','.c','co','om']",
#         'tri_gram_domain_name': "['mic','icr','cro','ros','oso','sof','oft','ft.','t.c','.co','com']",
#         'character_distribution': "{'m':1,'i':1,'c':1,'r':1,'o':2,'s':1,'f':1,'t':1,'.':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:15:00',
#         'dns_top_level_domain': 'amazon',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 12,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.1,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 6,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 68,
#         'sending_bytes': 88,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['a','m','a','z','o','n','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['am','ma','az','zo','on','n.','.c','co','om']",
#         'tri_gram_domain_name': "['ama','maz','azo','zon','on.','n.c','.co','com']",
#         'character_distribution': "{'a':2,'m':1,'z':1,'o':1,'n':1,'.':1,'c':1,'o':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:20:00',
#         'dns_top_level_domain': 'facebook',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 13,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.3,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 7,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 73,
#         'sending_bytes': 89,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['f','a','c','e','b','o','o','k','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['fa','ac','ce','eb','bo','oo','ok','k.','.c','co','om']",
#         'tri_gram_domain_name': "['fac','ace','ceb','ebo','boo','ook','ok.','k.c','.co','com']",
#         'character_distribution': "{'f':1,'a':1,'c':1,'e':2,'b':1,'o':2,'k':1,'.':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:25:00',
#         'dns_top_level_domain': 'wikipedia',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 16,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.4,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 8,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 74,
#         'sending_bytes': 90,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['w','i','k','i','p','e','d','i','a','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['wi','ik','ki','ip','pe','ed','di','ia','a.','.c','co','om']",
#         'tri_gram_domain_name': "['wik','iki','kip','ipe','ped','edi','dia','ia.','.co','com']",
#         'character_distribution': "{'w':1,'i':3,'k':1,'p':1,'e':1,'d':1,'a':1,'.':1,'c':1,'o':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:30:00',
#         'dns_top_level_domain': 'yahoo',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 12,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.1,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 6,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 72,
#         'sending_bytes': 86,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['y','a','h','o','o','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['ya','ah','ho','oo','o.','.c','co','om']",
#         'tri_gram_domain_name': "['yah','aho','hoo','oo.','o.c','.co','com']",
#         'character_distribution': "{'y':1,'a':1,'h':1,'o':2,'.':1,'c':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:35:00',
#         'dns_top_level_domain': 'bing',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 11,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.0,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 5,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 71,
#         'sending_bytes': 87,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 29,
#         'ttl_values_max': 29,
#         'ttl_values_mean': 29.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['b','i','n','g','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['bi','in','ng','g.','.c','co','om']",
#         'tri_gram_domain_name': "['bin','ing','ng.','g.c','.co','com']",
#         'character_distribution': "{'b':1,'i':1,'n':1,'g':1,'.':1,'c':1,'o':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:40:00',
#         'dns_top_level_domain': 'linkedin',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 15,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.4,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 7,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 73,
#         'sending_bytes': 89,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['l','i','n','k','e','d','i','n','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['li','in','nk','ke','ed','di','in','n.','.c','co','om']",
#         'tri_gram_domain_name': "['lin','ink','nke','ked','edi','din','in.','n.c','.co','com']",
#         'character_distribution': "{'l':1,'i':2,'n':2,'k':1,'e':1,'d':1,'.':1,'c':1,'o':1,'m':1}"
#     },
#     {
#         'timestamp': '2023-11-15 12:45:00',
#         'dns_top_level_domain': 'netflix',
#         'dns_second_level_domain': 'not-found',
#         'dns_domain_name_length': 14,
#         'numerical_percentage': 0.0,
#         'character_entropy': 3.3,
#         'max_continuous_numeric_len': 0,
#         'max_continuous_alphabet_len': 6,
#         'packets_numbers': 2,
#         'receiving_packets_numbers': 1,
#         'sending_packets_numbers': 1,
#         'receiving_bytes': 72,
#         'sending_bytes': 88,
#         'distinct_ttl_values': 1,
#         'ttl_values_min': 30,
#         'ttl_values_max': 30,
#         'ttl_values_mean': 30.0,
#         'query_resource_record_type': "[]",
#         'ans_resource_record_type': "[1]",
#         'query_resource_record_class': "[]",
#         'ans_resource_record_class': "[1]",
#         'uni_gram_domain_name': "['n','e','t','f','l','i','x','.', 'c','o','m']",
#         'bi_gram_domain_name':  "['ne','et','tf','fl','li','ix','x.','.c','co','om']",
#         'tri_gram_domain_name': "['net','etf','tfl','fli','lix','ix.','x.c','.co','com']",
#         'character_distribution': "{'n':1,'e':1,'t':1,'f':1,'l':1,'i':1,'x':1,'.':1,'c':1,'o':1,'m':1}"
#     }
# ]

sample_queries = [
    {
        'timestamp': '2019-10-15 21:58:50.240620',
        'dns_top_level_domain': 'amazonaws',
        'dns_second_level_domain': 's3.amazonaws',
        'dns_domain_name_length': 25,
        'numerical_percentage': 0.04,
        'character_entropy': 3.7034651896016464,
        'max_continuous_numeric_len': 1,
        'max_continuous_alphabet_len': 9,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 85,
        'sending_bytes': 187,
        'distinct_ttl_values': 1,
        'ttl_values_min': 2213,
        'ttl_values_max': 2213,
        'ttl_values_mean': 2213.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[5]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[1]",
        'uni_gram_domain_name': "['d', 'k', '-', 'm', 'e', 'd', 'i', 'a', '.', 's', '3', '.', 'a', 'm', 'a', 'z', 'o', 'n', 'a', 'w', 's', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['dk', 'k-', '-m', 'me', 'ed', 'di', 'ia', 'a.', '.s', 's3', '3.', '.a', 'am', 'ma', 'az', 'zo', 'on', 'na', 'aw', 'ws', 's.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['dk-', 'k-m', '-me', 'med', 'edi', 'dia', 'ia.', 'a.s', '.s3', 's3.', '3.a', '.am', 'ama', 'maz', 'azo', 'zon', 'ona', 'naw', 'aws', 'ws.', 's.c', '.co', 'com']",
        'character_distribution': "{'m': 3, 's': 2, 'w': 1, 'o': 2, 'k': 1, '-': 1, '3': 1, 'z': 1, '.': 3, 'e': 1, 'd': 2, 'a': 4, 'i': 1, 'n': 1, 'c': 1}"
    },
    {
        'timestamp': '2019-10-15 21:58:51.347241',
        'dns_top_level_domain': 'peakbizperformance',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 21,
        'numerical_percentage': 0.0,
        'character_entropy': 3.653756708287001,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 18,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 81,
        'sending_bytes': 137,
        'distinct_ttl_values': 1,
        'ttl_values_min': 299,
        'ttl_values_max': 299,
        'ttl_values_mean': 299.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[28, 28]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[1, 1]",
        'uni_gram_domain_name': "['p', 'e', 'a', 'k', 'b', 'i', 'z', 'p', 'e', 'r', 'f', 'o', 'r', 'm', 'a', 'n', 'c', 'e', '.', 'c', 'a']",
        'bi_gram_domain_name': "['pe', 'ea', 'ak', 'kb', 'bi', 'iz', 'zp', 'pe', 'er', 'rf', 'fo', 'or', 'rm', 'ma', 'an', 'nc', 'ce', 'e.', '.c', 'ca']",
        'tri_gram_domain_name': "['pea', 'eak', 'akb', 'kbi', 'biz', 'izp', 'zpe', 'per', 'erf', 'rfo', 'for', 'orm', 'rma', 'man', 'anc', 'nce', 'ce.', 'e.c', '.ca']",
        'character_distribution': "{'m': 1, 'o': 1, 'k': 1, 'f': 1, 'z': 1, 'p': 2, 'e': 3, '.': 1, 'a': 3, 'b': 1, 'i': 1, 'r': 2, 'n': 1, 'c': 2}"
    },
    {
        'timestamp': '2019-10-15 21:58:53.541944',
        'dns_top_level_domain': 'yahoo',
        'dns_second_level_domain': 'mail.yahoo',
        'dns_domain_name_length': 14,
        'numerical_percentage': 0.0,
        'character_entropy': 3.039148671903071,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 5,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 74,
        'sending_bytes': 179,
        'distinct_ttl_values': 2,
        'ttl_values_min': 209,
        'ttl_values_max': 297,
        'ttl_values_mean': 267.66666667,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[5, 28, 28]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[1, 1, 1]",
        'uni_gram_domain_name': "['m', 'a', 'i', 'l', '.', 'y', 'a', 'h', 'o', 'o', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['ma', 'ai', 'il', 'l.', '.y', 'ya', 'ah', 'ho', 'oo', 'o.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['mai', 'ail', 'il.', 'l.y', '.ya', 'yah', 'aho', 'hoo', 'oo.', 'o.c', '.co', 'com']",
        'character_distribution': "{'m': 2, 'o': 3, 'h': 1, '.': 2, 'l': 1, 'a': 2, 'y': 1, 'i': 1, 'c': 1}"
    },
    {
        'timestamp': '2019-10-15 21:58:54.500797',
        'dns_top_level_domain': 'juanthradio',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 15,
        'numerical_percentage': 0.0,
        'character_entropy': 3.640223928941852,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 11,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 75,
        'sending_bytes': 154,
        'distinct_ttl_values': 0,
        'ttl_values_min': -1,
        'ttl_values_max': -1,
        'ttl_values_mean': -1.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[]",
        'uni_gram_domain_name': "['j', 'u', 'a', 'n', 't', 'h', 'r', 'a', 'd', 'i', 'o', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['ju', 'ua', 'an', 'nt', 'th', 'hr', 'ra', 'ad', 'di', 'io', 'o.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['jua', 'uan', 'ant', 'nth', 'thr', 'hra', 'rad', 'adi', 'dio', 'io.', 'o.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 'o': 2, 'c': 1, 't': 1, '.': 1, 'u': 1, 'd': 1, 'a': 2, 'h': 1, 'i': 1, 'r': 1, 'n': 1, 'j': 1}"
    },
    {
        'timestamp': '2019-10-15 21:58:55.861618',
        'dns_top_level_domain': 'jalfre',
        'dns_second_level_domain': 'www.jalfre',
        'dns_domain_name_length': 14,
        'numerical_percentage': 0.0,
        'character_entropy': 3.3248629576173574,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 6,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 74,
        'sending_bytes': 135,
        'distinct_ttl_values': 0,
        'ttl_values_min': -1,
        'ttl_values_max': -1,
        'ttl_values_mean': -1.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[]",
        'uni_gram_domain_name': "['w', 'w', 'w', '.', 'j', 'a', 'l', 'f', 'r', 'e', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['ww', 'ww', 'w.', '.j', 'ja', 'al', 'lf', 'fr', 're', 'e.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['www', 'ww.', 'w.j', '.ja', 'jal', 'alf', 'lfr', 'fre', 're.', 'e.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 'w': 3, 'e': 1, 'o': 1, 'f': 1, '.': 2, 'l': 1, 'a': 1, 'r': 1, 'c': 1, 'j': 1}"
    },
    {
        'timestamp': '2019-10-15 21:58:57.167135',
        'dns_top_level_domain': 'oncue1',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 10,
        'numerical_percentage': 0.1,
        'character_entropy': 2.9219280948873623,
        'max_continuous_numeric_len': 1,
        'max_continuous_alphabet_len': 5,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 70,
        'sending_bytes': 149,
        'distinct_ttl_values': 0,
        'ttl_values_min': -1,
        'ttl_values_max': -1,
        'ttl_values_mean': -1.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[]",
        'uni_gram_domain_name': "['o', 'n', 'c', 'u', 'e', '1', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['on', 'nc', 'cu', 'ue', 'e1', '1.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['onc', 'ncu', 'cue', 'ue1', 'e1.', '1.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 'o': 2, '1': 1, '.': 1, 'e': 1, 'u': 1, 'n': 1, 'c': 2}"
    },
    {
        'timestamp': '2019-10-15 21:58:58.181052',
        'dns_top_level_domain': 'alsultanah',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 18,
        'numerical_percentage': 0.0,
        'character_entropy': 3.4193819456463714,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 10,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 78,
        'sending_bytes': 171,
        'distinct_ttl_values': 1,
        'ttl_values_min': 8359,
        'ttl_values_max': 8359,
        'ttl_values_mean': 8359.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[5]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[1]",
        'uni_gram_domain_name': "['w', 'w', 'w', '.', 'a', 'l', 's', 'u', 'l', 't', 'a', 'n', 'a', 'h', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['ww', 'ww', 'w.', '.a', 'al', 'ls', 'su', 'ul', 'lt', 'ta', 'an', 'na', 'ah', 'h.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['www', 'ww.', 'w.a', '.al', 'als', 'lsu', 'sul', 'ult', 'lta', 'tan', 'ana', 'nah', 'ah.', 'h.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 's': 1, 'w': 3, 'o': 1, 't': 1, '.': 2, 'l': 2, 'u': 1, 'a': 3, 'h': 1, 'n': 1, 'c': 1}"
    },
    {
        'timestamp': '2019-10-15 21:58:59.586744',
        'dns_top_level_domain': 'kathymkcannon',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 17,
        'numerical_percentage': 0.0,
        'character_entropy': 3.219528282299548,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 13,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 77,
        'sending_bytes': 142,
        'distinct_ttl_values': 0,
        'ttl_values_min': -1,
        'ttl_values_max': -1,
        'ttl_values_mean': -1.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[]",
        'uni_gram_domain_name': "['k', 'a', 't', 'h', 'y', 'm', 'k', 'c', 'a', 'n', 'n', 'o', 'n', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['ka', 'at', 'th', 'hy', 'ym', 'mk', 'kc', 'ca', 'an', 'nn', 'no', 'on', 'n.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['kat', 'ath', 'thy', 'hym', 'ymk', 'mkc', 'kca', 'can', 'ann', 'nno', 'non', 'on.', 'n.c', '.co', 'com']",
        'character_distribution': "{'m': 2, 'y': 1, 'o': 2, 'k': 2, 'n': 3, 't': 1, '.': 1, 'a': 2, 'h': 1, 'c': 2}"
    },
    {
        'timestamp': '2019-10-15 21:59:01.732376',
        'dns_top_level_domain': 'vivirenchina',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 16,
        'numerical_percentage': 0.0,
        'character_entropy': 3.327819531114783,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 12,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 76,
        'sending_bytes': 144,
        'distinct_ttl_values': 0,
        'ttl_values_min': -1,
        'ttl_values_max': -1,
        'ttl_values_mean': -1.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[]",
        'uni_gram_domain_name': "['v', 'i', 'v', 'i', 'r', 'e', 'n', 'c', 'h', 'i', 'n', 'a', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['vi', 'iv', 'vi', 'ir', 're', 'en', 'nc', 'ch', 'hi', 'in', 'na', 'a.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['viv', 'ivi', 'vir', 'ire', 'ren', 'enc', 'nch', 'chi', 'hin', 'ina', 'na.', 'a.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 'o': 1, 'v': 2, '.': 1, 'e': 1, 'a': 1, 'h': 1, 'i': 3, 'r': 1, 'n': 2, 'c': 2}"
    },
    {
        'timestamp': '2019-10-15 21:59:03.550739',
        'dns_top_level_domain': 'clone-site',
        'dns_second_level_domain': 'edb.clone-site',
        'dns_domain_name_length': 18,
        'numerical_percentage': 0.0,
        'character_entropy': 3.5724312513221195,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 5,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 78,
        'sending_bytes': 140,
        'distinct_ttl_values': 0,
        'ttl_values_min': -1,
        'ttl_values_max': -1,
        'ttl_values_mean': -1.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[]",
        'uni_gram_domain_name': "['e', 'd', 'b', '.', 'c', 'l', 'o', 'n', 'e', '-', 's', 'i', 't', 'e', '.', 'b', 'i', 'z']",
        'bi_gram_domain_name': "['ed', 'db', 'b.', '.c', 'cl', 'lo', 'on', 'ne', 'e-', '-s', 'si', 'it', 'te', 'e.', '.b', 'bi', 'iz']",
        'tri_gram_domain_name': "['edb', 'db.', 'b.c', '.cl', 'clo', 'lon', 'one', 'ne-', 'e-s', '-si', 'sit', 'ite', 'te.', 'e.b', '.bi', 'biz']",
        'character_distribution': "{'s': 1, 'o': 1, '-', 1, 'n': 1, 't': 1, 'z': 1, 'l': 1, '.': 2, 'e': 3, 'd': 1, 'b': 2, 'i': 2, 'c': 1}"
    },
    {
        'timestamp': '2019-10-15 21:59:11.003020',
        'dns_top_level_domain': 'njpear',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 10,
        'numerical_percentage': 0.0,
        'character_entropy': 3.321928094887362,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 6,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 70,
        'sending_bytes': 86,
        'distinct_ttl_values': 1,
        'ttl_values_min': 21599,
        'ttl_values_max': 21599,
        'ttl_values_mean': 21599.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[1]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[1]",
        'uni_gram_domain_name': "['n', 'j', 'p', 'e', 'a', 'r', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['nj', 'jp', 'pe', 'ea', 'ar', 'r.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['njp', 'jpe', 'pea', 'ear', 'ar.', 'r.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 'o': 1, 'c': 1, 'p': 1, 'e': 1, '.': 1, 'a': 1, 'r': 1, 'n': 1, 'j': 1}"
    },
    {
        'timestamp': '2019-10-15 21:59:14.648148',
        'dns_top_level_domain': 'stsgroupbd',
        'dns_second_level_domain': 'not-found',
        'dns_domain_name_length': 14,
        'numerical_percentage': 0.0,
        'character_entropy': 3.521640636343319,
        'max_continuous_numeric_len': 0,
        'max_continuous_alphabet_len': 10,
        'packets_numbers': 2,
        'receiving_packets_numbers': 1,
        'sending_packets_numbers': 1,
        'receiving_bytes': 74,
        'sending_bytes': 90,
        'distinct_ttl_values': 1,
        'ttl_values_min': 12298,
        'ttl_values_max': 12298,
        'ttl_values_mean': 12298.0,
        'query_resource_record_type': "[]",
        'ans_resource_record_type': "[1]",
        'query_resource_record_class': "[]",
        'ans_resource_record_class': "[1]",
        'uni_gram_domain_name': "['s', 't', 's', 'g', 'r', 'o', 'u', 'p', 'b', 'd', '.', 'c', 'o', 'm']",
        'bi_gram_domain_name': "['st', 'ts', 'sg', 'gr', 'ro', 'ou', 'up', 'pb', 'bd', 'd.', '.c', 'co', 'om']",
        'tri_gram_domain_name': "['sts', 'tsg', 'sgr', 'gro', 'rou', 'oup', 'upb', 'pbd', 'bd.', 'd.c', '.co', 'com']",
        'character_distribution': "{'m': 1, 's': 2, 'o': 2, 'g': 1, 't': 1, 'p': 1, '.': 1, 'u': 1, 'd': 1, 'b': 1, 'r': 1, 'c': 1}"
    }
]


In [176]:
# ------------------------------------------------------------------------------
# 4. Helper Functions for Parsing & Transformation
# ------------------------------------------------------------------------------
def parse_list_safe(val):
    """Converts a string representation of a list to an actual list."""
    if isinstance(val, list):
        return val
    elif isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return []
    else:
        return []

def parse_ngram_count(val):
    """Convert an n-gram column (string or list) into its count (length)."""
    lst = parse_list_safe(val)
    return len(lst)

def parse_resource_record_counts(val):
    """Convert a resource record column (string or list) into the count of unique values."""
    lst = parse_list_safe(val)
    return len(set(lst)) if isinstance(lst, list) else 0

def parse_vowel_consonant_counts(dist_str):
    """Extract vowel_count and consonant_count from a string representation of a dictionary."""
    vowels = set('aeiou')
    consonants = set('bcdfghjklmnpqrstvwxyz')
    vowel_count, consonant_count = 0, 0
    try:
        d = ast.literal_eval(dist_str)
    except:
        d = {}
    if isinstance(d, dict):
        for ch, count in d.items():
            if ch in vowels:
                vowel_count += count
            elif ch in consonants:
                consonant_count += count
    return vowel_count, consonant_count

In [177]:
# ------------------------------------------------------------------------------
# 5. Inference Preprocessing Function
# ------------------------------------------------------------------------------
def preprocess_single_dns_query(query_dict, le_tld, le_sld, scaler, sequence_length=10):
    """
    Transforms a single DNS query (given as a dictionary) into a numpy array of shape
    (1, sequence_length, num_features) that matches the training pipeline.
    """
    # Convert query_dict into a single-row DataFrame
    df = pd.DataFrame([query_dict])
    
    # -- Process Domain Columns --
    df['dns_top_level_domain'] = df['dns_top_level_domain'].fillna('unknown')
    valid_tld = set(le_tld.classes_)
    df['dns_top_level_domain'] = df['dns_top_level_domain'].apply(lambda x: x if x in valid_tld else 'unknown')
    df['dns_top_level_domain_encoded'] = le_tld.transform(df['dns_top_level_domain'])
    
    df['dns_second_level_domain'] = df['dns_second_level_domain'].fillna('unknown')
    valid_sld = set(le_sld.classes_)
    df['dns_second_level_domain'] = df['dns_second_level_domain'].apply(lambda x: x if x in valid_sld else 'unknown')
    df['dns_second_level_domain_encoded'] = le_sld.transform(df['dns_second_level_domain'])
    
    df.drop(['dns_top_level_domain', 'dns_second_level_domain'], axis=1, inplace=True, errors='ignore')
    
    # -- Process n-gram Columns to Counts --
    df['uni_gram_count'] = df['uni_gram_domain_name'].apply(parse_ngram_count)
    df['bi_gram_count']  = df['bi_gram_domain_name'].apply(parse_ngram_count)
    df['tri_gram_count'] = df['tri_gram_domain_name'].apply(parse_ngram_count)
    df.drop(['uni_gram_domain_name', 'bi_gram_domain_name', 'tri_gram_domain_name'], axis=1, errors='ignore', inplace=True)
    
    # -- Process Resource Record Columns to Unique Counts --
    df['query_resource_record_type_count']  = df.get('query_resource_record_type', "").apply(parse_resource_record_counts)
    df['ans_resource_record_type_count']    = df.get('ans_resource_record_type', "").apply(parse_resource_record_counts)
    df['query_resource_record_class_count'] = df.get('query_resource_record_class', "").apply(parse_resource_record_counts)
    df['ans_resource_record_class_count']   = df.get('ans_resource_record_class', "").apply(parse_resource_record_counts)
    df.drop(['query_resource_record_type', 'ans_resource_record_type',
             'query_resource_record_class', 'ans_resource_record_class'],
            axis=1, errors='ignore', inplace=True)
    
    # -- Process character_distribution into vowel_count and consonant_count --
    vcount, ccount = parse_vowel_consonant_counts(df.iloc[0]['character_distribution'])
    df['vowel_count'] = vcount
    df['consonant_count'] = ccount
    df.drop('character_distribution', axis=1, inplace=True, errors='ignore')
    
    # -- Drop timestamp (it was used only for sorting) --
    if 'timestamp' in df.columns:
        df.drop('timestamp', axis=1, inplace=True)
    
    # -- Ensure All Expected Features Are Present in EXACT Order --
    # FINAL_ORDER is the full list of 27 features the model was trained on.
    FINAL_ORDER = [
        'dns_domain_name_length',
        'numerical_percentage',
        'character_entropy',
        'max_continuous_numeric_len',
        'max_continuous_alphabet_len',
        'vowels_consonant_ratio',
        'conv_freq_vowels_consonants',
        'packets_numbers',
        'receiving_packets_numbers',
        'sending_packets_numbers',
        'receiving_bytes',
        'sending_bytes',
        'distinct_ttl_values',
        'ttl_values_min',
        'ttl_values_max',
        'ttl_values_mean',
        'dns_top_level_domain_encoded',
        'dns_second_level_domain_encoded',
        'uni_gram_count',
        'bi_gram_count',
        'tri_gram_count',
        'query_resource_record_type_count',
        'ans_resource_record_type_count',
        'query_resource_record_class_count',
        'ans_resource_record_class_count',
        'vowel_count',
        'consonant_count'
    ]
    # Fill any missing expected columns with 0
    for col in FINAL_ORDER:
        if col not in df.columns:
            df[col] = 0
    # Reorder the DataFrame to match the training order
    df = df[FINAL_ORDER]
    
    # -- Convert to float32 and Scale --
    df = df.astype('float32', errors='ignore')
    # Reindex to exactly match the scaler's expected feature names, if available:
    # if hasattr(scaler, 'feature_names_in_'):
    #     expected_cols = list(scaler.feature_names_in_)
    #     df = df.reindex(columns=expected_cols, fill_value=0)
    df[:] = scaler.transform(df[:])
    
    # -- Replicate the Single Row to Create a Sequence --
    repeated = pd.concat([df]*sequence_length, ignore_index=True)
    final_array = repeated.values.reshape((1, sequence_length, repeated.shape[1]))
    
    return final_array

In [178]:
# ------------------------------------------------------------------------------
# 6. Preprocess All Sample Queries into a Batch
# ------------------------------------------------------------------------------
sequences = []
for query in sample_queries:
    seq = preprocess_single_dns_query(query, le_tld, le_sld, scaler, sequence_length=5)
    sequences.append(seq)
    
# Stack into a batch (shape: (10, sequence_length, num_features))
inference_batch = np.vstack(sequences)
print("Inference batch shape:", inference_batch.shape)

Inference batch shape: (12, 5, 27)


In [179]:
y_pred_probs = model.predict(inference_batch)
pred_idx = np.argmax(y_pred_probs, axis=1)[0]
pred_class_label = le_label.inverse_transform([pred_idx])[0]

print("Predicted class index:", pred_idx)
print("Predicted class label:", pred_class_label)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Predicted class index: 2
Predicted class label: Phishing
