# Fine-tuning a pre-built Doc2Vec Model With DNS Log Data
1/13/2025, Dave Sisk, https://github.com/davidcsisk, https://www.linkedin.com/in/davesisk-doctordatabase/

In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd

In [2]:
# Source data: DNS log from https://www.secrepo.com/maccdc2012/dns.log.gz
# More sample datasets are listed at https://www.secrepo.com
# Helpful reference notebook:
# https://github.com/cyberdefendersprogram/MachineLearning/blob/master/Data_analysis/Network%20analysis/dns%20analysis.ipynb

# Read the headerless, tab-separated log, discard the columns that are not used
# here, and give the six remaining columns readable names.
# Observation from exploring the raw file: col19 = 1 seems to be all NB query types.
df = (
    pd.read_csv('dns.log.gz', compression='gzip', sep='\t', header=None)
    .drop(columns=[0, 1, 2, 3, 4, 5, 7, 9, 11, 13, 15, 16, 17, 18, 19, 20, 21])
    .rename(columns={
        6: 'protocol',
        8: 'resolved_address',
        10: 'class',
        12: 'query_type',
        14: 'response',
        22: 'rejected',
    })
)

# The 'rejected' column is deliberately left as raw T / F values. Mapping it to
# something more human / model readable (T -> 'rejected', F -> 'accepted') would
# mean applying the same mapping to every DNS record examined later.
#df['rejected'] = df['rejected'].map({'T': 'rejected', 'F': 'accepted'})

df.shape


(427935, 6)

DNS record types and descriptions can be found here:
https://en.wikipedia.org/wiki/List_of_DNS_record_types

In [3]:
# Remove the column display limit so every column of the preview is shown
pd.options.display.max_columns = None

# Show five randomly chosen rows (no seed is set, so the rows differ per run)
df.sample(n=5)
# Alternative previews kept from exploration:
#df[df['rejected'] != 'T'].sample(5)
#df[df['class'] != 'C_INTERNET'].sample(5)

Unnamed: 0,protocol,resolved_address,class,query_type,response,rejected
84415,udp,teredo.ipv6.microsoft.com,C_INTERNET,A,-,F
172967,udp,www.download.windowsupdate.com,C_INTERNET,A,-,F
297140,udp,www.apple.com,C_INTERNET,AAAA,-,F
257646,udp,EWREP1,C_INTERNET,NB,-,F
44237,udp,LAPTOP1-PC,C_INTERNET,NB,-,F


In [4]:
# Partition the DNS log into training and test data: 427935 rows -> 400000 training + 27935 test
df_train = df.sample(n=400000, random_state=96)  # 400K rows drawn at random with a fixed seed
df_test = df.drop(index=df_train.index)  # the ~27K rows that were not drawn for training

# Write both partitions to CSV without the index column
for frame, csv_path in ((df_train, 'dns-log_training-data.csv'),
                        (df_test, 'dns-log_test-data.csv')):
    frame.to_csv(csv_path, index=False)

print('Train data: ', df_train.shape, ' Test data: ', df_test.shape)

Train data:  (400000, 6)  Test data:  (27935, 6)


In [5]:
def preprocess_dns_row(row):
    """Turn one DNS log row into a list of lowercase tokens.

    Every value in ``row`` is converted with ``str``, the results are joined
    with single spaces, lowercased, and split on whitespace.

    Args:
        row: An iterable of column values for a single record.

    Returns:
        list[str]: The lowercase, whitespace-delimited tokens.
    """
    return ' '.join(str(value) for value in row).lower().split()

# Create one TaggedDocument per training row, tagged "dns_<row index label>".
# itertuples(name=None) yields plain (label, value, value, ...) tuples, which
# avoids building a pandas Series for each of the 400K rows as iterrows() does.
tagged_dns_logs = [
    TaggedDocument(words=preprocess_dns_row(values), tags=[f"dns_{label}"])
    for label, *values in df_train.itertuples(index=True, name=None)
]


In [7]:
# Fine-tune the generic DM model on the DNS training documents
# (runs for around 5 mins with 400K rows of DNS data)

# Learning rates used for fine-tuning
fine_tuning_alpha = 0.001  # starting learning rate, kept low for fine-tuning
fine_tuning_min_alpha = 0.0005  # ending (minimum) learning rate

# Start from the pre-trained model stored on disk
model = Doc2Vec.load("doc2vec_wikipedia_dm.model")

# Extend the model's vocabulary with the DNS logs (optional if vocab is complete)
model.build_vocab(tagged_dns_logs, update=True)

# Continue training on the DNS logs for 10 epochs
model.train(
    tagged_dns_logs,
    total_examples=len(tagged_dns_logs),
    epochs=10,
    start_alpha=fine_tuning_alpha,
    end_alpha=fine_tuning_min_alpha,
)

# Write the fine-tuned model to its own file
model.save("doc2vec_wikipedia_dm_fine-tuned-on-dns.model")

In [8]:
# Fine-tune the generic DBOW model on the DNS training documents
# (runs for around 5 mins with 400K rows of DNS data)

# Learning rates used for fine-tuning
fine_tuning_alpha = 0.001  # starting learning rate, kept low for fine-tuning
fine_tuning_min_alpha = 0.0005  # ending (minimum) learning rate

# Start from the pre-trained model stored on disk
model2 = Doc2Vec.load("doc2vec_wikipedia_dbow.model")

# Extend the model's vocabulary with the DNS logs (optional if vocab is complete)
model2.build_vocab(tagged_dns_logs, update=True)

# Continue training on the DNS logs for 10 epochs
model2.train(
    tagged_dns_logs,
    total_examples=len(tagged_dns_logs),
    epochs=10,
    start_alpha=fine_tuning_alpha,
    end_alpha=fine_tuning_min_alpha,
)

# Write the fine-tuned model to its own file
model2.save("doc2vec_wikipedia_dbow_fine-tuned-on-dns.model")