# Unsupervised Labeling of Test Data
This is an exercise in labeling DNS log test data without using vector embeddings. I one-hot encoded the data to get a matrix of 0s and 1s, applied PCA dimensionality reduction to that matrix, trained an isolation forest model on the reduced data, and finally used that model to label each row.

In [5]:
# Done with PCA dimensionality reduction for the one-hot encoding
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

# Load the DNS log to be labeled. Every column of the frame is passed to the
# one-hot encoder below, so each distinct value in each column becomes its
# own 0/1 feature.
file_path = "dns-log_test-data.csv"  # Replace with your file path
dns_log_data = pd.read_csv(file_path)

# Step 1: One-hot encode the categorical features.
# sparse_output=False returns a dense array instead of a SciPy sparse matrix.
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(dns_log_data)

# Step 2: Dimensionality reduction with PCA.
# PCA raises ValueError when n_components exceeds min(n_samples, n_features),
# so cap the target of 50 components for small or low-cardinality inputs.
n_components = min(50, *encoded_features.shape)
pca = PCA(n_components=n_components, random_state=42)
reduced_features = pca.fit_transform(encoded_features)

# Step 3: Fit an Isolation Forest on the reduced features.
# random_state is fixed so repeated runs produce the same labels.
iforest = IsolationForest(
    n_estimators=100,
    max_samples=1000,
    contamination='auto',
    random_state=42,
)
iforest.fit(reduced_features)

# Step 4: Attach the model output to the original rows.
# 'raw_score' is the continuous decision-function value (negative values are
# the ones flagged as outliers; lower means more abnormal).
# NOTE: despite its name, 'anomaly_score' holds the discrete label
# (-1 for anomaly, 1 for normal); the name is kept so the output files stay
# compatible with anything that already reads them.
dns_log_data['raw_score'] = iforest.decision_function(reduced_features)
dns_log_data['anomaly_score'] = iforest.predict(reduced_features)

# Step 5: Keep only the rows flagged as anomalies.
anomalies = dns_log_data[dns_log_data['anomaly_score'] == -1]

# Optional: save the flagged subset and the fully labeled dataset.
anomalies.to_csv("dns-log_test-data_labeled-anomalies-only.csv", index=False)
dns_log_data.to_csv("dns-log_test-data_labeled.csv", index=False)

# Print summary
print(f"Number of anomalies detected: {anomalies.shape[0]}")
print("Sample anomalies:")
print(anomalies.head())


Number of anomalies detected: 140
Sample anomalies:
    protocol resolved_address    class query_type response rejected  \
588      udp     version.bind  C_CHAOS        TXT   NOTIMP        T   
592      udp     version.bind  C_CHAOS        TXT        -        F   
602      udp     version.bind  C_CHAOS        TXT   NOTIMP        T   
616      udp     version.bind  C_CHAOS        TXT  NOERROR        F   
618      udp     version.bind  C_CHAOS        TXT  NOERROR        F   

     raw_score  anomaly_score  
588  -0.088306             -1  
592  -0.054408             -1  
602  -0.088306             -1  
616  -0.063306             -1  
618  -0.063306             -1  
