In [None]:
# Install required packages (run in Colab cell)
!pip install -q huggingface_hub joblib pandas scikit-learn


In [None]:
from huggingface_hub import hf_hub_download
import joblib, json, os

repo_id = "dr-rakshith-truth-zeeker/truth-zeeker-ai-demo"   # <-- confirm this
model_filename = "model_20251020.joblib"                    # <-- confirm this

# Fetch the serialized model from the Hugging Face Hub; the call returns the
# local path of the downloaded file. An HF_TOKEN environment variable, when set
# and non-empty, is forwarded so private repos work; otherwise the library
# default authentication behaviour applies.
print("Downloading HF model...")
hf_path = hf_hub_download(
    repo_id=repo_id,
    filename=model_filename,
    repo_type="model",
    token=os.environ.get("HF_TOKEN") or None,
)
print("Downloaded to:", hf_path)

# Inspect
# SECURITY: joblib.load unpickles the file and can execute arbitrary code.
# Only load artifacts from a repository you trust.
obj = joblib.load(hf_path)
print("\n--- Hugging Face model info ---")
if isinstance(obj, dict):
    print("type: dict")
    print("keys:", list(obj.keys()))
    print("features:", obj.get("features"))
    # Every Python value is an `object`, so the only meaningful check is "not None".
    print("has pipeline:", obj.get("pipeline") is not None)
else:
    print("Loaded object type:", type(obj))
    # if it's a sklearn pipeline directly
    # Best-effort diagnostic: report a lookup failure instead of hiding it.
    try:
        fitted_names = getattr(obj, "feature_names_in_", None)
    except Exception as exc:
        fitted_names = f"unavailable ({exc})"
    print("If sklearn pipeline, get feature_names attribute (if present):", fitted_names)

# Keep the model/pipeline in memory for next cells:
if isinstance(obj, dict) and obj.get("pipeline") is not None:
    pipeline = obj["pipeline"]
    # .get() so a dict without a "features" entry yields None instead of a KeyError
    features = obj.get("features")
elif hasattr(obj, "predict"):
    # The artifact is the estimator itself; no explicit feature list is bundled.
    pipeline = obj
    features = None
else:
    raise RuntimeError("Model structure unexpected - let me know the printed output above.")


Downloading HF model...
Downloaded to: /root/.cache/huggingface/hub/models--dr-rakshith-truth-zeeker--truth-zeeker-ai-demo/snapshots/9d56c1ead5c64f2024c9dd59a06ca939e8c2a986/model_20251020.joblib

--- Hugging Face model info ---
type: dict
keys: ['pipeline', 'features']
features: ['conn_count', 'total_orig_bytes', 'total_resp_bytes', 'avg_duration', 'anomaly_score']
has pipeline: True


In [None]:
# Fallback: download CSV from HF and print simple text output (no JS interactive display)
import pandas as pd
from huggingface_hub import hf_hub_download
import os

repo_id = "dr-rakshith-truth-zeeker/truth-zeeker-ai-demo"
csv_name = "zeek_features_for_training_pseudo.csv"

# If the repo/file is private you must pass the token (set it below or in Colab secrets)
HF_TOKEN = os.environ.get("HF_TOKEN", None)   # set this env var in Colab if needed

try:
    # `token` replaces the deprecated `use_auth_token` keyword. An unset or empty
    # HF_TOKEN is passed as None, which selects the library's default behaviour,
    # so a single call covers both the public and the private case.
    csv_path = hf_hub_download(
        repo_id=repo_id,
        filename=csv_name,
        repo_type="model",
        token=HF_TOKEN or None,
    )
except Exception as e:
    # Chain explicitly so the original traceback stays attached to the new error.
    raise RuntimeError(f"Could not fetch CSV from HF: {e}") from e
print("✅ Downloaded CSV path:", csv_path)

# `df` is kept in memory; the inference cell reads it.
df = pd.read_csv(csv_path)
print("Loaded shape:", df.shape)
print("\nFirst 10 rows (text):\n")
print(df.head(10).to_string(index=False))


✅ Downloaded CSV path: /root/.cache/huggingface/hub/models--dr-rakshith-truth-zeeker--truth-zeeker-ai-demo/snapshots/c1084d6ef2317cf7e4b5762654e7eb965f4afedf/zeek_features_for_training_pseudo.csv
Loaded shape: (16, 9)

First 10 rows (text):

 src_ip  conn_count  total_orig_bytes  total_resp_bytes  unique_dst_ports  avg_duration  total_orig_pkts  total_resp_pkts  anomaly_score
 host_1           1               0.0               0.0                 1      0.000000                1                0      -0.394479
 host_2           1               0.0               0.0                 1      0.000000                1                0      -0.394479
 host_3           1             150.0               0.0                 1      1.502042                3                0      -0.128259
 host_4           1               0.0               0.0                 1      0.000000                1                0      -0.394479
 host_5           1               0.0               0.0                 1

In [None]:
import os
import joblib
import pandas as pd
from huggingface_hub import hf_hub_download

# --- Config ---
repo_id = "dr-rakshith-truth-zeeker/truth-zeeker-ai-demo"
model_filename = "model_20251020.joblib"   # the HF model you uploaded earlier
HF_TOKEN = os.environ.get("HF_TOKEN", None)
out_path = "/content/zeek_inference_output.csv"

# --- Download model from HF ---
try:
    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=model_filename,
        repo_type="model",
        # `token` replaces the deprecated `use_auth_token`; empty string -> None
        token=HF_TOKEN or None,
    )
except Exception as e:
    raise RuntimeError(f"❌ Could not download model: {e}") from e
print(f"✅ Model downloaded from HF: {model_path}")

# --- Load model ---
# SECURITY: joblib.load unpickles the file and can execute arbitrary code.
# Only load artifacts from a repository you trust.
model = joblib.load(model_path)
print(f"\nLoaded model type: {type(model)}")

if isinstance(model, dict):
    pipeline = model.get("pipeline")
    features = model.get("features")
else:
    pipeline = model
    features = None

# A bare estimator ships without an explicit feature list; fall back to the
# column names it recorded at fit time, when it has any.
if features is None and pipeline is not None:
    fitted_names = getattr(pipeline, "feature_names_in_", None)
    if fitted_names is not None:
        features = list(fitted_names)

print(f"Model features: {features}")

if pipeline is None:
    # Nothing to run: do not write an output file that would look like results.
    print("❌ No valid pipeline found in model.")
elif not features:
    raise RuntimeError(
        "Model does not declare its feature columns; cannot select inputs for inference."
    )
else:
    # --- Ensure required columns exist ---
    # `df` is the DataFrame loaded by the CSV-download cell; run that cell first.
    for f in features:
        if f not in df.columns:
            print(f"⚠️ Missing feature column in CSV: {f}")
            df[f] = 0  # filler if needed

    # --- Run inference ---
    preds = pipeline.predict(df[features])
    df["anomaly_flag"] = preds
    print("\n✅ Inference complete. Sample output:")
    print(df.head(10).to_string(index=False))

    # --- Save output CSV ---
    # Written only after a successful predict so the file always holds predictions.
    df.to_csv(out_path, index=False)
    print(f"\n📁 Saved inference results to: {out_path}")


✅ Model downloaded from HF: /root/.cache/huggingface/hub/models--dr-rakshith-truth-zeeker--truth-zeeker-ai-demo/snapshots/c1084d6ef2317cf7e4b5762654e7eb965f4afedf/model_20251020.joblib

Loaded model type: <class 'dict'>
Model features: ['conn_count', 'total_orig_bytes', 'total_resp_bytes', 'avg_duration', 'anomaly_score']

✅ Inference complete. Sample output:
 src_ip  conn_count  total_orig_bytes  total_resp_bytes  unique_dst_ports  avg_duration  total_orig_pkts  total_resp_pkts  anomaly_score  anomaly_flag
 host_1           1               0.0               0.0                 1      0.000000                1                0      -0.394479             1
 host_2           1               0.0               0.0                 1      0.000000                1                0      -0.394479             1
 host_3           1             150.0               0.0                 1      1.502042                3                0      -0.128259             1
 host_4           1               

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [13]:
# Create a horizontal bar chart (JS-free) and save to file
import pandas as pd
import matplotlib.pyplot as plt

# load results (from your previous cell)
df = pd.read_csv("/content/zeek_inference_output.csv")

# Decide how to rank anomalies:
# - If your model gives an 'anomaly_score' where larger means more anomalous:
#     sort by 'anomaly_score' descending
# - If the pipeline uses IsolationForest (-1 == anomaly), use anomaly_flag == -1
# Here we'll pick top rows by anomaly_score (change if needed).
# NOTE(review): the direction of 'anomaly_score' is not established in this
# notebook - if lower values mean "more anomalous", flip `ascending` and the
# axis label below. Confirm against how the score was produced.
if "anomaly_score" in df.columns:
    value_col = "anomaly_score"
    x_label = "Anomaly score (higher = more anomalous)"
    top = df.sort_values(by=value_col, ascending=False).head(10)
elif "anomaly_flag" in df.columns:
    value_col = "anomaly_flag"
    # The bars show flags here, so the axis must not claim to show scores.
    x_label = "Anomaly flag (-1 = anomaly)"
    top = df[df[value_col] == -1].head(10)
else:
    # Without either column there is nothing to plot; fail with a clear message.
    raise KeyError(
        "Expected an 'anomaly_score' or 'anomaly_flag' column in the inference output."
    )

# pick label for y-axis if available (hostnames) else use index
if "src_ip" in top.columns:
    labels = top["src_ip"].astype(str)
elif "host" in top.columns:
    labels = top["host"].astype(str)
else:
    labels = top.index.astype(str)

scores = top[value_col]

# Bars are placed at numeric positions and labelled afterwards, so rows that
# share a label still get one bar each instead of being drawn on top of each other.
positions = list(range(len(top)))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(positions, scores.to_numpy())
ax.set_yticks(positions)
ax.set_yticklabels(list(labels))
ax.set_xlabel(x_label)
ax.set_title("Top anomalous source hosts (example)")
ax.invert_yaxis()  # biggest at top
fig.tight_layout()

out_png = "/content/top_anomalies.png"
fig.savefig(out_png)
print(f"Saved chart to: {out_png}")
plt.close(fig)


Saved chart to: /content/top_anomalies.png
