# Neural Search applied to Proxy Logs

In [16]:
# python -m venv venv
# venv\Scripts\Activate.ps1
# pip install pandas
# pip install chromadb
# From https://github.com/chroma-core/chroma

In [2]:
import os, sys
import pandas as pd 
import chromadb

In [39]:
# Read the sample malicious proxy logs into a DataFrame, then list its column names.
df = pd.read_csv(filepath_or_buffer='proxy_logs_malicious.csv')
df.columns

Index(['ID', 'IP Address', 'Timestamp', 'Log Entry'], dtype='object')

In [40]:
# Peek at three randomly chosen rows of the raw log data.
df.sample(n=3)

Unnamed: 0,ID,IP Address,Timestamp,Log Entry
428,429,50.135.210.214,01/Aug/2024:14:15:16,GET http://www.example.com/search?q=<script>al...
323,324,228.228.202.131,01/Aug/2024:01:23:54,GET http://www.example.com/index.php?page=../....
539,540,24.215.31.65,01/Aug/2024:04:40:30,"GET http://www.example.com/admin HTTP/1.1"" 500..."


In [41]:
# Add a sourcetype and source to the data, so that it looks a bit more like data that would come
# out of Splunk.
# DataFrame.insert raises ValueError when the column already exists, so each insert is
# guarded to keep this cell safe to re-run without reloading the CSV first.
if "Sourcetype" not in df.columns:
    df.insert(1, "Sourcetype", "fabricated_data")  # constant value, placed right after 'ID'
if "Source" not in df.columns:
    df.insert(2, "Source", "fake_source")          # constant value, placed after 'Sourcetype'

In [54]:
# Spot-check three random rows to confirm the Sourcetype and Source columns are present.
df.sample(n=3)

Unnamed: 0,ID,Sourcetype,Source,IP Address,Timestamp,Log Entry
868,869,fabricated_data,fake_source,192.12.149.234,01/Aug/2024:02:01:13,"GET http://www.example.com/admin HTTP/1.1"" 404..."
768,769,fabricated_data,fake_source,30.210.242.220,01/Aug/2024:04:22:47,GET http://www.example.com/index.php?page=../....
778,779,fabricated_data,fake_source,119.156.234.12,01/Aug/2024:06:54:27,GET http://www.example.com/login.php?user=admi...


In [6]:
# Set up an in-memory Chroma client, which is convenient for quick prototyping.
# To keep the data on disk instead, swap in the PersistentClient line below.
client = chromadb.Client()
#client = chromadb.PersistentClient(path='./chromadb_proxy_logs')

In [7]:
# Get the 'malicious_proxy_logs' collection, creating it if it does not exist yet.
# The client also offers get_collection and delete_collection.
collection = client.get_or_create_collection(name='malicious_proxy_logs')

In [86]:
# Create plain Python lists of the necessary data from the dataframe.
# Every column is converted with .tolist() so that each *_list name really is a list
# (previously Sourcetype_list and Source_list were left as pandas Series).
ID_list = df['ID'].astype(str).tolist()      # IDs cast to strings, used as the document ids
Sourcetype_list = df['Sourcetype'].tolist()
# TODO: build Sourcetype_dict, a metadata list of dicts (e.g. with a list comprehension)
Source_list = df['Source'].tolist()
# TODO: build Source_dict, a metadata list of dicts (e.g. with a list comprehension)
LogEntry_list = df['Log Entry'].tolist()     # List of documents (log content)

In [88]:
# Load the log entries into the collection, keyed by their string IDs.
# Chroma handles tokenization, embedding, and indexing of the documents automatically;
# you can skip that and supply your own embeddings. Entries can also be updated and deleted.
collection.add(
    ids=ID_list,              # unique ID for each doc
    documents=LogEntry_list,  # raw log text to be embedded
    # metadatas=[{"source": "notion"}, {"source": "google-docs"}],  # optional; filter on these!
)

In [106]:
# Look up a couple of stored records directly by their IDs.
# A single ID string (e.g. 'doc1') can be passed instead of a list.
collection.get(ids=['1', '2'])

{'ids': ['1', '2'],
 'embeddings': None,
 'metadatas': [None, None],
 'documents': ['GET http://www.example.com/search?q=<script>alert(\'xss\')</script> HTTP/1.1" 403 1035845 "http://www.example.com/search?q=<script>alert(\'xss\')</script>" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36',
  'GET http://www.example.com/login.php?user=admin\'-- HTTP/1.1" 403 616857 "http://www.example.com/login.php?user=admin\'--" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'],
 'uris': None,
 'data': None}

In [107]:
# Approximate-nearest-neighbour search: retrieve the K stored log entries most similar
# to the query text. Records can also be fetched by id with .get.
results = collection.query(
    n_results=3,  # K: number of closest matches to return
    query_texts=["http://www.example.com/../../etc/passwd"],
    # query_texts=["This is a query document"],
    # where={"metadata_field": "is_equal_to_this"},   # optional metadata filter
    # where_document={"$contains": "search_string"},  # optional document-content filter
)

results

{'ids': [['95', '780', '207']],
 'distances': [[0.7512704133987427, 0.7586382627487183, 0.7596009373664856]],
 'metadatas': [[None, None, None]],
 'embeddings': None,
 'documents': [['GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 400 1035268 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36',
   'GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 500 334877 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36',
   'GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 400 839738 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chro