# Neural Search applied to Proxy Logs

In [None]:
# python -v venv venv
# venv\Scripts\Activate.ps1
# pip install pandas
# pip install chromadb
# From https://github.com/chroma-core/chroma

In [44]:
import os, sys
import pandas as pd 
import chromadb
#from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

In [45]:
df = pd.read_csv('proxy_logs_malicious.csv')
df.columns

Index(['ID', 'IP Address', 'Timestamp', 'Log Entry'], dtype='object')

In [46]:
# Add a source to the data, so that it looks a bit more like data that would come
# out of Splunk. 
df.insert(2, "Source", "fake_source")

In [47]:
df.sample(3)

Unnamed: 0,ID,IP Address,Source,Timestamp,Log Entry
407,408,33.138.50.79,fake_source,01/Aug/2024:21:19:16,GET http://www.example.com/index.php?page=../....
583,584,55.10.207.21,fake_source,01/Aug/2024:10:14:14,"GET http://www.example.com/admin HTTP/1.1"" 404..."
595,596,93.193.223.143,fake_source,01/Aug/2024:21:56:32,GET http://www.example.com/login.php?user=admi...


In [48]:
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()
#client = chromadb.PersistentClient(path='./chromadb_proxy_logs')

In [49]:
# Create collection. get_collection, get_or_create_collection, delete_collection also available
collection = client.get_or_create_collection(name='malicious_proxy_logs')

In [51]:
# Create lists of the necessary data from the dataframe
ID_list = df['ID'].astype(str).tolist()  # ID list

Source_list = df['Source']  # List of sources
metadata = ['{"source": "' + source + '"}' for source in Source_list]  # List of dictionaries with sources

LogEntry_list = df['Log Entry'].tolist()   # List of documents (log content)


In [55]:

# Add docs to the collection. Can also update and delete. Row-based API coming soon!
collection.add(
    documents=LogEntry_list, # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
    #metadatas=metadata,
    #metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
    ids=ID_list, # unique ID for each doc
)

Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3
Insert of existing embedding ID: 4
Insert of existing embedding ID: 5
Insert of existing embedding ID: 6
Insert of existing embedding ID: 7
Insert of existing embedding ID: 8
Insert of existing embedding ID: 9
Insert of existing embedding ID: 10
Insert of existing embedding ID: 11
Insert of existing embedding ID: 12
Insert of existing embedding ID: 13
Insert of existing embedding ID: 14
Insert of existing embedding ID: 15
Insert of existing embedding ID: 16
Insert of existing embedding ID: 17
Insert of existing embedding ID: 18
Insert of existing embedding ID: 19
Insert of existing embedding ID: 20
Insert of existing embedding ID: 21
Insert of existing embedding ID: 22
Insert of existing embedding ID: 23
Insert of existing embedding ID: 24
Insert of existing embedding ID: 25
Insert of existing embedding ID: 26
Insert of existing embedding ID: 27
Insert of existing embedding ID: 28
I

In [56]:
# Examine a few records by ID
collection.get('1', include=['embeddings', 'documents', 'metadatas'])
#collection.get(['1','2'], include=['embeddings', 'documents', 'metadatas'])

{'ids': ['1'],
 'embeddings': [array([ 7.62485852e-03,  4.53667305e-02, -4.74873595e-02,  6.38289424e-03,
          1.04712076e-01, -9.29494649e-02,  3.30195762e-03, -2.59210616e-02,
          5.31232208e-02, -9.98799801e-02, -1.34964027e-02,  2.52751410e-02,
          9.09934863e-02, -2.30148784e-03, -3.68964337e-02,  2.72639599e-02,
         -3.08614544e-05,  1.75895784e-02,  4.42130752e-02, -4.82321531e-02,
          9.13773328e-02,  3.76302861e-02,  4.21186276e-02, -8.69319122e-03,
         -7.84153715e-02, -4.20195386e-02, -3.71134132e-02, -1.43288216e-02,
          3.94372605e-02,  7.00980704e-03,  4.29793000e-02, -1.34540960e-01,
          1.49487145e-02, -1.01147126e-02,  6.74820086e-03, -2.62145475e-02,
          2.07935330e-02,  5.18131116e-03,  1.60162561e-02, -8.18299782e-03,
         -7.13483337e-03, -1.71514926e-03,  5.44389822e-02,  5.02826646e-02,
         -1.11683058e-02, -3.28837633e-02, -8.96301419e-02, -5.58557063e-02,
         -8.27203915e-02, -8.97455681e-03, -4.6

In [61]:
# ANN Query/search K most similar results. You can also .get by id
results = collection.query(
    query_texts=["http://www.example.com/../../etc/passwd"],
    n_results=3,
    # where={"metadata_field": "is_equal_to_this"}, # optional filter
    # where_document={"$contains":"search_string"}  # optional filter
)

results

{'ids': [['95', '660', '780']],
 'distances': [[0.7512704133987427, 0.7562668323516846, 0.7586382627487183]],
 'metadatas': [[None, None, None]],
 'embeddings': None,
 'documents': [['GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 400 1035268 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36',
   'GET http://www.example.com/../../etc/passwd HTTP/1.1" 500 322256 "http://www.example.com/../../etc/passwd" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
   'GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 500 334877 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36']],
 'uris': None,


We have a vector DB of malicious logs to execute neural searches against.  Now, let's feed it some fresh logs.

In [72]:
df = pd.read_csv('proxy_logs_mixed.csv')

In [74]:
df.sample(3)

Unnamed: 0,ID,IP Address,Timestamp,Log Entry
529,530,173.189.229.255,01/Aug/2024:20:05:36,GET http://intranet.company.com/docs/document....
252,253,143.237.65.175,01/Aug/2024:15:07:38,"GET http://cloudstorage.com/login HTTP/1.1"" 20..."
937,938,148.181.184.81,01/Aug/2024:15:31:02,"GET http://cloudstorage.com/files HTTP/1.1"" 20..."
