# Neural Search applied to Proxy Logs

In [1]:
import os, sys
import pandas as pd 
import chromadb
#from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

## Create vector DB of malicious logs
We can use this vectorstore of known malicious logs as reference data to compare incoming logs against to determine if any of those events are similar enough to raise an alert.  

In [2]:
# This CSV file contains fabricated proxy logs that are examples of malicious activity attempts
df = pd.read_csv('proxy_logs_malicious.csv')
df.columns

Index(['ID', 'IP Address', 'Timestamp', 'Log Entry'], dtype='object')

In [3]:
df.sample(3)

Unnamed: 0,ID,IP Address,Timestamp,Log Entry
993,994,138.117.27.2,01/Aug/2024:20:28:25,"GET http://www.example.com/admin HTTP/1.1"" 404..."
940,941,125.61.182.56,01/Aug/2024:18:24:35,GET http://www.example.com/index.php?page=../....
129,130,224.86.188.235,01/Aug/2024:02:35:59,GET http://www.example.com/index.php?page=../....


In [4]:
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()
#client = chromadb.PersistentClient(path='./chromadb_proxy_logs')

In [5]:
# Create collection. get_collection, get_or_create_collection, delete_collection also available
# ChromaDB uses L2 (Euclidean distance) by default...we want Cosine metric.
# Cosine similarity -> higher = better
collection = client.get_or_create_collection(name='malicious_proxy_logs', metadata={"hnsw:space": "cosine"})



In [6]:
# Create lists of the necessary data from the dataframe
ID_list = df['ID'].astype(str).tolist()  # ID list, converted to string
LogEntry_list = df['Log Entry'].tolist()   # List of documents (log content)


In [7]:

# Add docs to the collection. Can also update and delete.
# We are letting ChromaDB automatically calculate the vector embedding, instead of explicitly handling it
# By default, ChromaDB uses all-MiniLM-L6-v2 sentence transformer model to calculate vector embeddings
# This all-MiniLM-L6-v2 model provides a 384 dimension vector that can be used for embedding and clustering
collection.add(
    documents=LogEntry_list, # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
    #metadatas=[{"source": "notion"}, {"source": "google-docs"}], # metadata filters
    ids=ID_list, # unique ID for each doc
)

In [9]:
# Examine a record by ID...tell it to show the vector embedding so we can see what it looks like
# The vector embedding is the 384 dimension numeric representation of what that text "means"...
collection.get('1', include=['embeddings', 'documents', 'metadatas'])
#collection.get(['1','2'], include=['embeddings', 'documents', 'metadatas'])

{'ids': ['1'],
 'embeddings': [array([ 7.62485852e-03,  4.53667305e-02, -4.74873595e-02,  6.38289424e-03,
          1.04712076e-01, -9.29494649e-02,  3.30195762e-03, -2.59210616e-02,
          5.31232208e-02, -9.98799801e-02, -1.34964027e-02,  2.52751410e-02,
          9.09934863e-02, -2.30148784e-03, -3.68964337e-02,  2.72639599e-02,
         -3.08614544e-05,  1.75895784e-02,  4.42130752e-02, -4.82321531e-02,
          9.13773328e-02,  3.76302861e-02,  4.21186276e-02, -8.69319122e-03,
         -7.84153715e-02, -4.20195386e-02, -3.71134132e-02, -1.43288216e-02,
          3.94372605e-02,  7.00980704e-03,  4.29793000e-02, -1.34540960e-01,
          1.49487145e-02, -1.01147126e-02,  6.74820086e-03, -2.62145475e-02,
          2.07935330e-02,  5.18131116e-03,  1.60162561e-02, -8.18299782e-03,
         -7.13483337e-03, -1.71514926e-03,  5.44389822e-02,  5.02826646e-02,
         -1.11683058e-02, -3.28837633e-02, -8.96301419e-02, -5.58557063e-02,
         -8.27203915e-02, -8.97455681e-03, -4.6

In [10]:
# Execute an ANN Query/search for K most similar results.
results = collection.query(
    query_texts=["http://www.example.com/../../etc/passwd"], # This gets vectorized and used for vector query
    n_results=3,
    # where_document={"$contains":"Macintosh"}  # optional keyword filter
    # where={"metadata_field": "is_equal_to_this"}, # optional metadata filter
)

results

{'ids': [['95', '660', '780']],
 'distances': [[0.37563520669937134, 0.3781334161758423, 0.37931931018829346]],
 'metadatas': [[None, None, None]],
 'embeddings': None,
 'documents': [['GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 400 1035268 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36',
   'GET http://www.example.com/../../etc/passwd HTTP/1.1" 500 322256 "http://www.example.com/../../etc/passwd" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
   'GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 500 334877 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36']],
 'uris': None

In [15]:
# Ask the same question, but this time in plain English!
results = collection.query(
    query_texts=["Can you show me possible attempts to change a password?"],  # Vectorize and search
    n_results=3,
    # where_document={"$contains":"script"}  # optional keyword filter
    # where={"metadata_field": "is_equal_to_this"}, # optional metadata filter
)

results

{'ids': [['730', '573', '153']],
 'distances': [[0.6075342893600464, 0.6118622422218323, 0.6151671409606934]],
 'metadatas': [[None, None, None]],
 'embeddings': None,
 'documents': [['GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 403 580156 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
   'GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 403 530538 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
   'GET http://www.example.com/index.php?page=../../../../../../etc/passwd HTTP/1.1" 403 190296 "http://www.example.com/index.php?page=../../../../../../etc/passwd" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, 

# Semantic matching of malicious attempts
We have a vector DB of malicious logs to execute neural searches against.  Now, let's feed it some fresh logs to see what matches we get.  We want to use cosine similarity for this.  We'll have to experiment a bit to set a reasonable threshold for when to return an alert vs not.

In [16]:
# Some fabricated proxy logs that contain 950 benign log entries and 50 malicious log entries
df = pd.read_csv('proxy_logs_mixed.csv')

In [17]:
df.sample(3)

Unnamed: 0,ID,IP Address,Timestamp,Log Entry
723,724,63.217.5.97,01/Aug/2024:12:01:28,"GET http://intranet.company.com/home HTTP/1.1""..."
34,35,235.6.25.54,01/Aug/2024:22:29:56,GET http://cloudstorage.com/authenticate HTTP/...
359,360,68.49.67.111,01/Aug/2024:17:21:18,"GET http://www.example.com/ HTTP/1.1"" 200 1017..."


In [18]:
incoming_proxy_logs = df['Log Entry']

In [19]:
incoming_proxy_logs

0      GET http://www.example.com/ HTTP/1.1" 200 4516...
1      GET http://intranet.company.com/home HTTP/1.1"...
2      GET http://update.software.com/check HTTP/1.1"...
3      GET http://update.software.com/version.xml HTT...
4      GET http://www.example.com/ HTTP/1.1" 200 6339...
                             ...                        
995    GET http://www.example.com/admin HTTP/1.1" 400...
996    GET http://www.example.com/admin HTTP/1.1" 403...
997    GET http://www.example.com/../../etc/passwd HT...
998    GET http://www.example.com/admin HTTP/1.1" 403...
999    GET http://www.example.com/search?q=<script>al...
Name: Log Entry, Length: 1000, dtype: object

In [20]:
# Find incoming log entries that match known malicious activity at a pre-determined threshold 
for log_entry in incoming_proxy_logs:
    results = collection.query(
    query_texts=log_entry,
    n_results=1,
    # where={"metadata_field": "is_equal_to_this"}, # optional filter
    # where_document={"$contains":"search_string"}  # optional filter
    )
    if results['distances'][0][0] >= 0.485:  # Threshold for a match
       print(f'Possible alert on: {log_entry} \n  -- Match within threshold on {results}')

Possible alert on: GET http://cloudstorage.com/files HTTP/1.1" 200 365923 "http://cloudstorage.com/files" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 
  -- Match within threshold on {'ids': [['251']], 'distances': [[0.4886305332183838]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['GET http://www.example.com/admin HTTP/1.1" 500 108615 "http://www.example.com/admin" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15']], 'uris': None, 'data': None}
Possible alert on: GET http://cloudstorage.com/files HTTP/1.1" 200 142231 "http://cloudstorage.com/files" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 
  -- Match within threshold on {'ids': [['251']], 'distances': [[0.4881707429885864]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['GET http://www.example

# Build vector DB of Benign Proxy log data

In [21]:
# To conserve memory, re-use the previous collection for new data
collection = client.get_or_create_collection(name='benign_proxy_logs', metadata={"hnsw:space": "cosine"})


In [22]:
# This CSV file contains fabricated proxy logs that are examples of benign activity
df = pd.read_csv('proxy_logs_good.csv')
df.sample(3)

Unnamed: 0,ID,IP Address,Timestamp,Log Entry
311,312,135.245.131.83,01/Aug/2024:22:49:43,"GET http://update.software.com/check HTTP/1.1""..."
40,41,60.193.62.232,01/Aug/2024:16:31:31,"GET http://update.software.com/check HTTP/1.1""..."
364,365,158.184.144.124,01/Aug/2024:05:55:31,GET http://cloudstorage.com/authenticate HTTP/...


In [23]:
# Create lists of the necessary data from the dataframe
ID_list = df['ID'].astype(str).tolist()  # ID list, converted to string
LogEntry_list = df['Log Entry'].tolist()   # List of documents (log content)

In [24]:
# Add log records to vector store, allowing ChromaDB to calculate vector embeddings
collection.add(
    documents=LogEntry_list, # we handle tokenization, embedding, and indexing automatically. 
    #metadatas=[{"source": "notion"}, {"source": "google-docs"}], # metadata filters
    ids=ID_list, # unique ID for each doc
)

In [25]:
# Check a few records...
collection.get(['1','2','3'])


{'ids': ['1', '2', '3'],
 'embeddings': None,
 'metadatas': [None, None, None],
 'documents': ['GET http://www.example.com/ HTTP/1.1" 200 140321 "http://www.example.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
  'GET http://update.software.com/check HTTP/1.1" 200 943292 "http://update.software.com/check" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
  'GET http://www.example.com/ HTTP/1.1" 200 813021 "http://www.example.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'],
 'uris': None,
 'data': None}

# Vector search for Anomaly Detection
With a vector DB of known benign/good proxy log entries, we can do semantic comparison to flag incoming log entries that are too different from what we know to be benign logs. 

In [26]:
# We'll use fabricated proxy logs that contain 950 benign log entries and 50 malicious log entries again
df = pd.read_csv('proxy_logs_mixed.csv')
df.sample(3)

Unnamed: 0,ID,IP Address,Timestamp,Log Entry
294,295,57.76.176.91,01/Aug/2024:16:22:03,"GET http://intranet.company.com/home HTTP/1.1""..."
272,273,133.160.229.254,01/Aug/2024:02:47:15,"GET http://www.example.com/script.js HTTP/1.1""..."
855,856,200.162.29.102,01/Aug/2024:02:47:13,"GET http://intranet.company.com/home HTTP/1.1""..."


In [27]:
# Grab just the log entry iself
incoming_proxy_logs = df['Log Entry']
incoming_proxy_logs

0      GET http://www.example.com/ HTTP/1.1" 200 4516...
1      GET http://intranet.company.com/home HTTP/1.1"...
2      GET http://update.software.com/check HTTP/1.1"...
3      GET http://update.software.com/version.xml HTT...
4      GET http://www.example.com/ HTTP/1.1" 200 6339...
                             ...                        
995    GET http://www.example.com/admin HTTP/1.1" 400...
996    GET http://www.example.com/admin HTTP/1.1" 403...
997    GET http://www.example.com/../../etc/passwd HT...
998    GET http://www.example.com/admin HTTP/1.1" 403...
999    GET http://www.example.com/search?q=<script>al...
Name: Log Entry, Length: 1000, dtype: object

In [30]:
# Let's identify anomalies in the incoming logs by using vector search against DB of known benign logs
for log_entry in incoming_proxy_logs:
    results = collection.query(
    query_texts=log_entry,
    n_results=1,
    # where={"metadata_field": "is_equal_to_this"}, # optional filter
    # where_document={"$contains":"search_string"}  # optional filter
    )
    if results['distances'][0][0] <= 0.0001:  # Threshold for a non-match
       print(f'Possible anomaly on: {log_entry} \n  -- Different within threshold on {results}')

Possible anomaly on: GET http://intranet.company.com/docs/document.pdf HTTP/1.1" 200 937423 "http://intranet.company.com/docs/document.pdf" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15 
  -- Different within threshold on {'ids': [['250']], 'distances': [[9.751319885253906e-05]], 'metadatas': [[None]], 'embeddings': None, 'documents': [['GET http://intranet.company.com/docs/document.pdf HTTP/1.1" 200 787396 "http://intranet.company.com/docs/document.pdf" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15']], 'uris': None, 'data': None}
Possible anomaly on: GET http://cloudstorage.com/authenticate HTTP/1.1" 200 596569 "http://cloudstorage.com/authenticate" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15 
  -- Different within threshold on {'ids': [['597']], 'distances': [[