# Imports and Declarations

In [None]:
import pandas as pd
import numpy as np
import regex as re
import faiss
from sentence_transformers import SentenceTransformer
import os
from openai import OpenAI
from anthropic import Anthropic
from config import openai_key, claude_key
import json

In [None]:
# HTTP status code -> reason phrase. The loop below rewrites every value
# in place to "<code> [<class>] <phrase>".
status2desc_dict = {
    # 1xx
    100: "Continue",
    101: "Switching Protocols",
    102: "Processing",
    103: "Early Hints",
    # 2xx
    200: "OK",
    201: "Created",
    202: "Accepted",
    203: "Non-Authoritative Information",
    204: "No Content",
    205: "Reset Content",
    206: "Partial Content",
    207: "Multi-Status",
    208: "Already Reported",
    226: "IM Used",
    # 3xx
    300: "Multiple Choices",
    301: "Moved Permanently",
    302: "Found",
    303: "See Other",
    304: "Not Modified",
    305: "Use Proxy",
    306: "(Unused)",
    307: "Temporary Redirect",
    308: "Permanent Redirect",
    # 4xx
    400: "Bad Request",
    401: "Unauthorized",
    402: "Payment Required",
    403: "Forbidden",
    404: "Not Found",
    405: "Method Not Allowed",
    406: "Not Acceptable",
    407: "Proxy Authentication Required",
    408: "Request Timeout",
    409: "Conflict",
    410: "Gone",
    411: "Length Required",
    412: "Precondition Failed",
    413: "Payload Too Large",
    414: "URI Too Long",
    415: "Unsupported Media Type",
    416: "Range Not Satisfiable",
    417: "Expectation Failed",
    418: "I'm a teapot",
    421: "Misdirected Request",
    422: "Unprocessable Entity",
    423: "Locked",
    424: "Failed Dependency",
    425: "Too Early",
    426: "Upgrade Required",
    428: "Precondition Required",
    429: "Too Many Requests",
    431: "Request Header Fields Too Large",
    451: "Unavailable For Legal Reasons",
    # 5xx
    500: "Internal Server Error",
    501: "Not Implemented",
    502: "Bad Gateway",
    503: "Service Unavailable",
    504: "Gateway Timeout",
    505: "HTTP Version Not Supported",
    506: "Variant Also Negotiates",
    507: "Insufficient Storage",
    508: "Loop Detected",
    510: "Not Extended",
    511: "Network Authentication Required",
}

# (lower bound on code / 100, class label), checked from highest to lowest.
_STATUS_CLASS_FLOORS = (
    (5, "server error"),
    (4, "client error"),
    (3, "redirection"),
    (2, "success"),
)


def _status_class(code):
    """Return the class label for *code*; anything below 2xx is "info"."""
    scaled = code / 100
    return next(
        (label for floor, label in _STATUS_CLASS_FLOORS if scaled >= floor),
        "info",
    )


# Only existing keys are reassigned, so the dict keeps its size and order
# while it is being iterated.
for code in status2desc_dict:
    phrase = status2desc_dict[code]
    status2desc_dict[code] = f"{code} [{_status_class(code)}] {phrase}"


# Data Parsing and Pre-processing

In [None]:
log_dir = './apache_logs.txt'
matches = []

# Capture groups, in order:
#   IP, timestamp, request line, status, size, referrer, user-agent
# The IP class accepts digits and dots only; the previous class also let
# literal parentheses through.
rx_pattern = r'([\d.]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+|-) "(.*?)" "(.*?)"'
log_line_rx = re.compile(rx_pattern)  # compiled once, reused for every line

# Request lines may carry arbitrary bytes; errors="replace" keeps a single
# undecodable byte from aborting the whole parse with UnicodeDecodeError.
with open(log_dir, encoding="utf-8", errors="replace") as log:
    for line in log:
        match = log_line_rx.match(line)
        if match:
            matches.append(match.groups())
        else:
            # Show lines the pattern cannot parse instead of dropping them silently.
            print(line)

columns = ["IP", "Timestamp", "Request", "Status", "Size", "Referrer", "User-Agent"]
df = pd.DataFrame(matches, columns=columns)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
# A size logged as "-" is coerced to NaN and then stored as 0.
df['Size'] = pd.to_numeric(df['Size'], errors='coerce').fillna(0).astype(int)
df['Status'] = df['Status'].astype(int)

In [None]:
# Copy of the parsed log whose text columns carry a field-name prefix,
# so each embedded string states which field it came from.
emb_df = df.copy()
emb_df['Request'] = df['Request'].apply(lambda x: f"Request: {x}")


def _describe_status(code):
    """Return the labelled description for *code*, or a placeholder if it is not in the table."""
    # Logs can contain codes missing from status2desc_dict; a plain
    # subscript would raise KeyError and stop the notebook.
    return status2desc_dict.get(code, f"{code} [unknown]")


emb_df['Status-Description'] = df['Status'].apply(lambda x: f"Status: {_describe_status(x)}")
emb_df['Referrer'] = df['Referrer'].apply(lambda x: f"Referrer: {x}")
emb_df['User-Agent'] = df['User-Agent'].apply(lambda x: f"User-Agent: {x}")

# Embedding

In [None]:
# Preview the first rows of the prefixed frame (rendered as the cell's output).
emb_df.head()

In [None]:
# NOTE(review): the model name suggests it is tuned for dot-product scoring,
# while the indices below use L2 distance -- confirm this pairing is intended.
model = SentenceTransformer('multi-qa-distilbert-dot-v1')
indices_dir = 'indices'
emb_cols = ['Request', 'Status-Description', 'Referrer', 'User-Agent']

# Number of log rows; get_context reads this when searching.
doc_len = len(emb_df)

# One index file per embedded column.
index_paths = {
    col: os.path.join(indices_dir, f"faiss_{col}_index.bin") for col in emb_cols
}

indices = dict()

# Reuse saved indices only when every expected file is present. Checking for
# the directory alone would accept an empty or half-written directory, and
# listing the directory would try to load unrelated files in it.
if all(os.path.isfile(pth) for pth in index_paths.values()):

    print('Found indices, loading embeddings.')

    indices = {col: faiss.read_index(pth) for col, pth in index_paths.items()}

    if any(index.ntotal != doc_len for index in indices.values()):
        # Saved vectors were built from a different log; their row ids would
        # not line up with the current DataFrame.
        print('Loaded indices do not match the current log size, rebuilding.')
        indices = dict()
    else:
        print('Indices Loaded:')
        print(indices.keys())

else:

    print('Indices do not exist, embedding values.')

if not indices:

    os.makedirs(indices_dir, exist_ok=True)

    # Embed and index one column at a time so only one embedding matrix
    # is held in memory at once.
    for col in emb_cols:
        emb = model.encode(emb_df[col].tolist())
        index = faiss.IndexFlatL2(emb.shape[1])
        index.add(emb)
        faiss.write_index(index, index_paths[col])
        indices[col] = index

    print('Indices Created:')
    print(indices.keys())

# Query

In [None]:
def get_context(query, max_lines=25):
    """Return the log rows most similar to *query* as CSV text.

    The query is embedded with the module-level ``model`` and compared against
    every index in ``indices``. Each log row is ranked by the mean of its
    distances over ``emb_cols`` (smaller is closer). If the query contains
    IPv4-looking tokens, only rows whose ``IP`` equals one of them are kept.

    Args:
        query: Free-text question or search phrase.
        max_lines: Maximum number of log rows to include in the result.

    Returns:
        CSV text (header plus up to ``max_lines`` rows, original log columns
        only), or the string "No log found with given information." when no
        row qualifies.

    Raises:
        ValueError: If an index holds a different number of vectors than
            ``df`` has rows, which means the saved indices are stale.
    """

    def calculate_distances(query_embedding, indices):
        """Map each index name to the query's distance to every row, in row order."""
        query_matrix = np.array([query_embedding])

        distances = dict()
        for key, index in indices.items():
            n_rows = index.ntotal
            if n_rows != len(df):
                # Misaligned ids would attach distances to the wrong log rows.
                raise ValueError(
                    f"Index '{key}' holds {n_rows} vectors but the log has "
                    f"{len(df)} rows; rebuild the indices."
                )

            dts, idx = index.search(query_matrix, k=n_rows)

            # Scatter each distance to the position of its row id instead of
            # sorting (id, distance) pairs in Python. Ids reported as -1
            # (no result) are skipped and keep an infinite distance.
            distance = np.full(n_rows, np.inf, dtype=np.float64)
            found = idx[0] >= 0
            distance[idx[0][found]] = dts[0][found]
            distances[key] = distance

        return distances

    # Nothing to search in an empty log.
    if df.empty:
        return "No log found with given information."

    # Search only for specific ip(s), if found in query
    ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    query_ips = re.findall(ip_pattern, query)

    # Embed and calculate vector distances
    query_embedding = model.encode(query, convert_to_numpy=True)
    dist_dict = calculate_distances(query_embedding, indices)

    disp_df = df.copy()

    for key, dists in dist_dict.items():
        disp_df[f'{key}_distance'] = dists

    disp_df['distances_mean'] = disp_df[[f'{nm}_distance' for nm in emb_cols]].mean(axis=1)

    if query_ips:
        # Filter on the frame being returned rather than on a sibling frame.
        disp_df = disp_df[disp_df['IP'].isin(query_ips)]

    disp_df = disp_df.sort_values('distances_mean', ascending=True)

    # Debug
    display(disp_df.head())
    print(disp_df.shape)

    if disp_df.empty:
        return "No log found with given information."

    # Return only the original log columns, not the helper distance columns.
    return disp_df.iloc[:max_lines][list(df.columns)].to_csv(index=False)

In [None]:
# Number of 404 responses per client IP.
df.loc[df['Status'] == 404, 'IP'].value_counts()

In [None]:
# Try the retrieval step on a sample query and print the CSV context it returns.
print(get_context("SQL injection"))

In [None]:
class Chat:
    """Multi-turn conversation with the Anthropic Messages API.

    The instance keeps the running message history, so each call sends all
    earlier user and assistant turns along with the new message.
    """

    def __init__(self, system_text, model="claude-3-5-sonnet-20240620",
                 max_tokens=1000, temperature=0):
        """Create the API client and an empty conversation.

        Args:
            system_text: System prompt sent with every request.
            model: Model identifier passed to the API.
            max_tokens: Upper bound on tokens generated per reply.
            temperature: Sampling temperature passed to the API.
        """
        self.client = Anthropic(api_key=claude_key)
        self.system = system_text
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.messages = []

    def __call__(self, message, context=None):
        """Send *message* and return the assistant's reply text.

        Args:
            message: The user's text for this turn.
            context: Optional extra text placed before the message in the
                same user turn.

        Returns:
            The text of the first content block of the API response.

        Raises:
            Exception: Whatever the API client raises is propagated; in that
                case the conversation history is left as it was before the call.
        """

        # Build the user turn: optional context first, then the message.
        user_content = []
        if context:
            user_content.append({"type": "text", "text": context})
        user_content.append({"type": "text", "text": message})

        self.messages.append({"role": "user", "content": user_content})

        try:
            response = self.client.messages.create(
                model=self.model,
                max_tokens=self.max_tokens,
                temperature=self.temperature,
                system=self.system,
                messages=self.messages
            )
        except Exception:
            # Drop the unanswered user turn so a retry does not send it twice.
            self.messages.pop()
            raise

        reply_text = response.content[0].text

        # Record the reply so later turns include it.
        self.messages.append({
            "role": "assistant",
            "content": [{"type": "text", "text": reply_text}]
        })

        return reply_text

In [None]:
# Conversation object used by the interactive loop; the argument is the system prompt.
chat = Chat("You are a web security expert. You are tasked with analysing web logs and answering to questions/inquiries. A table to logs in csv format is given.")

In [None]:
# Interactive session: at most 10 inquiries, ended early by typing "esc".
print("Write your inquiry. Write \'esc\' to quit.")
for turn in range(10):
    print("> ", end="")
    u_msg = input()

    if u_msg == "esc":
        break

    # Echo the inquiry so it appears alongside the answer.
    print(u_msg)

    # Log context is retrieved for the opening inquiry only; later turns
    # rely on the conversation history kept by the chat object.
    context = get_context(u_msg) if turn == 0 else None
    print(chat(u_msg, context))