# • Converted the log file into a pandas DataFrame so that it can be manipulated.

In [11]:
import pandas as pd
import re

# Parse the raw web-server log into a DataFrame, one row per log line that
# matches the expected layout.
file_path = 'data/weblogdata.log'

columns = ['IP', 'DATE&TIME', 'REQUEST_METHOD', 'URL', 'STATUS_CODE', 'SIZE', 'USER_AGENT', 'RANDOM_LOG_NUMBER']

# Compiled once, outside the loop. Ten capture groups: IP, bracketed timestamp,
# the three space-separated parts of the quoted request, status, size, two
# quoted fields, and a trailing number.
log_line_pattern = re.compile(
    r'(\S+) - - \[(.*?)\] "(.*?) (.*?) (.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (\d+)'
)

# Explicit encoding so decoding does not depend on the platform locale;
# undecodable bytes are replaced instead of aborting the whole load.
with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
    lines = file.readlines()

rows = []
skipped_lines = 0
for line in lines:
    if not line.strip():
        # Blank lines carry no record and are not counted as malformed.
        continue

    match = log_line_pattern.match(line)
    if match is None:
        skipped_lines += 1
        continue

    # Groups 5 (third part of the request line) and 8 (first quoted field,
    # presumably the referer - confirm against the log format) are not kept.
    (ip, timestamp, request_method, url, _protocol_version,
     status_code, size, _quoted_field, user_agent, random_log_number) = match.groups()

    rows.append([ip, timestamp, request_method, url, status_code, size, user_agent, random_log_number])

# Surface dropped records instead of discarding them silently.
if skipped_lines:
    print(f"Skipped {skipped_lines} of {len(lines)} lines that did not match the expected log format.")

# All values are still strings at this point, exactly as captured by the regex.
weblogs = pd.DataFrame(rows, columns=columns)

weblogs.head()


Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE,USER_AGENT,RANDOM_LOG_NUMBER
0,52.74.106.242,25/Aug/2022:12:26:21 +0300,DELETE,/usr/register,502,5045,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,1193
1,77.239.28.23,11/Aug/2023:03:48:34 +0300,GET,/api/v1/data,304,5008,Mozilla/5.0 (Android 10; Mobile; rv:84.0) Geck...,1927
2,199.167.112.241,18/Apr/2020:12:48:44 +0300,PUT,/usr/admin/developer,200,4974,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,3423
3,68.46.97.32,19/Apr/2021:02:24:05 +0300,POST,/images/banner.jpg,304,4968,Mozilla/5.0 (Android 10; Mobile; rv:84.0) Geck...,525
4,31.25.18.184,26/Dec/2022:06:12:50 +0300,PUT,/usr/register,303,5034,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,1760


# • Removed the columns that are not relevant to user questions.

In [12]:
# Remove the USER_AGENT and RANDOM_LOG_NUMBER columns; every other column is kept.
unneeded_columns = ["USER_AGENT", "RANDOM_LOG_NUMBER"]
weblogs = weblogs.drop(unneeded_columns, axis=1)
weblogs.head()

Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE
0,52.74.106.242,25/Aug/2022:12:26:21 +0300,DELETE,/usr/register,502,5045
1,77.239.28.23,11/Aug/2023:03:48:34 +0300,GET,/api/v1/data,304,5008
2,199.167.112.241,18/Apr/2020:12:48:44 +0300,PUT,/usr/admin/developer,200,4974
3,68.46.97.32,19/Apr/2021:02:24:05 +0300,POST,/images/banner.jpg,304,4968
4,31.25.18.184,26/Dec/2022:06:12:50 +0300,PUT,/usr/register,303,5034


# • Converted the combined date-and-time field into a workable format and split it into separate DATE and TIME columns.

In [13]:
# Parse strings such as "25/Aug/2022:12:26:21 +0300" (day/abbreviated month/year,
# time, UTC offset) into timestamps.
timestamps = pd.to_datetime(weblogs['DATE&TIME'], format='%d/%b/%Y:%H:%M:%S %z')
weblogs['DATE&TIME'] = timestamps

# Store the calendar date and the time of day as two separate columns.
weblogs['DATE'], weblogs['TIME'] = timestamps.dt.date, timestamps.dt.time

# The combined column is no longer needed once it has been split.
weblogs = weblogs.drop('DATE&TIME', axis=1)

# Persist the intermediate table without the row index.
weblogs.to_csv('data/data.csv', index=False)

# • In order for the RAG model to produce meaningful answers, all fields of each log entry were combined into a single sentence stored in one column.

In [14]:
def _describe_request(entry):
    """Render one row of the log table as a single English sentence.

    ``entry`` is a row exposing the IP, REQUEST_METHOD, URL, TIME, DATE,
    STATUS_CODE and SIZE fields; the returned string embeds all of them.
    """
    request_part = f"The client with IP address {entry['IP']}, sent and accessed the request method named {entry['REQUEST_METHOD']}, request to the {entry['URL']}, address "
    response_part = f"at {entry['TIME']}, on {entry['DATE']}, This request was responded with a {entry['STATUS_CODE']}, status code "
    size_part = f"and a size of {entry['SIZE']}."
    return request_part + response_part + size_part


# One sentence per row, stored in a new LOG_CONTENT column.
weblogs['LOG_CONTENT'] = weblogs.apply(_describe_request, axis=1)

weblogs.head()


Unnamed: 0,IP,REQUEST_METHOD,URL,STATUS_CODE,SIZE,DATE,TIME,LOG_CONTENT
0,52.74.106.242,DELETE,/usr/register,502,5045,2022-08-25,12:26:21,"The client with IP address 52.74.106.242, sent..."
1,77.239.28.23,GET,/api/v1/data,304,5008,2023-08-11,03:48:34,"The client with IP address 77.239.28.23, sent ..."
2,199.167.112.241,PUT,/usr/admin/developer,200,4974,2020-04-18,12:48:44,"The client with IP address 199.167.112.241, se..."
3,68.46.97.32,POST,/images/banner.jpg,304,4968,2021-04-19,02:24:05,"The client with IP address 68.46.97.32, sent a..."
4,31.25.18.184,PUT,/usr/register,303,5034,2022-12-26,06:12:50,"The client with IP address 31.25.18.184, sent ..."


# • All other columns were discarded, keeping only LOG_CONTENT so that the data can be vectorized.

In [15]:
# Keep only the sentence column, as a one-column DataFrame, and write it out
# without the row index.
log_content_data = weblogs.loc[:, ['LOG_CONTENT']]

cleaned_path = 'data/cleaned_data.csv'
log_content_data.to_csv(cleaned_path, index=False)