# • Converted the log file into a pandas DataFrame so that it can be manipulated.

In [26]:
import pandas as pd
import re

# Location of the raw web-server access log.
file_path = 'data/weblogdata.log'

# Column labels of the parsed DataFrame, in the order the fields are extracted below.
columns = ['IP', 'DATE&TIME', 'REQUEST_METHOD', 'URL', 'STATUS_CODE', 'SIZE', 'USER_AGENT', 'RANDOM_LOG_NUMBER']

# Compiled once, outside the loop. Capture groups:
#   1 IP, 2 timestamp, 3 request method, 4 URL, 5 protocol version,
#   6 status code, 7 size, 8 first quoted field (presumably the referrer;
#   not used), 9 user agent, 10 trailing log number.
log_pattern = re.compile(
    r'(\S+) - - \[(.*?)\] "(.*?) (.*?) (.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (\d+)'
)

# Capture groups that are kept, aligned one-to-one with `columns`
# (group 5 and group 8 are not stored).
kept_groups = (1, 2, 3, 4, 6, 7, 9, 10)

# Explicit encoding so decoding does not depend on the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Lines that do not match the expected layout are skipped.
parsed_rows = []
for line in lines:
    match = log_pattern.match(line)
    if match:
        # Passing several indices to group() returns the fields as one tuple.
        parsed_rows.append(match.group(*kept_groups))

weblogs = pd.DataFrame(parsed_rows, columns=columns)

weblogs.head()


Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE,USER_AGENT,RANDOM_LOG_NUMBER
0,83.117.110.156,03/Apr/2020:02:22:45 +0300,POST,/user/profile/settings,500,5032,Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_9 like...,405
1,99.199.99.35,07/Oct/2023:03:50:45 +0300,POST,/contact/,502,4935,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:8...,4582
2,88.4.198.250,08/Oct/2023:04:57:40 +0300,DELETE,/usr/login,403,5003,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,1526
3,152.84.79.11,27/Feb/2023:07:43:23 +0300,PUT,/api/v1/data,500,5075,Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_9 like...,1551
4,207.187.124.211,14/Aug/2020:03:47:56 +0300,POST,/about.php,403,4994,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,2612


# • Removed the columns that are not relevant to user questions.

In [27]:
# Columns that are dropped because they are not relevant to user questions.
irrelevant_columns = ["USER_AGENT", "RANDOM_LOG_NUMBER"]
weblogs = weblogs.drop(irrelevant_columns, axis=1)
weblogs.head()

Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE
0,83.117.110.156,03/Apr/2020:02:22:45 +0300,POST,/user/profile/settings,500,5032
1,99.199.99.35,07/Oct/2023:03:50:45 +0300,POST,/contact/,502,4935
2,88.4.198.250,08/Oct/2023:04:57:40 +0300,DELETE,/usr/login,403,5003
3,152.84.79.11,27/Feb/2023:07:43:23 +0300,PUT,/api/v1/data,500,5075
4,207.187.124.211,14/Aug/2020:03:47:56 +0300,POST,/about.php,403,4994


# • Converted the combined date-and-time field into a workable format and split it into two columns (DATE and TIME).

In [31]:
# Layout of the raw timestamp strings, e.g. "03/Apr/2020:02:22:45 +0300".
timestamp_format = '%d/%b/%Y:%H:%M:%S %z'

# Parse the text timestamps into real datetime values.
parsed_timestamps = pd.to_datetime(weblogs['DATE&TIME'], format=timestamp_format)
weblogs['DATE&TIME'] = parsed_timestamps

# Split the parsed timestamp into a date part and a time part.
weblogs['DATE'] = parsed_timestamps.dt.date
weblogs['TIME'] = parsed_timestamps.dt.time

# The combined column is no longer needed once it has been split.
weblogs = weblogs.drop("DATE&TIME", axis=1)

# Persist the tabular form of the logs (without the index column).
weblogs.to_csv('data/data.csv', index=False)

# • So that the RAG model can produce meaningful answers, all the information in each row was combined into a single sentence and stored in one column.

In [32]:
def _describe_request(row):
    """Return one log row rendered as a single sentence.

    `row` is one row of `weblogs`; the sentence is built from its IP,
    REQUEST_METHOD, URL, TIME, DATE, STATUS_CODE and SIZE fields.
    """
    return (
        f"The client with IP address {row['IP']}, sent and accessed the request method named {row['REQUEST_METHOD']}, request to the {row['URL']}, address "
        f"at {row['TIME']}, on {row['DATE']}, This request was responded with a {row['STATUS_CODE']} status code, "
        f"and a size of {row['SIZE']}."
    )


# Apply the helper row by row and store the sentences in a new column.
weblogs['LOG_CONTENT'] = weblogs.apply(_describe_request, axis=1)

weblogs.head()


Unnamed: 0,IP,REQUEST_METHOD,URL,STATUS_CODE,SIZE,LOG_CONTENT,DATE,TIME
0,83.117.110.156,POST,/user/profile/settings,500,5032,"The client with IP address 83.117.110.156, sen...",2020-04-03,02:22:45
1,99.199.99.35,POST,/contact/,502,4935,"The client with IP address 99.199.99.35, sent ...",2023-10-07,03:50:45
2,88.4.198.250,DELETE,/usr/login,403,5003,"The client with IP address 88.4.198.250, sent ...",2023-10-08,04:57:40
3,152.84.79.11,PUT,/api/v1/data,500,5075,"The client with IP address 152.84.79.11, sent ...",2023-02-27,07:43:23
4,207.187.124.211,POST,/about.php,403,4994,"The client with IP address 207.187.124.211, se...",2020-08-14,03:47:56


# • The other columns were discarded, leaving only the sentence column so that the data can be vectorized.

In [33]:
# Keep only the sentence column, still as a one-column DataFrame.
log_content_data = weblogs.loc[:, ['LOG_CONTENT']]

# Write the sentences out (without the index column).
log_content_data.to_csv('data/cleaned_data.csv', index=False)