# • Converted the log file into a Data Frame to manipulate it.

In [85]:
import pandas as pd
import re

file_path = 'data/weblogdata.log'

columns = ['IP', 'DATE&TIME', 'REQUEST_METHOD', 'URL', 'STATUS_CODE', 'SIZE', 'USER_AGENT', 'RANDOM_LOG_NUMBER']

with open(file_path, 'r') as file:
    lines = file.readlines()

weblogs = []
for line in lines:
    match = re.match(r'(\S+) - - \[(.*?)\] "(.*?) (.*?) (.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (\d+)', line)
    if match:
        ip = match.group(1)
        datetime = match.group(2)
        request_method = match.group(3)
        url = match.group(4)
        protocol_version = match.group(5)
        status_code = match.group(6)
        size = match.group(7)
        user_agent = match.group(9)
        random_log_number = match.group(10)

        weblogs.append([ip, datetime, request_method, url, status_code, size, user_agent, random_log_number])

weblogs = pd.DataFrame(weblogs, columns=columns)

weblogs.head()


Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE,USER_AGENT,RANDOM_LOG_NUMBER
0,215.248.174.197,19/Aug/2023:12:37:33 +0300,GET,/download/file.pdf,403,5062,Mozilla/5.0 (Android 10; Mobile; rv:84.0) Geck...,2640
1,52.252.193.24,29/Feb/2020:11:04:32 +0300,DELETE,/usr/admin,500,4992,Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_9 like...,2697
2,203.216.96.23,22/Feb/2021:12:57:07 +0300,PUT,/images/banner.jpg,303,5055,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,4458
3,33.175.116.61,28/Mar/2021:10:25:26 +0300,DELETE,/usr/admin/developer,403,4997,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,4705
4,72.193.29.169,21/Mar/2022:01:23:38 +0300,GET,/download/file.pdf,304,4994,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,1314


# • Removed columns that are not relevant to user questions

In [86]:
weblogs = weblogs.drop(columns=["USER_AGENT", "RANDOM_LOG_NUMBER"])
weblogs.head()

Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE
0,215.248.174.197,19/Aug/2023:12:37:33 +0300,GET,/download/file.pdf,403,5062
1,52.252.193.24,29/Feb/2020:11:04:32 +0300,DELETE,/usr/admin,500,4992
2,203.216.96.23,22/Feb/2021:12:57:07 +0300,PUT,/images/banner.jpg,303,5055
3,33.175.116.61,28/Mar/2021:10:25:26 +0300,DELETE,/usr/admin/developer,403,4997
4,72.193.29.169,21/Mar/2022:01:23:38 +0300,GET,/download/file.pdf,304,4994


# • Converting Date and Time into workable format was divided into two.

In [87]:
weblogs['DATE&TIME'] = pd.to_datetime(weblogs['DATE&TIME'], format='%d/%b/%Y:%H:%M:%S %z')

weblogs['DATE'] = weblogs['DATE&TIME'].dt.date
weblogs['TIME'] = weblogs['DATE&TIME'].dt.time

weblogs = weblogs.drop(columns=["DATE&TIME"])

weblogs.to_csv('data/data.csv', index=False)

# • In order for the RAG model to produce meaningful answers, all information was brought together in a single column as a sentence.

In [88]:
weblogs['LOG_CONTENT'] = weblogs.apply(
    lambda row: (
        f"The client with IP address {row['IP']}, sent and accessed the request method named {row['REQUEST_METHOD']}, request to the {row['URL']}, address "
        f"at {row['TIME']}, on {row['DATE']}, This request was responded with a {row['STATUS_CODE']}, status code "
        f"and a size of {row['SIZE']}."
    ),
    axis=1
)


cleaned_data.head()


Unnamed: 0,IP,REQUEST_METHOD,URL,STATUS_CODE,SIZE,DATE,TIME,LOG_CONTENT
0,114.42.187.178,DELETE,/blog/latest-news,500,4962,2021-11-01,02:58:49,"The client with IP address 114.42.187.178, sen..."
1,106.200.197.40,PUT,/images/banner.jpg,304,4952,2022-06-16,11:28:41,"The client with IP address 106.200.197.40, sen..."
2,187.186.31.77,DELETE,/contact/,404,4981,2021-08-30,09:06:31,"The client with IP address 187.186.31.77, sent..."
3,193.58.194.239,DELETE,/images/banner.jpg,304,5002,2021-04-01,09:58:13,"The client with IP address 193.58.194.239, sen..."
4,83.98.129.151,DELETE,/blog/latest-news,500,5003,2020-11-08,05:11:55,"The client with IP address 83.98.129.151, sent..."


# • Other columns were discarded. Data was made vectorizable

In [89]:
log_content_data = weblogs[['LOG_CONTENT']]

log_content_data.to_csv('data/cleaned_data.csv', index=False)