# • Converted the log file into a Data Frame to manipulate it.

In [78]:
import pandas as pd
import re

file_path = 'weblogdata.log'

columns = ['IP', 'DATE&TIME', 'REQUEST_METHOD', 'URL', 'STATUS_CODE', 'SIZE', 'USER_AGENT', 'RANDOM_LOG_NUMBER']

with open(file_path, 'r') as file:
    lines = file.readlines()

weblogs = []
for line in lines:
    match = re.match(r'(\S+) - - \[(.*?)\] "(.*?) (.*?) (.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (\d+)', line)
    if match:
        ip = match.group(1)
        datetime = match.group(2)
        request_method = match.group(3)
        url = match.group(4)
        protocol_version = match.group(5)
        status_code = match.group(6)
        size = match.group(7)
        user_agent = match.group(9)
        random_log_number = match.group(10)

        weblogs.append([ip, datetime, request_method, url, status_code, size, user_agent, random_log_number])

weblogs = pd.DataFrame(weblogs, columns=columns)

weblogs.head()


Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE,USER_AGENT,RANDOM_LOG_NUMBER
0,126.12.38.164,25/Mar/2020:12:37:01 +0300,DELETE,/usr/register,502,4911,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,2524
1,22.229.149.200,10/Sep/2020:08:16:28 +0300,GET,/images/banner.jpg,200,4924,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,4620
2,192.156.48.167,07/Jan/2021:07:57:09 +0300,PUT,/api/v1/data,500,4935,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,3248
3,53.251.166.120,05/Jun/2023:03:30:04 +0300,PUT,/blog/latest-news,502,5040,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:8...,1817
4,115.161.186.58,19/Jul/2023:07:26:59 +0300,GET,/blog/latest-news,502,5072,Mozilla/5.0 (Android 10; Mobile; rv:84.0) Geck...,1433


# • Removed columns that are not relevant to user questions

In [79]:
weblogs = weblogs.drop(columns=["USER_AGENT", "RANDOM_LOG_NUMBER"])
weblogs.head()

Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE
0,126.12.38.164,25/Mar/2020:12:37:01 +0300,DELETE,/usr/register,502,4911
1,22.229.149.200,10/Sep/2020:08:16:28 +0300,GET,/images/banner.jpg,200,4924
2,192.156.48.167,07/Jan/2021:07:57:09 +0300,PUT,/api/v1/data,500,4935
3,53.251.166.120,05/Jun/2023:03:30:04 +0300,PUT,/blog/latest-news,502,5040
4,115.161.186.58,19/Jul/2023:07:26:59 +0300,GET,/blog/latest-news,502,5072


# • Converting Date and Time into workable format was divided into two.

In [80]:
weblogs['DATE&TIME'] = pd.to_datetime(weblogs['DATE&TIME'], format='%d/%b/%Y:%H:%M:%S %z')

weblogs['DATE'] = weblogs['DATE&TIME'].dt.date
weblogs['TIME'] = weblogs['DATE&TIME'].dt.time

weblogs = weblogs.drop(columns=["DATE&TIME"])

weblogs.to_csv('data.csv', index=False)

# • In order for the RAG model to produce meaningful answers, all information was brought together in a single column as a sentence.

In [83]:
weblogs['LOG_CONTENT'] = weblogs.apply(
    lambda row: (
        f"The client with IP address {row['IP']}, sent and accessed the request method named {row['REQUEST_METHOD']}, request to the {row['URL']}, address "
        f"at {row['TIME']}, on {row['DATE']}, This request was responded with a {row['STATUS_CODE']}, status code "
        f"and a size of {row['SIZE']}."
    ),
    axis=1
)


cleaned_data.head()


Unnamed: 0,IP,REQUEST_METHOD,URL,STATUS_CODE,SIZE,DATE,TIME,LOG_CONTENT
0,114.42.187.178,DELETE,/blog/latest-news,500,4962,2021-11-01,02:58:49,"The client with IP address 114.42.187.178, sen..."
1,106.200.197.40,PUT,/images/banner.jpg,304,4952,2022-06-16,11:28:41,"The client with IP address 106.200.197.40, sen..."
2,187.186.31.77,DELETE,/contact/,404,4981,2021-08-30,09:06:31,"The client with IP address 187.186.31.77, sent..."
3,193.58.194.239,DELETE,/images/banner.jpg,304,5002,2021-04-01,09:58:13,"The client with IP address 193.58.194.239, sen..."
4,83.98.129.151,DELETE,/blog/latest-news,500,5003,2020-11-08,05:11:55,"The client with IP address 83.98.129.151, sent..."


# • Other columns were discarded. Data was made vectorizable

In [84]:
# Select only the LOG_CONTENT column
log_content_data = weblogs[['LOG_CONTENT']]

# Save the new DataFrame with only the LOG_CONTENT column to a CSV file
log_content_data.to_csv('cleaned_data.csv', index=False)