In [58]:
import pandas as pd
import re

# Path to the raw web-server log file
file_path = 'weblogdata.log'

# Column headers of the parsed DataFrame
columns = ['IP', 'DATE&TIME', 'REQUEST_METHOD', 'URL', 'STATUS_CODE', 'SIZE', 'USER_AGENT', 'RANDOM_LOG_NUMBER']

# One log entry, in capture-group order:
#   1 IP, 2 [timestamp], 3-5 "METHOD URL PROTOCOL", 6 status code, 7 size,
#   8 a quoted field (presumably the referrer - confirm against the log format),
#   9 quoted user agent, 10 trailing integer.
# Compiled once so the pattern lookup is not repeated for every line.
log_pattern = re.compile(r'(\S+) - - \[(.*?)\] "(.*?) (.*?) (.*?)" (\d+) (\d+) "(.*?)" "(.*?)" (\d+)')

# Stream the file line by line instead of loading it entirely with readlines().
# The encoding is explicit so the result does not depend on the platform default;
# undecodable bytes are replaced rather than aborting the whole parse.
parsed_rows = []
skipped_lines = 0
with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
    for line in file:
        match = log_pattern.match(line)
        if match is None:
            # Keep count of lines that do not fit the pattern instead of dropping them silently
            skipped_lines += 1
            continue

        # Groups 5 (protocol version) and 8 are captured by the pattern but not kept.
        # `timestamp` replaces the former `datetime` name, which shadowed the stdlib module.
        (ip, timestamp, request_method, url,
         status_code, size, user_agent, random_log_number) = match.group(1, 2, 3, 4, 6, 7, 9, 10)

        parsed_rows.append([ip, timestamp, request_method, url, status_code, size, user_agent, random_log_number])

if skipped_lines:
    print(f"Skipped {skipped_lines} line(s) that did not match the expected log format")

# Build the DataFrame once from the collected rows
weblogs = pd.DataFrame(parsed_rows, columns=columns)

# Show the first few rows of the DataFrame
weblogs.head()


Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE,USER_AGENT,RANDOM_LOG_NUMBER
0,114.42.187.178,01/Nov/2021:02:58:49 +0300,DELETE,/blog/latest-news,500,4962,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:8...,1396
1,106.200.197.40,16/Jun/2022:11:28:41 +0300,PUT,/images/banner.jpg,304,4952,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,4657
2,187.186.31.77,30/Aug/2021:09:06:31 +0300,DELETE,/contact/,404,4981,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,3002
3,193.58.194.239,01/Apr/2021:09:58:13 +0300,DELETE,/images/banner.jpg,304,5002,Mozilla/5.0 (iPhone; CPU iPhone OS 12_4_9 like...,4530
4,83.98.129.151,08/Nov/2020:05:11:55 +0300,DELETE,/blog/latest-news,500,5003,Mozilla/5.0 (Linux; Android 10; ONEPLUS A6000)...,561


In [59]:
# Drop the columns that are not needed downstream.
# errors="ignore" keeps the cell idempotent: `weblogs` is rebound here, so on a
# re-run the columns are already gone and a plain drop would raise KeyError.
weblogs = weblogs.drop(columns=["USER_AGENT", "RANDOM_LOG_NUMBER"], errors="ignore")
weblogs.head()

Unnamed: 0,IP,DATE&TIME,REQUEST_METHOD,URL,STATUS_CODE,SIZE
0,114.42.187.178,01/Nov/2021:02:58:49 +0300,DELETE,/blog/latest-news,500,4962
1,106.200.197.40,16/Jun/2022:11:28:41 +0300,PUT,/images/banner.jpg,304,4952
2,187.186.31.77,30/Aug/2021:09:06:31 +0300,DELETE,/contact/,404,4981
3,193.58.194.239,01/Apr/2021:09:58:13 +0300,DELETE,/images/banner.jpg,304,5002
4,83.98.129.151,08/Nov/2020:05:11:55 +0300,DELETE,/blog/latest-news,500,5003


In [60]:
# Split the raw 'DATE&TIME' string into separate 'DATE' and 'TIME' columns.
# The guard makes the cell safe to re-run: after the first run 'DATE&TIME' has
# been dropped from `weblogs`, and parsing it again would raise KeyError.
if 'DATE&TIME' in weblogs.columns:
    # Parse e.g. "01/Nov/2021:02:58:49 +0300" into timezone-aware timestamps
    timestamps = pd.to_datetime(weblogs['DATE&TIME'], format='%d/%b/%Y:%H:%M:%S %z')

    # Build a new frame instead of mutating the existing one column by column;
    # DATE and TIME are appended at the end and the combined column is removed.
    weblogs = (
        weblogs
        .assign(DATE=timestamps.dt.date, TIME=timestamps.dt.time)
        .drop(columns=["DATE&TIME"])
    )

# Persist the cleaned table (without the index) for the next stage
weblogs.to_csv('data.csv', index=False)
weblogs.head()

Unnamed: 0,IP,REQUEST_METHOD,URL,STATUS_CODE,SIZE,DATE,TIME
0,114.42.187.178,DELETE,/blog/latest-news,500,4962,2021-11-01,02:58:49
1,106.200.197.40,PUT,/images/banner.jpg,304,4952,2022-06-16,11:28:41
2,187.186.31.77,DELETE,/contact/,404,4981,2021-08-30,09:06:31
3,193.58.194.239,DELETE,/images/banner.jpg,304,5002,2021-04-01,09:58:13
4,83.98.129.151,DELETE,/blog/latest-news,500,5003,2020-11-08,05:11:55


In [62]:
def describe_request(row):
    """Return a one-paragraph English description of a single log row.

    `row` is one row of the table read from 'data.csv'; the IP, REQUEST_METHOD,
    URL, TIME, DATE, STATUS_CODE and SIZE fields are interpolated into the text.
    """
    return (
        f"The client with IP address {row['IP']} sent a {row['REQUEST_METHOD']} request to the {row['URL']} address "
        f"at {row['TIME']} on {row['DATE']}. This request was responded with a {row['STATUS_CODE']} status code "
        f"and a size of {row['SIZE']}."
    )


# Reload the table exported by the previous step
cleaned_data = pd.read_csv('data.csv')

# Add a natural-language summary of every request, computed row by row
cleaned_data['LOG_CONTENT'] = cleaned_data.apply(describe_request, axis=1)

# Check the updated DataFrame
print(cleaned_data.head())

# Save the enriched data back to a CSV file
cleaned_data.to_csv('cleaned_data.csv', index=False)

               IP REQUEST_METHOD                 URL  STATUS_CODE  SIZE  \
0  114.42.187.178         DELETE   /blog/latest-news          500  4962   
1  106.200.197.40            PUT  /images/banner.jpg          304  4952   
2   187.186.31.77         DELETE           /contact/          404  4981   
3  193.58.194.239         DELETE  /images/banner.jpg          304  5002   
4   83.98.129.151         DELETE   /blog/latest-news          500  5003   

         DATE      TIME                                        LOG_CONTENT  
0  2021-11-01  02:58:49  The client with IP address 114.42.187.178 sent...  
1  2022-06-16  11:28:41  The client with IP address 106.200.197.40 sent...  
2  2021-08-30  09:06:31  The client with IP address 187.186.31.77 sent ...  
3  2021-04-01  09:58:13  The client with IP address 193.58.194.239 sent...  
4  2020-11-08  05:11:55  The client with IP address 83.98.129.151 sent ...  
