In [3]:
import pandas as pd

# Read the combined request-log export into the working frame `df`;
# the bare name on the last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer="Log_dataset_combined.csv")
df

Unnamed: 0,Timestamp,IP_Address,Request_Type,Status_Code,Anomaly_Flag,User_Agent,Session_ID,Location
0,2023-01-01 00:00:00,202.118.116.11,GET,403,0,Edge,4835,Brazil
1,2023-01-01 00:01:00,38.30.40.178,DELETE,301,0,Bot,3176,China
2,2023-01-01 00:02:00,209.5.148.15,POST,500,0,Opera,4312,China
3,2023-01-01 00:03:00,211.116.60.71,GET,301,0,Bot,1003,France
4,2023-01-01 00:04:00,170.166.36.145,POST,404,0,Firefox,1428,Germany
...,...,...,...,...,...,...,...,...
89995,2025-06-06 04:37:50,66.35.22.135,POST,200,0,Edge,2601,Gambia
89996,2025-02-19 17:23:07,130.239.188.233,GET,500,0,Edge,1951,Norway
89997,2025-06-13 05:01:31,14.183.150.102,GET,500,0,Chrome,2640,Gibraltar
89998,2025-07-13 02:34:03,174.160.87.44,POST,404,0,Firefox,2404,Bulgaria


In [4]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()


Timestamp       0
IP_Address      0
Request_Type    0
Status_Code     0
Anomaly_Flag    0
User_Agent      0
Session_ID      0
Location        0
dtype: int64

In [5]:
# Number of distinct client IPs in the log.
df.loc[:, 'IP_Address'].nunique()

33979

In [6]:
# The ten IPs with the most requests (value_counts sorts by count, descending).
df['IP_Address'].value_counts().iloc[:10]

IP_Address
15.6.62.53         49
60.21.118.239      13
177.157.198.23     12
169.237.89.18      11
18.109.115.239     11
7.139.89.24        11
85.114.198.91      11
195.110.254.130    11
29.8.50.172        11
3.9.126.206        10
Name: count, dtype: int64

In [15]:
# Clean & normalize values
# Each column is overwritten in place on `df`, so later cells see the cleaned data.

# Request methods: upper-case them, then label missing ones "UNKNOWN".
request_type_upper = df['Request_Type'].str.upper()
df['Request_Type'] = request_type_upper.fillna("UNKNOWN")

# Status codes: values that cannot be parsed as numbers become NaN (errors='coerce'),
# which is then replaced by 0 before casting to int.
status_numeric = pd.to_numeric(df['Status_Code'], errors='coerce')
df['Status_Code'] = status_numeric.fillna(0).astype(int)

# Columns that only need their missing values replaced by the "UNKNOWN" label.
for label_col in ('User_Agent', 'Session_ID', 'Location'):
    df[label_col] = df[label_col].fillna("UNKNOWN")

# Anomaly flag: a missing flag is treated as 0.
anomaly_flag_filled = df['Anomaly_Flag'].fillna(0)
df['Anomaly_Flag'] = anomaly_flag_filled.astype(int)


In [20]:
import pandas as pd

# 1. Grouping by IP
# The per-IP aggregates in this cell reuse this single groupby object.
grouped = df.groupby('IP_Address')

# 2. Total requests per IP
# Group size = number of rows (requests) logged for each IP.
total_requests = grouped.size()
total_requests = total_requests.rename("Total_Requests")

# 3. Request type distribution
# Count how many times each request type (GET, POST, DELETE, etc.) was used per IP.
# unstack(fill_value=0) turns the request types into columns and puts 0 where an
# IP never used a given type.
request_type_tally = grouped['Request_Type'].value_counts()
request_type_counts = request_type_tally.unstack(fill_value=0)

# Divide each row by its own sum (that IP's total requests) to get the share of
# each request type, then suffix the column names with "_Perc".
requests_per_ip = request_type_counts.sum(axis=1)
request_type_perc = request_type_counts.div(requests_per_ip, axis=0)
request_type_perc.columns = [f"{req_type}_Perc" for req_type in request_type_perc.columns]

# 4. Status code classes (4xx, 5xx)
# Adds a new column Status_Class with the values 4xx, 5xx, or Other.
# Used for calculating the error rate per IP.
def classify_status(code):
    """Return the status class label for an HTTP status code.

    Codes in [400, 500) give '4xx', codes in [500, 600) give '5xx', and any
    other value (including one that fails both range checks) gives 'Other'.
    """
    # (label, inclusive lower bound, exclusive upper bound), checked in order.
    error_ranges = (
        ('4xx', 400, 500),
        ('5xx', 500, 600),
    )
    for label, lower, upper in error_ranges:
        if lower <= code < upper:
            return label
    return 'Other'

# Tag each request with its status class, then compute per IP the share of
# requests in each class (class count divided by that IP's total requests).
df['Status_Class'] = df['Status_Code'].apply(classify_status)
status_class_counts = (
    df.groupby('IP_Address')['Status_Class']
    .value_counts()
    .unstack(fill_value=0)
    # Pin the class columns so 4xx_Perc / 5xx_Perc / Other_Perc always exist,
    # even when the log contains no request of one class; otherwise the
    # feature table's schema would depend on the data.
    .reindex(columns=['4xx', '5xx', 'Other'], fill_value=0)
)
status_class_perc = status_class_counts.div(status_class_counts.sum(axis=1), axis=0)
status_class_perc.columns = [f"{col}_Perc" for col in status_class_perc.columns]

# 5. Unique counts
# Number of distinct User Agents, Sessions and Locations seen for each IP.
# Higher values = more variation = may be suspicious.
unique_user_agents = grouped['User_Agent'].nunique().rename("Unique_User_Agents")
unique_sessions = grouped['Session_ID'].nunique().rename("Unique_Sessions")
unique_locations = grouped['Location'].nunique().rename("Unique_Locations")

# 6. Anomaly count
# Sum of Anomaly_Flag per IP, i.e. the number of flagged requests
# (assumes the flag only takes the values 0 and 1 -- TODO confirm).
anomaly_count = grouped['Anomaly_Flag'].sum().rename("Anomaly_Count")

# 7. Most frequent user agent/location
# value_counts() tallies each value within the IP's rows and idxmax() returns
# the label with the highest count (the first such label when several tie).
top_user_agent = grouped['User_Agent'].agg(lambda x: x.value_counts().idxmax()).rename("Top_User_Agent")
top_location = grouped['Location'].agg(lambda x: x.value_counts().idxmax()).rename("Top_Location")

# 8. Combining everything
# Every piece is indexed by IP_Address, so concat(axis=1) aligns them row by row.
ip_level_features = pd.concat([
    total_requests,
    request_type_perc,
    status_class_perc,
    unique_user_agents,
    unique_sessions,
    unique_locations,
    anomaly_count,
    top_user_agent,
    top_location
], axis=1)

# 9. Replacing NaNs with 0
# Only numeric feature columns are filled, and the result is rebound instead of
# using inplace=True: a blanket fillna(0) would write the integer 0 into the
# string columns Top_User_Agent / Top_Location.
numeric_cols = ip_level_features.select_dtypes(include='number').columns
ip_level_features = ip_level_features.fillna({col: 0 for col in numeric_cols})

ip_level_features.reset_index().head()


Unnamed: 0,IP_Address,Total_Requests,DELETE_Perc,GET_Perc,POST_Perc,PUT_Perc,4xx_Perc,5xx_Perc,Other_Perc,Unique_User_Agents,Unique_Sessions,Unique_Locations,Anomaly_Count,Top_User_Agent,Top_Location
0,1.0.25.247,3,0.666667,0.0,0.0,0.333333,0.333333,0.0,0.666667,2,3,3,0,Opera,Mongolia
1,1.1.180.252,4,0.25,0.25,0.5,0.0,0.25,0.25,0.5,2,4,4,0,Firefox,Montenegro
2,1.1.21.105,7,0.142857,0.285714,0.428571,0.142857,0.428571,0.285714,0.285714,4,7,7,1,Safari,Australia
3,1.1.62.217,4,0.0,0.5,0.25,0.25,0.0,0.75,0.25,3,4,4,0,Chrome,Switzerland
4,1.100.140.68,4,0.0,0.5,0.5,0.0,0.5,0.0,0.5,3,4,4,0,Bot,Belize


In [21]:
# Render the per-IP feature table (indexed by IP_Address) as the cell output.
ip_level_features

Unnamed: 0_level_0,Total_Requests,DELETE_Perc,GET_Perc,POST_Perc,PUT_Perc,4xx_Perc,5xx_Perc,Other_Perc,Unique_User_Agents,Unique_Sessions,Unique_Locations,Anomaly_Count,Top_User_Agent,Top_Location
IP_Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0.25.247,3,0.666667,0.000000,0.000000,0.333333,0.333333,0.000000,0.666667,2,3,3,0,Opera,Mongolia
1.1.180.252,4,0.250000,0.250000,0.500000,0.000000,0.250000,0.250000,0.500000,2,4,4,0,Firefox,Montenegro
1.1.21.105,7,0.142857,0.285714,0.428571,0.142857,0.428571,0.285714,0.285714,4,7,7,1,Safari,Australia
1.1.62.217,4,0.000000,0.500000,0.250000,0.250000,0.000000,0.750000,0.250000,3,4,4,0,Chrome,Switzerland
1.100.140.68,4,0.000000,0.500000,0.500000,0.000000,0.500000,0.000000,0.500000,3,4,4,0,Bot,Belize
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99.96.148.251,2,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,2,2,2,0,Opera,Marshall Islands
99.96.176.36,5,0.400000,0.200000,0.400000,0.000000,0.200000,0.200000,0.600000,3,5,5,0,Firefox,Libyan Arab Jamahiriya
99.97.118.185,1,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,1,1,1,0,Bot,Antarctica (the territory South of 60 deg S)
99.97.154.134,1,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,1.000000,1,1,1,0,Safari,USA


In [23]:
# Persist the per-IP features. reset_index() moves IP_Address from the index
# into a regular column, so the file is written without a separate index column.
ip_level_export = ip_level_features.reset_index()
ip_level_export.to_csv("IP_Level_log_dataset.csv", index=False)
