In [638]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re # regex
from urllib.parse import urlparse, parse_qs # urlparse & parse_qs
from scipy.stats import entropy # Cho url_entropy ở

In [639]:
path = '/kaggle/input/labeldata/normal_labeled.log'

In [640]:
import re
from typing import Union, Iterable, List, Dict, Optional
import pandas as pd

# ──────────────────────────────────────────────────────────────
# 1) Pre-compile regex (Nginx / Apache “combined” format)
#    - STRICT   : có dấu " quanh request + referrer + user-agent
#    - FALLBACK : thiếu hoặc hỏng dấu " (bắt thoáng hơn)
# ──────────────────────────────────────────────────────────────
NGINX_REGEX_STRICT = re.compile(
    r'(?P<ip>\S+)\s+-\s+-\s+'                              # IP - -
    r'\[(?P<timestamp>[^\]]+)]\s+'                         # [timestamp]
    r'"(?P<method>[A-Z]+)\s+'                              # "METHOD␣
    r'(?P<url>.+?)\s+'                                     # URL (non-greedy)
    r'(?P<protocol>[A-Z]+/\d(?:\.\d)?)"\s+'                # PROTOCOL"
    r'(?P<status>\d{3}|-)\s+'                              # status
    r'(?P<size>\d+|-)\s+'                                  # size
    r'"(?P<referrer>[^"]*)"\s+'                            # "referrer"
    r'"(?P<user_agent>[^"]*)"'                            # "user-agent"
    r'(?:[ \t]+(?P<label>[01]))?$',        #  ← thêm nhóm label tuỳ chọn
    flags=re.IGNORECASE,
)

NGINX_REGEX_FALLBACK = re.compile(
    r'(?P<ip>\S+)\s+-\s+-\s+'                              # IP - -
    r'\[(?P<timestamp>[^\]]+)]\s+'                         # [timestamp]
    r'(?P<method>[A-Z]+)\s+'                               # METHOD
    r'(?P<url>.+?)\s+'                                     # URL
    r'(?P<protocol>[A-Z]+/\d(?:\.\d)?)\s+'                 # PROTOCOL
    r'(?P<status>\d{3}|-)\s+'                              # status
    r'(?P<size>\d+|-)\s+'                                  # size
    r'(?P<referrer>\S+|-)\s+'                              # referrer (không quotes)
    r'(?P<user_agent>.+)'                                 # user-agent (còn lại)
    r'(?:[ \t]+(?P<label>[01]))?$',        #  ← thêm nhóm label tuỳ chọn

    flags=re.IGNORECASE,
)

# Gộp thành tuple để lần lượt thử
NGINX_COMBINED_PATTERNS = (NGINX_REGEX_STRICT, NGINX_REGEX_FALLBACK)

# ──────────────────────────────────────────────────────────────
# 2) Tiện ích: loại bỏ ký tự control (nếu log bị lẫn \x00 …)
# ──────────────────────────────────────────────────────────────
def strip_control(s: str) -> str:
    """Remove leading control chars (0x00–0x1F) ở đầu dòng."""
    return re.sub(r'^[\x00-\x1F]+', "", s)

# ──────────────────────────────────────────────────────────────
# 3) Hàm wrapper parse_nginx_log
# ──────────────────────────────────────────────────────────────
def parse_nginx_log(
    source: Union[str, Iterable[str]],
    patterns: Iterable[re.Pattern] = NGINX_COMBINED_PATTERNS,
    as_dataframe: bool = True,
    encoding: Optional[str] = "utf-8",
) -> Union[pd.DataFrame, List[Dict[str, str]]]:
    """
    Parse log Nginx / Apache (combined) thành list[dict] hoặc pandas.DataFrame.

    Args:
        source (str | Iterable[str]):
            • Chuỗi đường dẫn file, hoặc
            • Iterable (list, generator, ...) các dòng log.
        patterns (Iterable[re.Pattern]): Danh sách regex sẽ thử lần lượt.
        as_dataframe (bool): True -> trả về DataFrame, False -> list[dict].
        encoding (str | None): Encoding khi mở file (nếu source là path).

    Returns:
        pandas.DataFrame | list[dict]
    """
    # 1) Lấy iterator dòng log
    if isinstance(source, str):                # truyền path
        fh = open(source, "r", encoding=encoding, errors="replace")
        lines = fh
        close_file = True
    else:                                      # iterable dòng
        lines = source
        close_file = False

    # 2) Parse
    parsed: List[Dict[str, str]] = []
    for raw_line in lines:
        line = strip_control(raw_line.rstrip("\n"))
        for pat in patterns:
            m = pat.match(line)
            if m:
                parsed.append(m.groupdict())
                break                          # matched → sang dòng kế
        # nếu muốn ghi lại MISS, thêm else: missed.append(line)

    # 3) Đóng file nếu cần
    if close_file:
        fh.close()

    # 4) Trả kết quả
    return pd.DataFrame(parsed) if as_dataframe else parsed

In [641]:
# stores output in parsed_log.csv
import pandas as pd
df = parse_nginx_log(path)

In [642]:
df.dropna(subset=['label'], inplace=True)
df['label'] = df['label'].astype(int)

In [643]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df, 
    test_size=0.2, 
    random_state=42,
    stratify=df['label'] # Rất quan trọng để giữ tỷ lệ label trong cả 2 tập
)

print("Kích thước tập Train:", train_df.shape)
print("Kích thước tập Test:", test_df.shape)

Kích thước tập Train: (83946, 10)
Kích thước tập Test: (20987, 10)


In [644]:
df.isna().sum()/len(df)

ip            0.0
timestamp     0.0
method        0.0
url           0.0
protocol      0.0
status        0.0
size          0.0
referrer      0.0
user_agent    0.0
label         0.0
dtype: float64

In [645]:
df.sample(5)

Unnamed: 0,ip,timestamp,method,url,protocol,status,size,referrer,user_agent,label
17774,203.0.113.182,06/Jun/2025:12:08:55 +0000,POST,/dashboard,HTTP/1.1,200,1362,https://twitter.com/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{0}.0.{1}.{2} Safari/537.36",0
44553,192.168.1.184,10/Jun/2025:10:50:15 +0000,POST,/user/762,HTTP/1.1,200,1446,https://twitter.com/,Twitterbot/1.0,0
19018,10.0.0.243,05/Jun/2025:13:55:04 +0000,GET,/admin,HTTP/1.1,200,4856,https://www.facebook.com/,"Mozilla/5.0 (Linux; Android {0}; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{1}.0.{2}.{3} Mobile Safari/537.36",0
82813,101.102.79.236,04/Jun/2025:19:40:42 +0000,POST,/api/v1/products,HTTP/1.1,200,2382,https://www.linkedin.com/,Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm),0
84886,45.67.248.81,09/Jun/2025:05:46:50 +0000,GET,/user/959,HTTP/1.1,200,2337,https://example.com/,Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm),0


## Time-based feature handling

In [646]:
def timestamp_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Nhận vào một DataFrame chứa cột 'timestamp' và trả về DataFrame
    đã được bổ sung đầy đủ các feature về thời gian.
    """
    df['timestamp_dt'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z', errors='coerce')
    
    df = df.sort_values('timestamp_dt').reset_index(drop=True)

    df['hour_of_day'] = df['timestamp_dt'].dt.hour
    df['day_of_week'] = df['timestamp_dt'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

    def get_part_of_day(hour):
        if 5 <= hour < 12:
            return 'morning'
        elif 12 <= hour < 17:
            return 'afternoon'
        elif 17 <= hour < 21:
            return 'evening'
        else:
            return 'night'
    df['part_of_day'] = df['hour_of_day'].apply(get_part_of_day)

    df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    
    df['time_since_last_event'] = df['timestamp_dt'].diff().dt.total_seconds().fillna(0)
    df.drop(columns=["timestamp_dt"], inplace=True)
    df.drop(columns=["timestamp"], inplace=True)

    return df

In [647]:
df = timestamp_features(df)
df.sample(5)

Unnamed: 0,ip,method,url,protocol,status,size,referrer,user_agent,label,hour_of_day,day_of_week,is_weekend,part_of_day,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,time_since_last_event
49006,10.0.0.203,GET,/wp-admin,HTTP/1.1,200,1160,-,"Mozilla/5.0 (Linux; Android {0}; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{1}.0.{2}.{3} Mobile Safari/537.36",0,10,5,1,morning,0.5,-0.866025,-0.974928,-0.222521,1.0
63008,203.0.113.58,POST,/api/v1/products,HTTP/1.1,200,2478,https://www.linkedin.com/,"Mozilla/5.0 (iPhone; CPU iPhone OS {0}_{1} like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{2}.0 Mobile/15E148 Safari/604.1",0,12,6,1,afternoon,1.224647e-16,-1.0,-0.781831,0.62349,4.0
56257,45.67.44.199,POST,/admin/login,HTTP/1.1,200,4114,https://example.com/blog/blog,"Mozilla/5.0 (Linux; Android {0}; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{1}.0.{2}.{3} Mobile Safari/537.36",0,0,6,1,night,0.0,1.0,-0.781831,0.62349,4.0
66927,203.0.113.229,POST,/search,HTTP/1.1,200,1324,https://twitter.com/,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html),0,20,6,1,evening,-0.8660254,0.5,-0.781831,0.62349,2.0
101191,172.16.0.223,GET,/admin,HTTP/1.1,200,4451,-,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{0}.0) Gecko/20100101 Firefox/{0}.0,0,14,2,0,afternoon,-0.5,-0.866025,0.974928,-0.222521,12.0


# Feature extraction process

## URL Feature handling

## URL Suspicious Patterns

In [648]:
import pandas as pd
import re
from urllib.parse import unquote_plus
import base64

# ==============================================================================
# HÀM PHÂN TÍCH CUỐI CÙNG (DANH SÁCH PATTERN ĐÃ ĐƯỢC DỌN DẸP)
# ==============================================================================
def final_url_analyzer(url_string: str) -> tuple:
    """
    Hàm cuối cùng, với danh sách pattern đã được dọn dẹp và sửa lỗi.
    """
    # --- BỘ QUY TẮC ĐÃ ĐƯỢC TINH GỌN VÀ SỬA LỖI ---
    patterns = [
        # --- SQL Injection ---
        r'(union\s+select)',
        r'(select\s+.*\s+from)',
        r'(insert\s+into)',
        r'(delete\s+from)',
        r'(drop\s+table)',
        r'(--|#|\/\*|;\s*--)',
        r'(load_file\s*\()',
        r'(information_schema\.)',
        r'(pg_sleep|waitfor\s+delay|sleep|benchmark)\s*\(',
        r'(xp_cmdshell)',
        # <<< Sửa lỗi cho log #7: bắt OR '1'='1 và các dạng tương tự >>>
        r"(?:'|\")\s*or\s+(?:'|\")?.+?(?:'|\")?\s*=\s*(?:'|\")?.+?(?:'|\")?",

        # --- XSS ---
        r'(<script)',
        r'(<iframe)',
        r'(<svg)',
        r'(<img\s+[^>]*src\s*=\s*[\'"]?javascript:)',
        r'(on(error|load|mouseover)\s*=)',
        r'(eval\s*\()',
        r'(document\.cookie)',
        r'(alert\s*\()',

        # --- Path Traversal & File Inclusion ---
        r'(\.\.\/|\.\.\\|%2e%2e|%c0%ae)',
        r'(etc\/(passwd|shadow))',
        r'(proc\/(self|environ))',
        r'(boot\.ini|win\.ini)',
        r'(php|file|data):\/\/',

        # --- Command Injection ---
        r'(&&|;|\||`|\$\()',
        r'(\b(cat|ls|whoami|id|wget|curl|bash|sh|cmd)\s+)',
        r'(\/bin\/(ba)?sh)',
    ]
    combined_pattern = re.compile('|'.join(patterns), re.IGNORECASE)

    strings_to_check = set()
    try:
        strings_to_check.add(url_string)
        decoded_url = unquote_plus(unquote_plus(url_string))
        strings_to_check.add(decoded_url)
    except:
        decoded_url = url_string

    # Tách các phần và thử decode hex/base64
    parts = re.split(r'[=,&;/?]', decoded_url)
    for part in parts:
        part = part.strip()
        if len(part) < 4:
            continue
        # hex
        try:
            if all(c in '0123456789abcdefABCDEF' for c in part) and len(part) % 2 == 0:
                strings_to_check.add(bytes.fromhex(part).decode('utf-8', 'ignore'))
        except:
            pass
        # base64
        try:
            missing_padding = len(part) % 4
            if missing_padding:
                part += '=' * (4 - missing_padding)
            strings_to_check.add(base64.b64decode(part).decode('utf-8', 'ignore'))
        except:
            pass

    # Quét từng chuỗi
    for text in strings_to_check:
        match = combined_pattern.search(text)
        if match:
            return 1

    return 0



In [649]:
df[df[['url', 'method', 'protocol']].isnull().any(axis=1)]['url']


Series([], Name: url, dtype: object)

In [650]:
df[['is_suspicious']] = df['url'].apply(lambda x: pd.Series(final_url_analyzer(x)))
df

Unnamed: 0,ip,method,url,protocol,status,size,referrer,user_agent,label,hour_of_day,day_of_week,is_weekend,part_of_day,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,time_since_last_event,is_suspicious
0,55.11.206.13,GET,/search?q=PGJvZHkgb25sb2FkPWFsZXJ0KCdYU1MnKT4=,HTTP/1.0,500,3780,http://evil.com,sqlmap/1.4.12.1#dev,1,6,6,1,morning,1.000000,6.123234e-17,-0.781831,0.623490,0.0,1
1,55.11.206.13,GET,/search?q=PGJvZHkgb25sb2FkPWFsZXJ0KCdYU1MnKT4=,HTTP/2.0,499,604,-,sqlmap/1.4.12.1#dev,1,6,6,1,morning,1.000000,6.123234e-17,-0.781831,0.623490,6.0,1
2,55.11.206.13,DELETE,/search?q=<a href='javascript:alert(1)'>click</a>,HTTP/1.0,404,1730,-,sqlmap/1.4.12.1#dev,1,6,6,1,morning,1.000000,6.123234e-17,-0.781831,0.623490,1.0,1
3,55.11.206.13,DELETE,/run?cmd=%2526%2526%2520whoami,HTTP/1.0,403,211,-,sqlmap/1.4.12.1#dev,1,6,6,1,morning,1.000000,6.123234e-17,-0.781831,0.623490,0.0,1
4,55.11.206.13,GET,/search?q=PGJvZHkgb25sb2FkPWFsZXJ0KCdYU1MnKT4=,HTTP/2.0,405,451,http://evil.com,sqlmap/1.4.12.1#dev,1,6,6,1,morning,1.000000,6.123234e-17,-0.781831,0.623490,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104928,172.16.0.31,GET,/about,HTTP/1.1,200,3880,https://www.linkedin.com/,"Mozilla/5.0 (iPhone; CPU iPhone OS {0}_{1} like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{2}.0 Mobile/15E148 Safari/604.1",0,21,2,0,night,-0.707107,7.071068e-01,0.974928,-0.222521,1.0,0
104929,198.51.100.178,GET,/category/655/items,HTTP/1.1,200,1363,https://www.bing.com/,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html),0,21,2,0,night,-0.707107,7.071068e-01,0.974928,-0.222521,6.0,0
104930,203.0.113.250,POST,/product/720/detail,HTTP/1.1,200,465,https://example.com/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{0}.0.{1}.{2} Safari/537.36",0,21,2,0,night,-0.707107,7.071068e-01,0.974928,-0.222521,4.0,0
104931,198.51.100.133,POST,/sitemap.xml,HTTP/1.1,404,2857,https://twitter.com/,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{0}_{1}) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{2}.0 Safari/605.1.15",0,21,2,0,night,-0.707107,7.071068e-01,0.974928,-0.222521,12.0,0


## User Agent Features

In [651]:
from user_agents import parse

def calculate_entropy(text_string: str) -> float:
    """
    Tính entropy của chuỗi ký tự (dựa trên xác suất xuất hiện ký tự).
    """
    import math
    from collections import Counter

    if not text_string:
        return 0.0

    counts = Counter(text_string)
    total = len(text_string)
    entropy = -sum((count / total) * math.log2(count / total) for count in counts.values())
    return entropy


def user_agent_features(df):
    """
    Thêm các đặc trưng liên quan đến User-Agent vào DataFrame đầu vào.
    """
    df = df.copy()  # tránh tác động trực tiếp

    df['ua_parsed'] = df['user_agent'].astype(str).apply(parse)

    # 1. Trình duyệt (Browser Family)
    df['ua_browser_family'] = df['ua_parsed'].apply(lambda ua: ua.browser.family)

    # 2. Phiên bản trình duyệt (Major Version)
    df['ua_browser_version_major'] = df['ua_parsed'].apply(lambda ua: ua.browser.version[0] if ua.browser.version else None)

    # 3. Hệ điều hành (OS Family)
    df['ua_os_family'] = df['ua_parsed'].apply(lambda ua: ua.os.family)

    # 4. Phiên bản hệ điều hành (Major Version)
    df['ua_os_version_major'] = df['ua_parsed'].apply(lambda ua: ua.os.version[0] if ua.os.version else None)

    # 5. Thiết bị (Device Family/Brand)
    df['ua_device_family'] = df['ua_parsed'].apply(lambda ua: ua.device.family)
    df['ua_device_brand'] = df['ua_parsed'].apply(lambda ua: ua.device.brand)

    # 6-10. Các flag nhận diện thiết bị
    df['ua_is_bot'] = df['ua_parsed'].apply(lambda ua: int(ua.is_bot))
    df['ua_is_mobile'] = df['ua_parsed'].apply(lambda ua: int(ua.is_mobile))
    df['ua_is_tablet'] = df['ua_parsed'].apply(lambda ua: int(ua.is_tablet))
    df['ua_is_pc'] = df['ua_parsed'].apply(lambda ua: int(ua.is_pc))
    df['ua_is_touch_capable'] = df['ua_parsed'].apply(lambda ua: int(ua.is_touch_capable))

    # 11. Độ dài chuỗi User-Agent
    df['ua_length'] = df['user_agent'].astype(str).apply(len)

    # 12. Entropy của User-Agent
    df['ua_entropy'] = df['user_agent'].astype(str).apply(calculate_entropy)

    tools = ['sqlmap', 'curl', 'wget', 'nmap', 'nikto', 'fuzz', 'hydra']
    df['ua_is_tool'] = df['user_agent'].str.lower().apply(lambda ua: any(tool in ua for tool in tools)).astype(int)

    # Dọn bộ nhớ
    df.drop('ua_parsed', axis=1, inplace=True)

    return df

In [652]:
df = user_agent_features(df)
df.sample(5)

Unnamed: 0,ip,method,url,protocol,status,size,referrer,user_agent,label,hour_of_day,...,ua_device_family,ua_device_brand,ua_is_bot,ua_is_mobile,ua_is_tablet,ua_is_pc,ua_is_touch_capable,ua_length,ua_entropy,ua_is_tool
44196,172.16.0.196,POST,/dashboard,HTTP/1.1,500,2036,https://www.bing.com/,Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm),0,0,...,Spider,Spider,1,0,0,0,0,71,4.542884,0
84212,45.67.177.198,POST,/admin,HTTP/1.1,404,162,https://www.facebook.com/,Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm),0,5,...,Spider,Spider,1,0,0,0,0,71,4.542884,0
54457,198.51.100.114,POST,/admin,HTTP/1.1,200,972,https://www.google.com/search?q=news,"Mozilla/5.0 (Linux; Android {0}; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{1}.0.{2}.{3} Mobile Safari/537.36",0,20,...,Samsung SM-G973F,Samsung,0,1,0,0,1,123,5.225211,0
79740,192.168.1.164,GET,/wp-login.php,HTTP/1.1,200,570,-,"Mozilla/5.0 (Linux; Android {0}; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{1}.0.{2}.{3} Mobile Safari/537.36",0,21,...,Samsung SM-G973F,Samsung,0,1,0,0,1,123,5.225211,0
97484,10.0.0.160,GET,/products/530,HTTP/1.1,200,2349,https://www.facebook.com/,"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{0}.0.{1}.{2} Safari/537.36",0,7,...,Other,,0,0,0,1,0,115,5.226284,0


In [653]:
cols_to_show_ua = [
    'user_agent', 'ua_browser_family', 'ua_os_family', 'ua_device_brand',
    'ua_is_bot', 'ua_is_mobile', 'ua_is_pc', 'ua_length', 'ua_entropy', 'ua_'
]

print("\n--- Features từ User-Agent ---")
print(df[[col for col in cols_to_show_ua if col in df.columns]].head())


--- Features từ User-Agent ---
            user_agent ua_browser_family ua_os_family ua_device_brand  \
0  sqlmap/1.4.12.1#dev             Other        Other            None   
1  sqlmap/1.4.12.1#dev             Other        Other            None   
2  sqlmap/1.4.12.1#dev             Other        Other            None   
3  sqlmap/1.4.12.1#dev             Other        Other            None   
4  sqlmap/1.4.12.1#dev             Other        Other            None   

   ua_is_bot  ua_is_mobile  ua_is_pc  ua_length  ua_entropy  
0          0             0         0         19    3.747413  
1          0             0         0         19    3.747413  
2          0             0         0         19    3.747413  
3          0             0         0         19    3.747413  
4          0             0         0         19    3.747413  


In [654]:
def status_features(df):
    """
    Trích xuất các feature từ cột 'status' và 'size' trong log web.

    Args:
        df (pd.DataFrame): DataFrame chứa ít nhất 2 cột: 'status' (int), 'size' (int)

    Returns:
        pd.DataFrame: DataFrame gốc kèm thêm các cột đặc trưng mới.
    """
    # Kiểm tra cột trước
    if 'status' not in df.columns or 'size' not in df.columns:
        raise ValueError("DataFrame cần có cột 'status' và 'size'.")

    df = df.copy()
    df['status'] = pd.to_numeric(df['status'], errors='coerce').fillna(0).astype(int)
    df['size'] = pd.to_numeric(df['size'], errors='coerce').fillna(0).astype(int)
    # 1. 4xx - lỗi phía client
    df['status_is_client_error'] = df['status'].apply(lambda x: 1 if 400 <= x < 500 else 0)

    # 2. 5xx - lỗi phía server
    df['status_is_server_error'] = df['status'].apply(lambda x: 1 if 500 <= x < 600 else 0)

    # 3. Lỗi nói chung
    df['status_is_error'] = ((df['status_is_client_error'] == 1) | (df['status_is_server_error'] == 1)).astype(int)

    # 4. Thành công (2xx)
    df['status_is_success'] = df['status'].apply(lambda x: 1 if 200 <= x < 300 else 0)

    # 5. Redirect (3xx)
    df['status_is_redirect'] = df['status'].apply(lambda x: 1 if 300 <= x < 400 else 0)

    # 6. Response size bằng 0
    df['size_is_zero'] = df['size'].apply(lambda x: 1 if x == 0 else 0)

    return df


In [655]:
df = status_features(df)
df.sample(5)

Unnamed: 0,ip,method,url,protocol,status,size,referrer,user_agent,label,hour_of_day,...,ua_is_touch_capable,ua_length,ua_entropy,ua_is_tool,status_is_client_error,status_is_server_error,status_is_error,status_is_success,status_is_redirect,size_is_zero
46618,45.67.189.226,POST,/robots.txt,HTTP/1.1,200,1328,-,"Mozilla/5.0 (iPhone; CPU iPhone OS {0}_{1} like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{2}.0 Mobile/15E148 Safari/604.1",0,5,...,1,139,5.126522,0,0,0,0,1,0,0
73761,45.67.23.34,GET,/admin,HTTP/1.1,200,3097,https://www.google.com/search?q=tech,Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm),0,9,...,0,71,4.542884,0,0,0,0,1,0,0
65482,10.0.0.220,GET,/admin/login,HTTP/1.1,500,735,https://www.google.com/,"Mozilla/5.0 (Linux; Android {0}; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{1}.0.{2}.{3} Mobile Safari/537.36",0,17,...,1,123,5.225211,0,0,1,1,0,0,0
70688,203.0.113.29,GET,/contact,HTTP/1.1,200,4678,-,Twitterbot/1.0,0,3,...,0,14,3.46772,0,0,0,0,1,0,0
65848,198.51.100.83,POST,/home,HTTP/1.1,200,2600,https://www.linkedin.com/,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html),0,18,...,0,72,4.45746,0,0,0,0,1,0,0


In [656]:
from urllib.parse import urlparse

def referrer_features(df):
    df = df.copy()
    
    df['referrer_len'] = df['referrer'].astype(str).apply(len)
    df['referrer_entropy'] = df['referrer'].astype(str).apply(calculate_entropy)
    df['referrer_is_empty'] = df['referrer'].apply(lambda x: 1 if x == '-' or pd.isna(x) or str(x).strip() == '' else 0)

    def get_referrer_domain(ref):
        ref_str = str(ref)
        if pd.isna(ref) or ref_str == '-' or not ref_str.startswith('http'):
            return 'none'
        try:
            parsed = urlparse(ref_str)
            return parsed.netloc.lower() if parsed.netloc else 'unknown_format_but_not_empty'
        except:
            return 'parse_error'

    df['referrer_domain'] = df['referrer'].apply(get_referrer_domain)
    df['referrer_is_external_or_valid'] = df['referrer_domain'].apply(
        lambda x: 0 if x in ['none', 'parse_error', 'unknown_format_but_not_empty'] else 1
    )

    return df


In [657]:
df = referrer_features(df)
df.sample(3)

Unnamed: 0,ip,method,url,protocol,status,size,referrer,user_agent,label,hour_of_day,...,status_is_server_error,status_is_error,status_is_success,status_is_redirect,size_is_zero,referrer_len,referrer_entropy,referrer_is_empty,referrer_domain,referrer_is_external_or_valid
92250,198.51.100.56,GET,/css/style.css,HTTP/1.1,200,840,https://twitter.com/,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{0}_{1}) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{2}.0 Safari/605.1.15",0,21,...,0,0,1,0,0,20,3.503702,0,twitter.com,1
33003,45.67.140.13,POST,/products/317,HTTP/1.1,200,348,https://example.com/blog/news,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html),0,3,...,0,0,1,0,0,29,4.004364,0,example.com,1
52540,45.67.0.20,POST,/user/144,HTTP/1.1,200,1996,https://twitter.com/,"Mozilla/5.0 (iPhone; CPU iPhone OS {0}_{1} like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{2}.0 Mobile/15E148 Safari/604.1",0,16,...,0,0,1,0,0,20,3.503702,0,twitter.com,1


In [658]:
# def behavior_features(df):
#     df = df.copy()

#     df['ip_request_count_total'] = df.groupby('ip')['ip'].transform('count')
#     df['ip_error_rate'] = df.groupby('ip')['status_is_error'].transform('mean')
#     df['ip_avg_query_count'] = df.groupby('ip')['url_query_count'].transform('mean')

#     return df


In [659]:
# df = behavior_features(df)
# df.sample(3)

## Data preprocessing 2

In [660]:
len(num_col)

19

In [661]:
len(cat_col)

7

In [662]:
cat_col

['method',
 'protocol',
 'part_of_day',
 'ua_browser_family',
 'ua_os_family',
 'ua_device_family',
 'referrer_domain']

In [663]:
# Đã trích xuất nên drop
#df.drop(columns=["ip", "url", "referrer"], inplace=True)
df.drop(columns=['user_agent'], inplace=True)

In [664]:
df.drop(columns=["ip", "url", "referrer"], inplace=True)


In [665]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104933 entries, 0 to 104932
Data columns (total 40 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   method                         104933 non-null  object 
 1   protocol                       104933 non-null  object 
 2   status                         104933 non-null  int64  
 3   size                           104933 non-null  int64  
 4   label                          104933 non-null  int64  
 5   hour_of_day                    104933 non-null  int32  
 6   day_of_week                    104933 non-null  int32  
 7   is_weekend                     104933 non-null  int64  
 8   part_of_day                    104933 non-null  object 
 9   hour_sin                       104933 non-null  float64
 10  hour_cos                       104933 non-null  float64
 11  day_of_week_sin                104933 non-null  float64
 12  day_of_week_cos               

In [666]:
df.isna().sum()/len(df)

method                           0.000000
protocol                         0.000000
status                           0.000000
size                             0.000000
label                            0.000000
hour_of_day                      0.000000
day_of_week                      0.000000
is_weekend                       0.000000
part_of_day                      0.000000
hour_sin                         0.000000
hour_cos                         0.000000
day_of_week_sin                  0.000000
day_of_week_cos                  0.000000
time_since_last_event            0.000000
is_suspicious                    0.000000
ua_browser_family                0.000000
ua_browser_version_major         0.761734
ua_os_family                     0.000000
ua_os_version_major              0.745590
ua_device_family                 0.000000
ua_device_brand                  0.285906
ua_is_bot                        0.000000
ua_is_mobile                     0.000000
ua_is_tablet                     0

In [667]:
df.drop(columns=['ua_device_brand', 'ua_browser_version_major', 'ua_os_version_major'], inplace=True)

In [668]:

cat_col = df.select_dtypes(include=["object", "category"]).columns.tolist()

num_col = df.select_dtypes(include=["number"]).columns.tolist()


In [670]:
# Cell 7: Áp dụng Feature Engineering
print("Processing Train set...")
train_featured = timestamp_features(train_df)
train_featured['is_suspicious'] = train_featured['url'].apply(final_url_analyzer)
train_featured = user_agent_features(train_featured)
train_featured = status_features(train_featured)
train_featured = referrer_features(train_featured)
# ... nếu có behavior_features, bạn phải fit trên train và transform cả hai ...


print("Processing Test set...")
test_featured = timestamp_features(test_df)
test_featured['is_suspicious'] = test_featured['url'].apply(final_url_analyzer)
test_featured = user_agent_features(test_featured)
test_featured = status_features(test_featured)
test_featured = referrer_features(test_featured)

Processing Train set...
Processing Test set...


In [671]:
df.isna().sum()/len(df)

method                           0.0
protocol                         0.0
status                           0.0
size                             0.0
label                            0.0
hour_of_day                      0.0
day_of_week                      0.0
is_weekend                       0.0
part_of_day                      0.0
hour_sin                         0.0
hour_cos                         0.0
day_of_week_sin                  0.0
day_of_week_cos                  0.0
time_since_last_event            0.0
is_suspicious                    0.0
ua_browser_family                0.0
ua_os_family                     0.0
ua_device_family                 0.0
ua_is_bot                        0.0
ua_is_mobile                     0.0
ua_is_tablet                     0.0
ua_is_pc                         0.0
ua_is_touch_capable              0.0
ua_length                        0.0
ua_entropy                       0.0
ua_is_tool                       0.0
status_is_client_error           0.0
s

In [None]:
df = df[~df['label'].isna()]

In [672]:
# Cell 8: Chuẩn bị X, y
TARGET = 'label'

# Dọn dẹp các cột không cần thiết cho model
cols_to_drop = [
    'ip', 'url', 'referrer', 'user_agent', 
    'ua_browser_version_major', 'ua_os_version_major', 'ua_device_brand' # các cột có nhiều NaN bạn đã xóa
]

X_train = train_featured.drop(columns=[TARGET] + [col for col in cols_to_drop if col in train_featured.columns])
y_train = train_featured[TARGET]

X_test = test_featured.drop(columns=[TARGET] + [col for col in cols_to_drop if col in test_featured.columns])
y_test = test_featured[TARGET]

# Đảm bảo các cột trong X_train và X_test khớp nhau
X_test = X_test[X_train.columns]

# Cell 9: Huấn luyện CatBoost (giống như code của bạn)
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report

cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

model = CatBoostClassifier(
    verbose=100, 
    random_state=42,
    # Thêm các tham số chống overfitting nếu cần
    # auto_class_weights='Balanced', # Thử cái này nếu dữ liệu mất cân bằng
    # early_stopping_rounds=50 
)

# Để dùng early_stopping_rounds, bạn cần fit với eval_set
# model.fit(train_pool, eval_set=test_pool) 
model.fit(train_pool) # Hoặc fit như cũ

# Đánh giá
y_pred = model.predict(test_pool)
print(classification_report(y_test, y_pred))

Learning rate set to 0.06831
0:	learn: 0.5196565	total: 102ms	remaining: 1m 41s
100:	learn: 0.0000584	total: 7.72s	remaining: 1m 8s
200:	learn: 0.0000485	total: 14.3s	remaining: 57s
300:	learn: 0.0000484	total: 18.9s	remaining: 44s
400:	learn: 0.0000484	total: 24s	remaining: 35.8s
500:	learn: 0.0000484	total: 28.8s	remaining: 28.7s
600:	learn: 0.0000484	total: 33.2s	remaining: 22s
700:	learn: 0.0000484	total: 37.7s	remaining: 16.1s
800:	learn: 0.0000484	total: 42.1s	remaining: 10.5s
900:	learn: 0.0000484	total: 46.6s	remaining: 5.12s
999:	learn: 0.0000484	total: 51s	remaining: 0us
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       1.00      1.00      1.00       987

    accuracy                           1.00     20987
   macro avg       1.00      1.00      1.00     20987
weighted avg       1.00      1.00      1.00     20987



In [None]:
df.columns