# 로그 파일 생성
- 다양한 형태의 로그 파일(JSON, CSV, 텍스트)을 파이썬으로 읽어와 분석하는 예제

In [1]:
import random
from datetime import datetime, timedelta, timezone
import pathlib

# -------- Helpers --------
# Value pools that the generators below sample from (one uniform draw each).

# HTTP request methods.
METHODS = [
    "GET",
    "POST",
    "PUT",
    "DELETE",
    "PATCH",
]

# Request targets; one entry already carries a query string.
PATHS = [
    "/",
    "/index.html",
    "/login",
    "/logout",
    "/api/v1/items",
    "/api/v1/items/42",
    "/search?q=python",
    "/static/app.js",
    "/static/style.css",
    "/healthz",
]

# 200 is listed three times so a uniform draw picks it more often.
STATUSES = [200] * 3 + [201, 301, 302, 400, 401, 403, 404, 500, 502, 503]

# Every integer from 128 through 5242 (the upper bound of range is exclusive).
SIZES = [*range(128, 5243)]

# Referer header values; "-" stands for "no referer".
REFERRERS = [
    "-",
    "https://www.google.com/",
    "https://www.bing.com/",
    "https://example.com/landing",
    "https://twitter.com/someone/status/1",
]

# User-Agent header values (adjacent string literals are concatenated).
UAS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.3 Safari/605.1.15"
    ),
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "curl/8.6.0",
    (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.4 Mobile/15E148 Safari/604.1"
    ),
]

# Client addresses.
IPS = [
    "192.168.0.10",
    "10.0.0.5",
    "172.16.1.23",
    "203.0.113.45",
    "198.51.100.77",
    "8.8.8.8",
    "52.79.152.10",
    "211.231.99.142",
]

# Virtual host names used by the Nginx-style lines.
VHOSTS = [
    "example.com",
    "shop.example.com",
    "api.example.com",
]

# English month abbreviations, indexed by month - 1. Access-log timestamps
# always use these, whereas strftime("%b") follows the process locale.
_MONTH_ABBR = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)


def random_timestamp(now=None, max_age_seconds=172800):
    """Return a random recent timestamp in Apache/Nginx access-log format.

    The result looks like ``14/Aug/2025:09:15:31 +0000``.

    Args:
        now: Reference datetime the random offset is subtracted from.
            Defaults to the current time in UTC. Pass a timezone-aware
            value; a naive datetime yields an empty UTC-offset field.
        max_age_seconds: Largest offset, in whole seconds, that may be
            subtracted from ``now``. The default of 172800 is 48 hours.

    Returns:
        The formatted timestamp string.

    Raises:
        ValueError: If ``max_age_seconds`` is negative (raised by
            ``random.randint``).
    """
    if now is None:
        now = datetime.now(timezone.utc)
    offset = timedelta(seconds=random.randint(0, max_age_seconds))
    ts = now - offset
    # Only numeric directives go through strftime; the month name comes from
    # the fixed table so the output does not change with LC_TIME.
    day = ts.strftime("%d")
    rest = ts.strftime("%Y:%H:%M:%S %z")
    return f"{day}/{_MONTH_ABBR[ts.month - 1]}/{rest}"

def pick(items):
    """Return one element of ``items`` selected by ``random.choice``."""
    chosen = random.choice(items)
    return chosen

def random_query_path():
    """Return a random request path, sometimes with tracking parameters.

    A path is drawn from ``PATHS``. If it has no query string yet, then with
    probability 0.25 a ``utm_source`` / ``utm_campaign`` query is appended.
    """
    base = pick(PATHS)
    # Paths that already carry a query string are returned untouched.
    if "?" in base:
        return base
    # Three times out of four the plain path is kept.
    if random.random() >= 0.25:
        return base
    source = random.choice(['google', 'newsletter', 'twitter'])
    campaign = random.randint(1, 9)
    return f"{base}?utm_source={source}&utm_campaign={campaign}"

def gen_apache_combined():
    """Build one random log line in the Apache combined log format.

    Field layout:
    %h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-agent}i"

    Returns:
        The log line as a string, without a trailing newline.
    """
    # Draw every field first, in a fixed order, then assemble the line.
    client = pick(IPS)
    remote_user = pick(["-", "alice", "bob", "svc-api"])
    stamp = random_timestamp()
    verb = pick(METHODS)
    target = random_query_path()
    version = pick(["HTTP/1.1", "HTTP/2.0"])
    code = pick(STATUSES)
    nbytes = pick(SIZES)
    ref = pick(REFERRERS)
    agent = pick(UAS)

    fields = [
        client,
        "-",  # identd (%l) is always a dash
        remote_user,
        f"[{stamp}]",
        f'"{verb} {target} {version}"',
        str(code),
        str(nbytes),
        f'"{ref}"',
        f'"{agent}"',
    ]
    return " ".join(fields)

def gen_nginx_combined():
    """Build one random Nginx combined-style log line.

    The line extends the combined layout with the virtual host and the
    request time:
    $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent
    "$http_referer" "$http_user_agent" "$host" $request_time

    Returns:
        The log line as a string, without a trailing newline.
    """
    # Draw every field first, in a fixed order, then assemble the line.
    client = pick(IPS)
    remote_user = pick(["-", "carol", "dave", "svc-web"])
    stamp = random_timestamp()
    verb = pick(METHODS)
    target = random_query_path()
    version = pick(["HTTP/1.1", "HTTP/2.0"])
    code = pick(STATUSES)
    nbytes = pick(SIZES)
    ref = pick(REFERRERS)
    agent = pick(UAS)
    vhost = pick(VHOSTS)
    # Request time in seconds, rendered with three decimal places.
    elapsed = format(random.uniform(0.001, 3.5), ".3f")

    fields = [
        client,
        "-",
        remote_user,
        f"[{stamp}]",
        f'"{verb} {target} {version}"',
        str(code),
        str(nbytes),
        f'"{ref}"',
        f'"{agent}"',
        f'"{vhost}"',
        elapsed,
    ]
    return " ".join(fields)

def gen_malformed():
    """Return one intentionally broken or partial log line.

    These lines exist to exercise parser robustness. All candidates are
    built on every call and one of them is then chosen at random.
    """
    # No recognisable fields at all.
    no_fields = "BROKEN LINE WITHOUT FIELDS"
    # Status and size are dashes instead of numbers.
    missing_numbers = (
        f'{pick(IPS)} - - [{random_timestamp()}] "GET /incomplete HTTP/1.1" - - "-" "-"'
    )
    # The bracketed timestamp is not a real date.
    bad_time = f'{pick(IPS)} - - [bad/time/stamp] "POST /api HTTP/1.1" 200 123 "-" "-"'
    # The line starts with the user agent.
    ua_first = f'"{pick(UAS)}" {pick(STATUSES)}'
    # An empty line.
    blank = ""

    return pick([no_fields, missing_numbers, bad_time, ua_first, blank])

# -------- Generate file --------
# Create the output directory (taken from the target path) if it is missing.
out_path = pathlib.Path("./data/sample.log")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Number of lines to emit per flavour.
n_apache, n_nginx, n_bad = 300, 300, 20

# Apache lines are written first, then Nginx, then the malformed ones.
with out_path.open("w", encoding="utf-8") as f:
    f.writelines(gen_apache_combined() + "\n" for _ in range(n_apache))
    f.writelines(gen_nginx_combined() + "\n" for _ in range(n_nginx))
    f.writelines(gen_malformed() + "\n" for _ in range(n_bad))

print(f"Generated {n_apache} Apache logs, {n_nginx} Nginx logs, and {n_bad} malformed logs.")
print(f"File saved to: {out_path.resolve()}")
print("\n--- Preview (first 10 lines) ---")

# Echo the first ten lines, stripped of surrounding whitespace.
try:
    with out_path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i < 10:
                print(line.strip())
            else:
                break
except FileNotFoundError:
    print("Error: sample.log not found.")

Generated 300 Apache logs, 300 Nginx logs, and 20 malformed logs.
File saved to: /home/evan/de_etl_book_tutorial/ch07/data/sample.log

--- Preview (first 10 lines) ---
52.79.152.10 - - [16/Aug/2025:09:46:08 +0000] "POST / HTTP/2.0" 301 763 "https://www.bing.com/" "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Mobile/15E148 Safari/604.1"
198.51.100.77 - - [16/Aug/2025:06:15:55 +0000] "POST /login HTTP/1.1" 401 2642 "https://www.bing.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
172.16.1.23 - bob [17/Aug/2025:01:57:59 +0000] "GET /index.html HTTP/2.0" 503 2794 "https://www.bing.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15"
52.79.152.10 - - [18/Aug/2025:03:46:09 +0000] "GET /login HTTP/2.0" 503 3825 "https://www.google.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb