In [None]:
# Collect the paths of all upstream log files
from typing import List
from pathlib import Path
from common import UPSTREAM_DIR

# Everything in UPSTREAM_DIR matched by the '*.log' pattern, kept in the
# order glob yields it.
upstream_files: List[Path] = list(UPSTREAM_DIR.glob('*.log'))

# Preview the first few paths (displayed when run as a notebook cell).
upstream_files[:3]

In [None]:
# Load the IP-to-geolocation database
from IP2Location import IP2Location
from common import BASE_DIR

# The BIN database file is expected under <BASE_DIR>/lib.
db_path = BASE_DIR / 'lib' / 'IP2LOCATION-LITE-DB11.BIN'
ip2location = IP2Location(db_path)

In [None]:
# Parse the upstream logs into one record (dict) per matching line
import re
import pprint

# Capture groups, in order: timestamp (digits/slashes, whitespace,
# digits/colons/dots), ip, port, then the rest of the line split into
# client / username / password, each optionally preceded by whitespace.
pattern = re.compile(r'^([\d/]+\s[\d:.]+)\s([\d.]+):(\d+)\s?(.*?)\s?(\S*)\s?(\S*)$')

# Placeholder geolocation values used when a lookup fails.
_GEO_UNKNOWN = {
    'country_code': 'N/A',
    'country_name': 'N/A',
    'region': 'N/A',
    'city': 'N/A',
    'zipcode': 'N/A',
    'timezone': 'N/A',
}

# ip -> geolocation fields; an IP that appears on several lines is only
# looked up (and, on failure, only reported) once.
_geo_cache = {}


def _lookup_geo(ip):
    """Return the geolocation fields for *ip* as a dict.

    The result of ``ip2location.get_all(ip)`` is mapped onto the output
    column names. If the lookup or any attribute access raises, the
    exception is printed and every field is set to 'N/A' so that one bad
    address does not abort the whole run (best-effort by design).
    """
    if ip not in _geo_cache:
        try:
            record = ip2location.get_all(ip)
            geo = {
                'country_code': record.country_short,
                'country_name': record.country_long,
                'region': record.region,
                'city': record.city,
                'zipcode': record.zipcode,
                'timezone': record.timezone,
            }
        except Exception as e:
            # Lookup failed: report it and fall back to placeholders.
            pprint.pp(e)
            geo = _GEO_UNKNOWN
        _geo_cache[ip] = geo
    return _geo_cache[ip]


all_data = []
for upstream_file in upstream_files:
    with open(upstream_file, 'r', encoding='utf-8') as f:
        # Iterate the file object directly so lines are streamed instead of
        # loading the whole log into memory with readlines().
        for line in f:
            match = pattern.match(line)
            if match is None:
                # Lines that do not fit the expected layout are skipped.
                continue

            timestamp, ip, port, client, username, password = match.groups()
            all_data.append({
                'timestamp': timestamp,
                'ip': ip,
                'port': port,
                'client': client,
                'username': username,
                'password': password,
                # Unpacking copies the cached fields into this record.
                **_lookup_geo(ip),
            })

# Preview the first few parsed records (displayed when run as a notebook cell).
all_data[:3]

In [None]:
# Write the parsed records to a CSV file
import csv
from common import DOWNSTREAM_DIR

# Column order of the output file: connection info, then the geolocation
# columns, then the client / credential columns.
fieldnames = [
    'timestamp',
    'ip',
    'port',
    'country_code',
    'country_name',
    'region',
    'city',
    'zipcode',
    'timezone',
    'client',
    'username',
    'password',
]
csv_path = DOWNSTREAM_DIR / "output_4.csv"

# newline='' leaves line-ending handling to the csv module.
with open(csv_path, 'w', newline='', encoding='utf-8') as out_file:
    csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(all_data)
