### 필요한 모듈 가져오기

In [None]:
import pandas as pd
import numpy as np
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import joblib

## TODO: 필요한 라이브러리 추가
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier  # !pip install xgboost 먼저 한 번만 실행
from scipy.stats import entropy





### seed 값 고정

In [2]:
# Fix the random seeds so that training and evaluation are reproducible
# when no seed has been set elsewhere.
## TODO: when using torch, tensorflow, etc., set the same SEED value there too

SEED = 42
random.seed(SEED)      # Python's built-in RNG
np.random.seed(SEED)   # NumPy's global RNG

### 데이터셋 불러오기

In [3]:
# Load the training flows (including the 'label' column) from the working directory.
df = pd.read_csv('./train.csv')

In [4]:
# Preview the first rows of the training data.
df.head()

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,protocol,num_packets,time_sequence,length_sequence,direction_sequence,label
0,150.242.169.98,51776,35.244.247.133,443,TCP,25,"[1625572258.480587, 1625572258.480702, 1625572...","[1470, 1470, 610, 40, 104, 210, 390, 454, 620,...","[-1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, ...",38
1,172.16.30.110,49294,93.158.134.119,443,TCP,107,"[1607051927.782215, 1607051927.782324, 1607051...","[1450, 1450, 1450, 221, 52, 52, 52, 52, 132, 1...","[-1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -...",40
2,192.168.50.107,55750,38.121.72.166,443,TCP,213,"[1611731487.806609, 1611731487.806611, 1611731...","[1500, 636, 1151, 52, 52, 116, 52, 98, 95, 87,...","[-1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, -1, -1,...",57
3,150.242.169.99,50485,59.108.138.228,443,TCP,12,"[1612662987.601396, 1612662987.601527, 1612662...","[1500, 1500, 315, 40, 120, 311, 311, 102, 40, ...","[-1, -1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1]",29
4,150.242.169.100,61777,109.244.244.164,443,TCP,18,"[1621999755.048606, 1621999755.048879, 1621999...","[1472, 1472, 1272, 1095, 40, 40, 40, 40, 120, ...","[-1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, -1,...",70


### 전처리

In [5]:
# Decode the JSON-encoded sequence columns into Python lists, one column at a time.
for seq_col in ('length_sequence', 'direction_sequence', 'time_sequence'):
    df[seq_col] = df[seq_col].apply(json.loads)

In [None]:
import numpy as np


# Number of leading packet lengths copied verbatim to the front of each vector.
_BASE_PACKETS = 20
# Sentinel used to pad the base slots of flows with fewer than _BASE_PACKETS packets.
_PAD_LENGTH = 3000
# Number of statistics appended after the base slots. This must equal the
# number of entries in the `stats` list built inside preprocess(); the
# empty-flow vector relies on it to have the same width as every other row.
_NUM_STATS = 51


def _max_run(values, target):
    """Return the length of the longest run of consecutive `target` items in `values`."""
    run = best = 0
    for value in values:
        if value == target:
            run += 1
            if run > best:
                best = run
        else:
            run = 0
    return best


def preprocess(df):
    """Turn every flow (row) of `df` into a fixed-width numeric feature vector.

    Args:
        df: DataFrame with the columns
            - 'length_sequence'    : list/array of packet lengths
            - 'direction_sequence' : list/array of directions (1 or -1)
            - 'time_sequence'      : list/array of timestamps, expected to be
              in ascending order (not checked here)

    Returns:
        A list with one feature vector per row, in row order. Every vector has
        ``_BASE_PACKETS + _NUM_STATS`` (= 71) entries:

        - the first 20 packet lengths, padded with 3000 when the flow is shorter;
        - 51 statistics over packet lengths, directions and inter-arrival
          times, in the order of the `stats` list below.

        A flow with no packets yields the padded base followed by zeros, with
        the same width as every other row.
    """
    inputs = []
    flows = zip(df['length_sequence'], df['direction_sequence'], df['time_sequence'])

    for raw_len, raw_dir, raw_time in flows:
        # Force plain lists so that np.ndarray cells behave like list cells.
        seq_len = list(raw_len)
        seq_dir = list(raw_dir)
        seq_time = list(raw_time)

        total = len(seq_len)

        # Empty flow: no statistic is defined, so emit an all-zero stat block.
        # The width must match the regular rows, otherwise the feature matrix
        # becomes ragged and the model cannot be fitted.
        if total == 0:
            inputs.append([_PAD_LENGTH] * _BASE_PACKETS + [0.0] * _NUM_STATS)
            continue

        # ------------------------------
        # 1) base: first 20 packet lengths, padded
        # ------------------------------
        base = seq_len[:_BASE_PACKETS] + [_PAD_LENGTH] * max(0, _BASE_PACKETS - total)

        # ------------------------------
        # 2) length statistics
        # ------------------------------
        seq_len_arr = np.array(seq_len, dtype=float)

        avg_len = float(np.mean(seq_len_arr))
        std_len = float(np.std(seq_len_arr))
        max_len = float(np.max(seq_len_arr))
        min_len = float(np.min(seq_len_arr))
        med_len = float(np.median(seq_len_arr))
        p25_len = float(np.percentile(seq_len_arr, 25))
        p75_len = float(np.percentile(seq_len_arr, 75))
        range_len = float(max_len - min_len)

        # Differences between consecutive packet lengths.
        if total > 1:
            diffs = np.diff(seq_len_arr)
            len_diff_mean = float(np.mean(diffs))
            len_diff_std = float(np.std(diffs))
            len_diff_max = float(np.max(diffs))
            len_diff_min = float(np.min(diffs))

            # Share of steps where the length grows / shrinks.
            pos_ratio = float(np.sum(diffs > 0) / len(diffs))
            neg_ratio = float(np.sum(diffs < 0) / len(diffs))
        else:
            len_diff_mean = len_diff_std = len_diff_max = len_diff_min = 0.0
            pos_ratio = neg_ratio = 0.0

        # Share of packets per size bucket: <100, [100, 1000), >=1000.
        ratio_small = sum(n < 100 for n in seq_len) / total
        ratio_medium = sum(100 <= n < 1000 for n in seq_len) / total
        ratio_large = sum(n >= 1000 for n in seq_len) / total

        # ------------------------------
        # 3) direction features
        # ------------------------------
        # Count explicitly instead of assuming every value is 1 or -1.
        out_count = sum(1 for d in seq_dir if d == 1)
        in_count = sum(1 for d in seq_dir if d == -1)

        outbound_ratio = out_count / total
        inbound_ratio = in_count / total

        # Pair neighbours by slicing rather than indexing so that a direction
        # list shorter than the length list cannot raise IndexError.
        direction_changes = sum(
            1 for prev, nxt in zip(seq_dir[:total], seq_dir[1:total])
            if prev != nxt
        )

        max_out_run = _max_run(seq_dir, 1)
        max_in_run = _max_run(seq_dir, -1)

        # Length-normalised variants of the direction features.
        if total > 1:
            dir_change_ratio = direction_changes / (total - 1 + 1e-6)
        else:
            dir_change_ratio = 0.0

        max_out_run_ratio = max_out_run / (total + 1e-6)
        max_in_run_ratio = max_in_run / (total + 1e-6)

        # Packet-length statistics split by direction.
        out_lens = [n for n, d in zip(seq_len, seq_dir) if d == 1]
        in_lens = [n for n, d in zip(seq_len, seq_dir) if d == -1]

        if out_lens:
            out_lens_arr = np.array(out_lens, dtype=float)
            avg_out_len = float(np.mean(out_lens_arr))
            std_out_len = float(np.std(out_lens_arr))
        else:
            avg_out_len = std_out_len = 0.0

        if in_lens:
            in_lens_arr = np.array(in_lens, dtype=float)
            avg_in_len = float(np.mean(in_lens_arr))
            std_in_len = float(np.std(in_lens_arr))
        else:
            avg_in_len = std_in_len = 0.0

        # ------------------------------
        # 4) inter-arrival times (IAT) and duration
        # ------------------------------
        if len(seq_time) > 1:
            seq_time_arr = np.array(seq_time, dtype=float)
            iat = np.diff(seq_time_arr)

            avg_iat = float(np.mean(iat))
            std_iat = float(np.std(iat))
            max_iat = float(np.max(iat))
            min_iat = float(np.min(iat))
            median_iat = float(np.median(iat))
            p75_iat = float(np.percentile(iat, 75))

            # Skewness approximation from mean, median and std.
            iat_skew = float(3 * (avg_iat - median_iat) / (std_iat + 1e-6))

            # Correlation between packet length and the following IAT.
            n_iat = len(iat)
            if n_iat > 1 and n_iat in (len(seq_len_arr), len(seq_len_arr) - 1):
                # A constant input gives a zero std and therefore NaN; silence
                # the warning because that case is mapped to 0.0 right below.
                with np.errstate(divide='ignore', invalid='ignore'):
                    corr = float(np.corrcoef(seq_len_arr[:n_iat], iat)[0, 1])
                if np.isnan(corr):
                    corr = 0.0
            else:
                corr = 0.0

            # Number of IATs more than two standard deviations above the mean.
            iat_peaks = int(np.sum(iat > (avg_iat + 2 * std_iat)))

            # Log-scaled variants to compress the long tail.
            log_avg_iat = float(np.log1p(max(avg_iat, 0.0)))
            log_max_iat = float(np.log1p(max(max_iat, 0.0)))

            duration = float(seq_time[-1] - seq_time[0])
        else:
            avg_iat = std_iat = max_iat = min_iat = median_iat = p75_iat = 0.0
            iat_skew = 0.0
            corr = 0.0
            iat_peaks = 0
            log_avg_iat = 0.0
            log_max_iat = 0.0
            duration = 0.0

        log_duration = float(np.log1p(max(duration, 0.0)))

        # ------------------------------
        # 5) window-based length features
        # ------------------------------
        if total >= 5:
            first_mean = float(np.mean(seq_len_arr[:5]))
            last_mean = float(np.mean(seq_len_arr[-5:]))
        else:
            first_mean = last_mean = avg_len

        # The middle third is only used for flows of 3+ packets; shorter flows
        # fall back to the overall mean so the slice can never be empty (NaN).
        start_idx = total // 3
        end_idx = 2 * total // 3
        if total >= 3 and end_idx > start_idx:
            middle_mean = float(np.mean(seq_len_arr[start_idx:end_idx]))
        else:
            middle_mean = avg_len

        # ------------------------------
        # 6) packet count
        # ------------------------------
        total_packets = total

        # ------------------------------
        # 7) miscellaneous features
        # ------------------------------
        # Entropy of the packet-length distribution.
        _, counts = np.unique(seq_len_arr, return_counts=True)
        length_entropy = float(entropy(counts))

        # Normalised variability (coefficient of variation).
        burst_score = float(std_len / (avg_len + 1e-6))

        # First / last packet size.
        first_len = float(seq_len[0])
        last_len = float(seq_len[-1])

        # Position of the mean inside the [min, max] range.
        len_norm_ratio = float((avg_len - min_len) / (range_len + 1e-6))

        # ------------------------------
        # Final vector: the order below is the model's feature order.
        # Keep _NUM_STATS in sync when adding or removing entries.
        # ------------------------------
        stats = [
            # --- basic length statistics ---
            avg_len, std_len, max_len, min_len, med_len,
            p25_len, p75_len, range_len,
            # --- length diffs ---
            len_diff_mean, len_diff_std, len_diff_max, len_diff_min,
            # --- length bucket ratios ---
            ratio_small, ratio_medium, ratio_large,
            # --- direction ---
            outbound_ratio, inbound_ratio, direction_changes,
            max_out_run, max_in_run,
            # --- IAT ---
            avg_iat, std_iat, max_iat, min_iat, median_iat, p75_iat,
            # --- duration & packet count ---
            duration,
            total_packets,
            # --- window-based length ---
            first_mean, middle_mean, last_mean,
            # --- extra IAT statistics ---
            iat_skew, corr, iat_peaks,
            # --- miscellaneous ---
            length_entropy, burst_score,
            first_len, last_len,
            pos_ratio, neg_ratio,
            len_norm_ratio,
            # --- normalised / per-direction / log-scaled additions ---
            dir_change_ratio,
            max_out_run_ratio, max_in_run_ratio,
            avg_out_len, std_out_len,
            avg_in_len, std_in_len,
            log_duration,
            log_avg_iat, log_max_iat,
        ]

        inputs.append(base + stats)

    return inputs


In [None]:
# Store the preprocessed feature vectors in a new 'input' column.
df['input'] = preprocess(df)

### 학습, 검증 데이터 나누기

In [8]:
# Hold out 20% of the rows for validation and hand the model plain Python
# lists: each part returned by the split is converted with .tolist().
X_train, X_valid, y_train, y_valid = (
    part.tolist()
    for part in train_test_split(
        df['input'],
        df['label'],
        test_size=0.2,
        random_state=SEED,
    )
)

### 모델 학습

In [None]:
# === Base learners ===
# Hyperparameters are collected in dicts so each model's configuration can be
# read (and tweaked) in one place.

rf_params = dict(
    n_estimators=800,
    max_depth=40,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    n_jobs=-1,
    random_state=SEED,
)
rf_model = RandomForestClassifier(**rf_params)

xgb_params = dict(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.8,
    objective='multi:softprob',
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=SEED,
)
xgb_model = XGBClassifier(**xgb_params)

# === Soft-voting ensemble ===
# Both learners get the same weight in the vote.
base_estimators = [('rf', rf_model), ('xgb', xgb_model)]
model = VotingClassifier(estimators=base_estimators, voting='soft', weights=[1, 1])

# Train the ensemble
model.fit(X_train, y_train)



### 모델 검증

In [10]:
# Score the ensemble on the held-out validation split.
pred = model.predict(X_valid)
valid_acc = accuracy_score(y_valid, pred)
valid_f1 = f1_score(y_valid, pred, average='macro')
print(f"Accuracy: {valid_acc:.4f}")
print(f"F1 Score: {valid_f1:.4f}")

Accuracy: 0.9127
F1 Score: 0.9122


In [11]:
# Score the ensemble on its own training data; compare with the validation
# scores to gauge over-fitting.
train_pred = model.predict(X_train)
train_acc = accuracy_score(y_train, train_pred)
train_f1 = f1_score(y_train, train_pred, average='macro')
print("Train Accuracy:", train_acc)
print("Train F1:", train_f1)

Train Accuracy: 1.0
Train F1: 1.0


### 제출 파일 생성

In [None]:
# Load the test flows from the working directory.
test_df = pd.read_csv('./testcase.csv')

In [None]:
# Preview the first rows of the test data.
test_df.head()

Unnamed: 0,src_ip,src_port,dst_ip,dst_port,protocol,num_packets,time_sequence,length_sequence,direction_sequence
0,192.168.50.244,56819,192.0.77.32,443,TCP,27,"[1617178697.800111, 1617178697.800114, 1617178...","[1500, 248, 1500, 1500, 40, 40, 1040, 40, 160,...","[-1, -1, -1, -1, 1, 1, -1, 1, -1, 1, 1, 1, -1,..."
1,150.242.169.98,61627,99.86.202.24,443,TCP,21,"[1615726680.921434, 1615726680.921536, 1615726...","[1280, 1280, 1280, 1280, 1280, 772, 772, 52, 5...","[-1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1,..."
2,150.242.169.100,62098,182.254.59.150,443,TCP,24,"[1620715845.65485, 1620715845.654968, 16207158...","[1500, 1500, 1216, 1268, 41, 40, 120, 600, 438...","[-1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -..."
3,150.242.169.99,61645,184.25.254.123,443,TCP,49,"[1618907087.044121, 1618907087.044251, 1618907...","[1500, 1500, 1252, 1500, 52, 52, 52, 1349, 52,...","[-1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1..."
4,192.168.50.213,47564,59.111.181.35,443,TCP,64,"[1610526084.314479, 1610526084.314679, 1610526...","[1500, 1500, 781, 52, 52, 52, 132, 1500, 1500,...","[-1, -1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1,..."


In [None]:
# Decode the JSON-encoded sequence columns of the test set into Python lists.
for seq_col in ('length_sequence', 'direction_sequence', 'time_sequence'):
    test_df[seq_col] = test_df[seq_col].apply(json.loads)

In [None]:
# Run the same preprocessing on the test data and convert the resulting
# column to a plain list for the model.

test_df['input'] = preprocess(test_df)
X_test = test_df['input'].tolist()

In [None]:
# Predict labels for the test set with the trained ensemble.

pred_test = model.predict(X_test)

In [None]:
# Save the results
## TODO: change to your own team number before saving

TEAM = 3

###########################################################
# Do NOT modify the code below.                           #
# If the model was trained with torch or tensorflow,      #
# change only the model-saving call.                      #
###########################################################
# NOTE(review): neither call below creates './res', so it presumably has to
# exist before this cell runs - confirm.
submit_df = pd.DataFrame({'label': pred_test})
submit_df.to_csv(f'./res/{TEAM}_submission.csv', index=False)

## TODO: if the model was trained with torch or tensorflow, change only the model-saving call
joblib.dump(model, f'./res/{TEAM}_model.pkl')
###########################################################
# Do NOT modify the code above.                           #
# If the model was trained with torch or tensorflow,      #
# change only the model-saving call.                      #
###########################################################

['./res/3_model.pkl']