In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np



In [71]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Load Dataset

In [None]:
train_data_df = pd.read_csv('train_data.csv')
train_data_df.head()

train_data_df.info()

train_label_df = pd.read_csv("train_label.csv")
train_label_df.head()

train_label_df = pd.read_csv("train_label.csv")
train_label_df.head()

test_data_df = pd.read_csv('test_data.csv') 
test_data_df.head()

test_label_df = pd.read_csv("sample_submission.csv")
test_label_df.head()

## 데이터 전처리


In [None]:
# 라벨 병합
train = train_data_df.merge(train_label_df, on="Id")
test = test_data_df.copy()

# 불필요 제거
drop_cols = ["Id"]
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=drop_cols, inplace=True)

#컬럼 그룹 정의
target = "Anomaly Scores"

numeric_cols = [
    "Source Port",
    "Destination Port",
    "Packet Length"
]

high_cardinality_cols = [
    "Source IP Address",
    "Destination IP Address",
    "Attack Signature"
]

categorical_cols = [
    "Protocol",
    "Packet Type",
    "Traffic Type",
    "Malware Indicators",
    "Alerts/Warnings",
    "Action Taken",
    "Severity Level",
    "User Information",
    "Device Information",
    "Network Segment",
    "Geo-location Data",
    "Log Source"
]

# 결측치 처리
# 수치형
for col in numeric_cols:
    train[col] = train[col].fillna(train[col].median())
    test[col] = test[col].fillna(train[col].median())

# 문자열
for col in categorical_cols + high_cardinality_cols:
    train[col] = train[col].fillna("Unknown")
    test[col] = test[col].fillna("Unknown")
# 문자열 결측 → "Unknown"이라는 하나의 범주로 유지
# 수치 결측 → 중앙값/(-1) + 필요 시 is_missing 플래그 가 낫다는 gpt의 판단


#로그 변환 / Packet Length에 log1p 적용
train["Packet_Length_log"] = np.log1p(train["Packet Length"])
test["Packet_Length_log"] = np.log1p(test["Packet Length"])

train.drop(columns=["Packet Length"], inplace=True)
test.drop(columns=["Packet Length"], inplace=True)

# 빈도 기반 인코딩 IP, Signature
def frequency_encoding(train, test, col):
    freq = train[col].value_counts()
    train[col + "_freq"] = train[col].map(freq)
    test[col + "_freq"] = test[col].map(freq)
    test[col + "_freq"] = test[col + "_freq"].fillna(0)
    return train, test

for col in high_cardinality_cols:
    train, test = frequency_encoding(train, test, col)


train.drop(columns=high_cardinality_cols, inplace=True)
test.drop(columns=high_cardinality_cols, inplace=True)


# Severity, Actioin 규칙 기반 수치화(SOC 도메인 반영)
severity_map = {
    "Low": 0,
    "Medium": 1,
    "High": 2,
    "Critical": 3
}

#Severity_num == Severity Level 수치화
train["Severity_num"] = train["Severity Level"].map(severity_map).fillna(0)
test["Severity_num"] = test["Severity Level"].map(severity_map).fillna(0)

train["Action_Block"] = (train["Action Taken"] == "Block").astype(int)
test["Action_Block"] = (test["Action Taken"] == "Block").astype(int)


# 타깃 인코딩
def target_encoding(train, test, col, target):
    means = train.groupby(col)[target].mean()
    train[col + "_te"] = train[col].map(means)
    test[col + "_te"] = test[col].map(means)
    global_mean = train[target].mean()
    test[col + "_te"] = test[col + "_te"].fillna(global_mean)
    return train, test

te_cols = [
    "Protocol",
    "Packet Type",
    "Traffic Type",
    "Alerts/Warnings",
    "Malware Indicators",
    "Network Segment",
    "Log Source"
]

for col in te_cols:
    train, test = target_encoding(train, test, col, target)

train.drop(columns=te_cols, inplace=True)
test.drop(columns=te_cols, inplace=True)


# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

le_cols = train.select_dtypes(include="object").columns

for col in le_cols:
    le = LabelEncoder()
    all_data = pd.concat([train[col], test[col]])
    le.fit(all_data)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

test



In [None]:
# 데이터 분리
from sklearn.model_selection import train_test_split

X = train.drop(columns=[target])
y = train[target]

X_test = test.copy()

random_state = 42

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state = random_state
)



# Modeling

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lightgbm import LGBMRegressor


In [None]:

numeric_features = X.columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features)
    ],
    remainder="drop"
)

model = LGBMRegressor(
    n_estimators=2000,          # 크게 잡고
    learning_rate=0.03,         # 더 천천히
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

cat_model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    l2_leaf_reg=5,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    early_stopping_rounds=200,
    verbose=False
)

cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val)
)



pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])


pipeline.fit(
    X_train, y_train,
    model__eval_set=[(X_val, y_val)],
    model__eval_metric="rmse",
    model__callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
)


from sklearn.metrics import mean_squared_error
import numpy as np

# Validation RMSE
val_pred_cat = cat_model.predict(X_val)
rmse_cat = np.sqrt(mean_squared_error(y_val, val_pred_cat))
print(f"CatBoost Validation RMSE: {rmse_cat:.5f}")

test_pred_cat = cat_model.predict(X_test)

submission = pd.read_csv("sample_submission.csv")
submission["Anomaly Scores"] = test_pred_cat
submission.to_csv("test_submission_catboost.csv", index=False)





CatBoost Validation RMSE: 28.88267
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1821
[LightGBM] [Info] Number of data points in the train set: 25600, number of used features: 18
[LightGBM] [Info] Start training from score 50.096059
LGBM Validation RMSE: 28.90625

Optimal Blending Weight (CatBoost): 0.75
Optimal Blending Weight (LGBM): 0.25
Blended Validation RMSE: 28.87879


In [77]:
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

In [78]:
target_scaler = StandardScaler()
y_train = y_train.values
y_train = y_train.reshape(-1, 1)
y_val = y_val.values
y_val = y_val.reshape(-1, 1)

y_train_scaled = target_scaler.fit_transform(y_train)
y_val_scaled = target_scaler.transform(y_val)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

train_pred = pipeline.predict(X_train)
val_pred = pipeline.predict(X_val)

print(f"Train RMSE: {np.sqrt(mean_squared_error(y_train, train_pred)):.5f}")
print(f"Validation RMSE: {np.sqrt(mean_squared_error(y_val, val_pred)):.5f}")


Train RMSE: 28.83369
Validation RMSE: 28.90753




# Test Submission

In [None]:
# ❌ categorical_features, numerical_features, OneHotEncoder 전부 제거
# ❌ 새로운 preprocessor 만들지 않음
# ❌ lr, target_scaler 사용하지 않음

# ===============================
# 수정된 test submission 코드
# ===============================

# 이미 전처리 완료된 test 데이터 사용
X_test = test.copy()   # ← 중요: test_data_df ❌, test ⭕


# 이미 학습된 pipeline으로 예측
test_pred = pipeline.predict(X_test)


# sample_submission 기준으로 제출 파일 생성
submission = pd.read_csv("sample_submission.csv")
submission["Anomaly Scores"] = test_pred


# CSV 저장
submission.to_csv("test_submission.csv", index=False)


최종 제출 파일 (test_submission_blended_optimized.csv) 생성이 완료되었습니다.
