In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import re
from urllib.parse import urlparse

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
# 학습/평가 데이터 로드
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [4]:
# '[.]'을 '.'으로 복구
train_df['URL'] = train_df['URL'].str.replace(r'\[\.\]', '.', regex=True)
test_df['URL'] = test_df['URL'].str.replace(r'\[\.\]', '.', regex=True)

In [5]:
def extract_additional_features(df):
    
    # URL 내 숫자 개수 계산
    def count_digits(URL):
        return sum(c.isdigit() for c in URL)
    
    # URL 내 대문자 개수 계산
    def count_uppercase(URL):
        return sum(c.isupper() for c in URL)
    
    # URL이 IP 주소를 포함하는지 확인
    def contains_ip(URL):
        return int(bool(re.search(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', URL)))
    
    # URL에 의심스러운 키워드가 포함되어 있는지 확인
    def contains_suspicious_keywords(URL):
        suspicious_keywords = ["login", "bank", "secure", "update", "verify", "account", "password"]
        return int(any(keyword in URL.lower() for keyword in suspicious_keywords))
    
    # URL 디렉토리 개수 계산
    def count_path_depth(URL):
        parsed_url = urlparse(URL)
        return parsed_url.path.count('/')
    
    df['digit_count'] = df['URL'].apply(count_digits)
    df['uppercase_count'] = df['URL'].apply(count_uppercase)
    df['ip_address_flag'] = df['URL'].apply(contains_ip)
    df['suspicious_keyword_flag'] = df['URL'].apply(contains_suspicious_keywords)
    df['path_depth'] = df['URL'].apply(count_path_depth)
    
    return df


In [6]:
# 기존 특징 생성 (URL 길이, 서브도메인 개수, 특수 문자 개수)
train_df['length'] = train_df['URL'].str.len()
test_df['length'] = test_df['URL'].str.len()

train_df['subdomain_count'] = train_df['URL'].str.split('.').apply(lambda x: len(x) - 2)
test_df['subdomain_count'] = test_df['URL'].str.split('.').apply(lambda x: len(x) - 2)

train_df['special_char_count'] = train_df['URL'].apply(lambda x: sum(1 for c in x if c in '-_/'))
test_df['special_char_count'] = test_df['URL'].apply(lambda x: sum(1 for c in x if c in '-_/'))

# 추가 특징 추출
train_df = extract_additional_features(train_df)
test_df = extract_additional_features(test_df)

In [7]:
features = ['length', 'subdomain_count', 'special_char_count', 'digit_count', 'uppercase_count', 'ip_address_flag', 'suspicious_keyword_flag', 'path_depth']
X = train_df[features]
y = train_df['label']
X_test = test_df[features]

In [9]:
kf = KFold(n_splits=4, shuffle=True, random_state=42)
models = []  
auc_scores = []

for idx, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    print('-'*40)
    print(f'Fold {idx + 1} 번째 XGBoost 모델을 학습합니다.')
    print('Epoch|         Train AUC             |         Validation AUC')
    
    # XGBoost 모델 학습
    model = XGBClassifier(
        n_estimators=100, 
        max_depth=6,       
        learning_rate=0.1, 
        random_state=42,
        use_label_encoder=False, 
        tree_method='gpu_hist' ,
        eval_metric="auc",        
    )
    
    # 학습 및 Validation 성능 모니터링
    eval_set = [(X_train, y_train), (X_val, y_val)]
    model.fit(
        X_train, y_train,
        eval_set=eval_set,
        verbose=True
    )
    
    models.append(model)  # 모델 저장
    
    # 검증 데이터 예측 및 ROC-AUC 계산
    y_val_pred_prob = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_val_pred_prob)
    print(f"Fold {idx + 1} CV ROC-AUC: {auc:.4f}")
    print('-'*40)
    auc_scores.append(auc)

print(f"K-Fold 평균 ROC-AUC: {np.mean(auc_scores):.4f}")

----------------------------------------
Fold 1 번째 XGBoost 모델을 학습합니다.
Epoch|         Train AUC             |         Validation AUC
[0]	validation_0-auc:0.88143	validation_1-auc:0.88113
[1]	validation_0-auc:0.88696	validation_1-auc:0.88660
[2]	validation_0-auc:0.89022	validation_1-auc:0.88981
[3]	validation_0-auc:0.89100	validation_1-auc:0.89053
[4]	validation_0-auc:0.89211	validation_1-auc:0.89168
[5]	validation_0-auc:0.89227	validation_1-auc:0.89184
[6]	validation_0-auc:0.89347	validation_1-auc:0.89300
[7]	validation_0-auc:0.89430	validation_1-auc:0.89382
[8]	validation_0-auc:0.89451	validation_1-auc:0.89403
[9]	validation_0-auc:0.89481	validation_1-auc:0.89431
[10]	validation_0-auc:0.89568	validation_1-auc:0.89518
[11]	validation_0-auc:0.89569	validation_1-auc:0.89519
[12]	validation_0-auc:0.89611	validation_1-auc:0.89561
[13]	validation_0-auc:0.89615	validation_1-auc:0.89564
[14]	validation_0-auc:0.89639	validation_1-auc:0.89588
[15]	validation_0-auc:0.89669	validation_1-auc:0.8961

In [10]:
test_probabilities = np.zeros(len(X_test))  

for model in models:
    test_probabilities += model.predict_proba(X_test)[:, 1] 


test_probabilities /= len(models)

print('Inference Done.')

Inference Done.


In [11]:
# 결과 저장
test_df['probability'] = test_probabilities
test_df[['ID', 'probability']].to_csv('./submission.csv', index=False)
print('Done.')

Done.
