# BDA Contest - High Performance Model
- 전처리 + 피처 엔지니어링 + 앙상블 모델 + F1 최적화
- Raw Data에서 직접 불러옴

In [None]:
# 패키지 설치
!pip install xgboost lightgbm catboost -q

import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Single seed: applied to NumPy's global RNG here and referenced as SEED by later cells.
SEED = 42
np.random.seed(SEED)
print('Setup Complete!')

In [None]:
# Load the raw train/test CSVs (directly from GitHub)
RAW_TRAIN_URL = 'https://raw.githubusercontent.com/choicompany/bdadacon/refs/heads/main/rawdata/train.csv'
RAW_TEST_URL = 'https://raw.githubusercontent.com/choicompany/bdadacon/refs/heads/main/rawdata/test.csv'

train_raw, test_raw = (pd.read_csv(url) for url in (RAW_TRAIN_URL, RAW_TEST_URL))

# Report the frame shapes and how many training rows carry each target value.
completed = train_raw['completed']
print(f'Train: {train_raw.shape}, Test: {test_raw.shape}')
print(f"Target: 0={(completed == 0).sum()}, 1={(completed == 1).sum()}")

In [None]:
# 피처 엔지니어링 함수
def count_items(text):
    """Count the comma-separated items in a free-text answer.

    Missing values and blank strings count as zero items. Segments that are
    empty after stripping whitespace (produced by a trailing, leading or
    doubled comma) are not counted, so ``"a,b,"`` yields 2 rather than 3.

    Args:
        text: The raw cell value; anything that is not missing is converted
            with ``str()`` before splitting.

    Returns:
        The number of non-blank comma-separated segments.
    """
    if pd.isna(text):
        return 0
    return sum(1 for part in str(text).split(',') if part.strip())

def text_length(text):
    """Return the character count of ``str(text)``, or 0 when the value is missing."""
    return 0 if pd.isna(text) else len(str(text))

def clean_text(text):
    """Drop every character that is not an ASCII letter, a digit or a Hangul syllable.

    A missing value is mapped to the literal ``'Unknown'``; any other value is
    converted with ``str()`` first.
    """
    if pd.isna(text):
        return 'Unknown'
    raw = str(text)
    return re.sub(r'[^a-zA-Z0-9가-힣]', '', raw)

# Set the target and the test IDs aside before the feature frames are merged
test_ids = test_raw['ID'].copy()
y = train_raw['completed'].copy()

# Stack train and test so both go through the same preprocessing;
# the is_train flag (1 = train row, 0 = test row) is used later to split them again.
train_x = train_raw.drop(columns=['completed', 'ID']).assign(is_train=1)
test_x = test_raw.drop(columns=['ID']).assign(is_train=0)
combined = pd.concat([train_x, test_x], ignore_index=True)

print('Data merged for preprocessing')

In [None]:
# Build derived features
print('Creating derived features...')

# Number of missing cells per row (intended as a diligence signal)
combined['null_count'] = combined.isna().sum(axis=1)

# Character length of each free-text answer (columns absent from the data are skipped)
free_text_cols = ['whyBDA', 'what_to_gain', 'hope_for_group', 'incumbents_lecture_scale_reason']
for name in (c for c in free_text_cols if c in combined.columns):
    combined[f'{name}_len'] = combined[name].apply(text_length)

# Number of comma-separated items in each multi-answer column
multi_answer_cols = ['certificate_acquisition', 'desired_certificate', 'desired_job', 'onedayclass_topic']
for name in (c for c in multi_answer_cols if c in combined.columns):
    combined[f'{name}_count'] = combined[name].apply(count_items)

# Major / occupation flags: 1 when the stringified value is 'true' (case-insensitive),
# and 1 when the job equals '대학생'
major_as_text = combined['major_data'].astype(str).str.lower()
combined['major_data'] = major_as_text.eq('true').astype(int)
combined['is_student'] = combined['job'].eq('대학생').astype(int)

# Drop the column that is not used as a feature (ignored if it does not exist)
combined = combined.drop(columns=['generation'], errors='ignore')

print('Derived features created!')

In [None]:
# Missing-value handling & categorical encoding
print('Encoding...')
cat_cols = list(combined.select_dtypes(include=['object']).columns)
num_cols = [
    c for c in combined.select_dtypes(include=['number', 'bool']).columns
    if c != 'is_train'
]

# Numeric gaps are filled with the sentinel -1
for name in num_cols:
    combined[name] = combined[name].fillna(-1)

# Categorical gaps become 'Unknown'; every value is then normalised by clean_text
for name in cat_cols:
    combined[name] = combined[name].fillna('Unknown').apply(clean_text)

# More than 15 distinct values -> label encoding
# 15 or fewer distinct values  -> one-hot encoding
distinct = {name: combined[name].nunique() for name in cat_cols}
high_card = [name for name in cat_cols if distinct[name] > 15]
low_card = [name for name in cat_cols if distinct[name] <= 15]

for name in high_card:
    combined[name] = LabelEncoder().fit_transform(combined[name])

if low_card:
    combined = pd.get_dummies(combined, columns=low_card, drop_first=False, dtype=int)

print(f'Encoding done! High card: {len(high_card)}, Low card: {len(low_card)}')

In [None]:
# Split the combined frame back into train / test rows (NumPy conversion follows below)
is_train_row = combined['is_train'] == 1
is_test_row = combined['is_train'] == 0
X = combined[is_train_row].drop(columns=['is_train']).reset_index(drop=True)
X_test = combined[is_test_row].drop(columns=['is_train']).reset_index(drop=True)

# 컬럼명 정리 (XGBoost 호환)
def clean_cols(cols):
    """Return sanitised, guaranteed-unique column names.

    Each name is converted with ``str()`` and every ``[``, ``]``, ``<``, ``>``
    or whitespace character is replaced by ``_``. A name that repeats gets a
    numeric suffix (``name_1``, ``name_2``, ...). A suffixed candidate that is
    itself already taken is skipped, so the output never contains duplicates
    (e.g. ``['a', 'a', 'a_1']`` -> ``['a', 'a_1', 'a_1_1']``).

    Args:
        cols: Iterable of column labels.

    Returns:
        A list of strings, in input order, one per input label.
    """
    used = set()      # every name emitted so far
    counters = {}     # last suffix tried for each sanitised base name
    result = []
    for raw in cols:
        base = re.sub(r'[\[\]<>\s]', '_', str(raw))
        name = base
        # Keep bumping the suffix until the candidate is genuinely unused.
        while name in used:
            counters[base] = counters.get(base, 0) + 1
            name = f'{base}_{counters[base]}'
        used.add(name)
        result.append(name)
    return result

# Sanitise the column names of both frames
X.columns, X_test.columns = clean_cols(X.columns), clean_cols(X_test.columns)

# Convert to NumPy: float32 feature matrices and integer labels
X_np = X.to_numpy().astype(np.float32)
X_test_np = X_test.to_numpy().astype(np.float32)
y_np = y.to_numpy().astype(int)

print(f'Train: {X_np.shape}, Test: {X_test_np.shape}')

In [None]:
# Model training (OOF ensemble)
# NOTE(review): N_FOLDS is defined here, but the get_oof calls visible in this file
# do not pass it and rely on get_oof's own n_folds default instead.
N_FOLDS = 5
# Ratio of label-0 rows to label-1 rows; handed to the boosters below as the
# positive-class weight.
scale = sum(y_np == 0) / sum(y_np == 1)
print(f'Class imbalance ratio: {scale:.2f}')

def get_oof(model_class, params, X, y, X_test, n_folds=5):
    """Produce out-of-fold and fold-averaged test probabilities for one model.

    A fresh ``model_class(**params)`` is fitted on each stratified, shuffled
    fold (seeded with the module-level ``SEED``).

    Args:
        model_class: Callable returning an estimator with ``fit`` and
            ``predict_proba``.
        params: Keyword arguments forwarded to ``model_class``.
        X: Training features, indexed with integer index arrays.
        y: Training labels, indexed the same way as ``X``.
        X_test: Test features passed to ``predict_proba`` in every fold.
        n_folds: Number of stratified folds.

    Returns:
        ``(oof, test_preds)``: the column-1 probability of each training row
        predicted by the model that did not train on it, and the mean
        column-1 probability of each test row over all fold models.
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    oof = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for train_rows, valid_rows in splitter.split(X, y):
        clf = model_class(**params)
        clf.fit(X[train_rows], y[train_rows])
        # Only the held-out rows of this fold are filled in.
        oof[valid_rows] = clf.predict_proba(X[valid_rows])[:, 1]
        # Each fold contributes 1/n_folds of the final test prediction.
        fold_test = clf.predict_proba(X_test)[:, 1]
        test_preds += fold_test / n_folds
    return oof, test_preds

In [None]:
# XGBoost
print('Training XGBoost...')
# NOTE: 'use_label_encoder' is intentionally not passed. XGBoost deprecated the
# option and later removed it, after which it is only reported as an unused
# parameter. The labels in y_np are already cast to int.
xgb_oof, xgb_test = get_oof(xgb.XGBClassifier, {
    'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05,
    'subsample': 0.8, 'colsample_bytree': 0.8, 'scale_pos_weight': scale,
    'random_state': SEED, 'eval_metric': 'logloss', 'n_jobs': -1
}, X_np, y_np, X_test_np)
# OOF F1 at the default 0.5 cut-off
print(f"  XGBoost F1: {f1_score(y_np, (xgb_oof >= 0.5).astype(int)):.4f}")

In [None]:
# LightGBM
print('Training LightGBM...')
# 'subsample_freq' must be positive for LightGBM to apply 'subsample' at all;
# with its default of 0 the 0.8 row-subsampling ratio is silently ignored.
lgb_oof, lgb_test = get_oof(lgb.LGBMClassifier, {
    'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05,
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.8,
    'scale_pos_weight': scale,
    'random_state': SEED, 'verbose': -1, 'n_jobs': -1
}, X_np, y_np, X_test_np)
# OOF F1 at the default 0.5 cut-off
print(f"  LightGBM F1: {f1_score(y_np, (lgb_oof >= 0.5).astype(int)):.4f}")

In [None]:
# CatBoost
print('Training CatBoost...')
cat_params = {
    'iterations': 500,
    'depth': 5,
    'learning_rate': 0.05,
    'random_seed': SEED,
    'verbose': 0,
    # Class 1 is weighted by the class ratio computed above.
    'class_weights': {0: 1, 1: scale},
}
cat_oof, cat_test = get_oof(CatBoostClassifier, cat_params, X_np, y_np, X_test_np)

# OOF F1 at the default 0.5 cut-off
cat_f1 = f1_score(y_np, (cat_oof >= 0.5).astype(int))
print(f"  CatBoost F1: {cat_f1:.4f}")

In [None]:
# Ensemble (stacking): one column per base model, in XGBoost / LightGBM / CatBoost order
base_outputs = (
    (xgb_oof, xgb_test),
    (lgb_oof, lgb_test),
    (cat_oof, cat_test),
)
oof_stack = np.column_stack([oof_col for oof_col, _unused in base_outputs])
test_stack = np.column_stack([test_col for _unused, test_col in base_outputs])

# Meta model: logistic regression fitted on the out-of-fold columns
meta = LogisticRegression(random_state=SEED, max_iter=1000)
meta.fit(oof_stack, y_np)

# NOTE(review): final_oof is predicted on the same rows the meta model was fitted on.
final_oof, final_test_probs = (
    meta.predict_proba(stack)[:, 1] for stack in (oof_stack, test_stack)
)

print(f'Meta weights: {meta.coef_[0]}')

In [None]:
# Threshold optimisation (by F1 score on the stacked OOF probabilities)
candidate_ths = np.arange(0.2, 0.8, 0.01)
f1_by_th = [f1_score(y_np, (final_oof >= th).astype(int)) for th in candidate_ths]

# Fall back to (0, 0.5) unless some candidate scores strictly above zero;
# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_f1, best_th = 0, 0.5
top = int(np.argmax(f1_by_th))
if f1_by_th[top] > best_f1:
    best_f1, best_th = f1_by_th[top], candidate_ths[top]

print(f'Best Threshold: {best_th:.2f}')
print(f'Best OOF F1: {best_f1:.4f}')

# Final prediction: binarise the stacked test probabilities at the chosen threshold
final_preds = (final_test_probs >= best_th).astype(int)
n_zero, n_one = np.sum(final_preds == 0), np.sum(final_preds == 1)
print(f'Predicted 0: {n_zero}, 1: {n_one}')

In [None]:
# Build and save the submission file (test IDs next to their predicted labels)
submission = test_ids.to_frame(name='ID').assign(completed=final_preds)
submission.to_csv('submission.csv', index=False)
print('Saved: submission.csv')
print(submission.head(10))

In [None]:
# Colab download: trigger a browser download of the submission file.
# google.colab is only importable inside Google Colab, so outside of it the
# download step is skipped instead of ending the run with an ImportError.
try:
    from google.colab import files
except ImportError:
    print('google.colab not available; skipping browser download of submission.csv')
else:
    files.download('submission.csv')