In [73]:
import pandas as pd
import numpy as np

train = pd.read_csv("open/train.csv")
test = pd.read_csv("open/test.csv")

In [74]:
def convert_value(value):
    if pd.isna(value):
        return np.nan
    elif "1500-2500" in value:
        return 1
    elif "2500-3500" in value:
        return 2
    elif "3500-4500" in value:
        return 3
    elif "4500-6000" in value:
        return 4
    elif "6000이상" in value:
        return 5


# 데이터 변환
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(convert_value)
test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(convert_value)

### 결측치 처리

In [75]:
# '분야' 결측치 처리
train['분야'] = train['분야'].fillna('NULL')
test['분야'] = test['분야'].fillna('NULL')

In [76]:
train['기업가치(백억원)'] = train.groupby('분야')['기업가치(백억원)'].transform(lambda x: x.fillna(x.mean()))
test['기업가치(백억원)'] = test.groupby('분야')['기업가치(백억원)'].transform(lambda x: x.fillna(x.mean()))

In [77]:
numerical_cols = ['직원 수','고객수(백만명)']
for col in numerical_cols:
    train[col] = train[col].fillna(train[col].mean())
    test[col] = test[col].fillna(test[col].mean())

In [78]:
from sklearn.preprocessing import LabelEncoder

# 투자단계 인코딩
investment_mapping = {
    "Seed": 0,
    "Series A": 1,
    "Series B": 2,
    "Series C": 3,
    "IPO": 4
}

# 인수여부와 상장여부 인코딩
binary_mapping = {"Yes": 1, "No": 0}

train["인수여부"] = train["인수여부"].map(binary_mapping)
test["인수여부"] = test["인수여부"].map(binary_mapping)

train["상장여부"] = train["상장여부"].map(binary_mapping)
test["상장여부"] = test["상장여부"].map(binary_mapping)


train["투자단계"] = train["투자단계"].map(investment_mapping)
test["투자단계"] = test["투자단계"].map(investment_mapping)


# 범주형 인코딩
categorical_features = ['국가', '분야']
for feature in categorical_features:
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    test[feature] = le.transform(test[feature].astype(str))

## Feature Engineering

In [79]:
# 최대 설립년도 계산
max_year = train["설립연도"].max()

# 지속기간 파생변수 생성
train["지속기간"] = max_year - train["설립연도"] + 1
test["지속기간"] = max_year - test["설립연도"] + 1

In [80]:
# 고객수와 비율 맞추기
train["팔로워"] = train["SNS 팔로워 수(백만명)"]*100
test["팔로워"] = test["SNS 팔로워 수(백만명)"]*100

In [81]:
# 0인 경우 변경
train["팔로워"] = train["팔로워"].replace(0, 100)
test["팔로워"] = test["팔로워"].replace(0, 100)

In [82]:
train["팔로워대비고객수"] = train['고객수(백만명)'] / train["팔로워"]
test["팔로워대비고객수"] = test['고객수(백만명)'] / test["팔로워"]

In [83]:
train["총투자금대비연매출"] = train['연매출(억원)'] / train["총 투자금(억원)"]
test["총투자금대비연매출"] = test['연매출(억원)'] / test["총 투자금(억원)"]

In [84]:
train["연매출대비직원수"] = train['직원 수'] / train["총 투자금(억원)"]
test["연매출대비직원수"] = test['직원 수'] / test["총 투자금(억원)"]

In [85]:
# 피쳐 중요도 낮은 칼럼 drop
train = train.drop(columns=["ID","설립연도","SNS 팔로워 수(백만명)","투자단계","인수여부"])
X_test = test.drop(columns=["ID","설립연도","SNS 팔로워 수(백만명)","투자단계","인수여부"])

In [86]:
# 데이터 분리
X_train = train.drop(columns=['성공확률'])
y_train = train['성공확률']

In [87]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler

# scaler = RobustScaler()

scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

## Modeling

In [88]:
rf_features = [
    "국가", "분야", "직원 수",
    "상장여부", "고객수(백만명)", "총 투자금(억원)",
    "연매출(억원)", "기업가치(백억원)", "지속기간",
    "팔로워대비고객수", "총투자금대비연매출", "연매출대비직원수"
]

In [89]:
xgb_features = [
    "지속기간", "직원 수", "고객수(백만명)", "총 투자금(억원)","팔로워",
    "연매출(억원)", "기업가치(백억원)",
    "팔로워대비고객수", "총투자금대비연매출", "연매출대비직원수"
]

In [90]:
# X_rf, X_xgb 생성
X_rf = train[rf_features]
X_xgb = train[xgb_features]
y = train["성공확률"]

# 테스트셋도 동일하게 처리
X_rf_test = test[rf_features]
X_xgb_test = test[xgb_features]

In [91]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler_rf = MinMaxScaler()
X_rf_scaled = scaler_rf.fit_transform(X_rf)
X_rf_test_scaled = scaler_rf.transform(X_rf_test)

scaler_xgb = MinMaxScaler()
X_xgb_scaled = scaler_xgb.fit_transform(X_xgb)
X_xgb_test_scaled = scaler_xgb.transform(X_xgb_test)

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import optuna
from sklearn.model_selection import cross_val_score
# Optuna - XGBoost (n_estimators 고정)
# 모델 출력 지움
# optuna.logging.set_verbosity()

def objective_xgb(trial):
    model = XGBRegressor(
        n_estimators=1000,  # 고정
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 30),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1
    )
    return cross_val_score(model, X_xgb_scaled, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1).mean()

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=50)

xgb_model = XGBRegressor(**study_xgb.best_params, n_estimators=1000, random_state=42, n_jobs=-1)

# Trial 31 finished with value: -0.19781690928957624 and parameters: {'learning_rate': 0.011493418991538797, 'max_depth': 26, 'min_child_weight': 1, 'subsample': 0.986837779638244, 'colsample_bytree': 0.8150052541533243, 'reg_alpha': 0.134242767179849, 'reg_lambda': 0.6501537277300832}. Best is trial 31 with value: -0.19781690928957624.

In [None]:
# Optuna - RandomForest (n_estimators 고정)
def objective_rf(trial):
    model = RandomForestRegressor(
        n_estimators=1000,  # 고정
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 4),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        random_state=42,
        n_jobs=-1
    )
    return cross_val_score(model, X_rf_scaled, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1).mean()

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=50)

rf_model = RandomForestRegressor(**study_rf.best_params, n_estimators=1000, random_state=42, n_jobs=-1)

# Trial 18 finished with value: -0.20069994722175294 and parameters: {'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 18 with value: -0.20069994722175294.

In [None]:
from lightgbm import LGBMRegressor

def objective_lgbm(trial):
    model = LGBMRegressor(
        n_estimators=1000,  # 고정
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        max_depth=trial.suggest_int('max_depth', 3, 30),
        num_leaves=trial.suggest_int('num_leaves', 20, 300),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 30),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 1.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 1.0),
        random_state=42,
        n_jobs=-1
    )
    return cross_val_score(model, X_xgb_scaled, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1).mean()

study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lgbm, n_trials=50)

lgbm_model = LGBMRegressor(**study_lgbm.best_params, n_estimators=1000, random_state=42, n_jobs=-1)

# Trial 45 finished with value: -0.2006139422257776 and parameters: {'learning_rate': 0.03983658262105122, 'max_depth': 22, 'num_leaves': 267, 'min_child_samples': 6, 'subsample': 0.5808921038696877, 'colsample_bytree': 0.5772412168053515, 'reg_alpha': 0.48219141773712604, 'reg_lambda': 0.9574894664559317}. Best is trial 45 with value: -0.2006139422257776.

In [None]:
xgb_model.fit(X_xgb_scaled, y)
rf_model.fit(X_rf_scaled, y)
lgbm_model.fit(X_xgb_scaled, y)

In [97]:
xgb_preds = xgb_model.predict(X_xgb_test_scaled)
rf_preds = rf_model.predict(X_rf_test_scaled)
lgbm_preds = lgbm_model.predict(X_xgb_test_scaled)

In [98]:
test_preds = (6 * rf_preds + 2 * xgb_preds + 2 * lgbm_preds) / 10

In [100]:
# sample_submission 파일을 불러옵니다.
submit = pd.read_csv('open/sample_submission.csv')

In [101]:
submit['성공확률'] = test_preds
submit["성공확률"] = submit["성공확률"].round(1)

In [102]:
# submit.to_csv('./final2_2.csv',index=False)

In [103]:
submit.to_csv('./f59.csv',index=False)