## LightGBM 모델 학습

### encoding

In [None]:
categorical_cols = ['제조사', '모델', '차량상태', '구동방식', '사고이력']

# Integer-encode each categorical column. The encoder is re-fit on the
# training values of every column and then reused, as fitted, on the
# matching test column so both frames share one label -> integer mapping.
encoder = LabelEncoder()
for col in categorical_cols:
    train_labels = train_df[col].astype(str)
    encoder.fit(train_labels)
    train_df[col] = encoder.transform(train_labels)
    test_df[col] = encoder.transform(test_df[col].astype(str))



In [None]:
# Random Forest Imputation
def fill_missing_with_random_forest(df, target_col, features):
    """Fill missing values of ``target_col`` with random-forest predictions.

    A ``RandomForestRegressor`` (100 trees, ``random_state=42``) is fit on the
    rows where ``target_col`` is present, using ``features`` as predictors, and
    its predictions overwrite the missing entries. ``df`` is modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``target_col`` and every column in ``features``.
    target_col : str
        Name of the column whose missing values are imputed.
    features : list of str
        Predictor column names passed to the regressor.

    Returns
    -------
    pandas.Series
        ``df[target_col]`` after imputation.

    Raises
    ------
    ValueError
        If every value of ``target_col`` is missing, so there is nothing to
        train the regressor on.
    """
    missing_mask = df[target_col].isnull()

    # Nothing to impute: return early instead of asking the regressor to
    # predict on an empty frame, which scikit-learn rejects.
    if not missing_mask.any():
        return df[target_col]

    if missing_mask.all():
        raise ValueError(
            f"Cannot impute '{target_col}': no rows with a known value to train on."
        )

    df_notnull = df[~missing_mask]
    df_null = df[missing_mask]

    rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
    rf_model.fit(df_notnull[features], df_notnull[target_col])

    # Reuse the mask computed up front so the rows written match the rows predicted.
    df.loc[missing_mask, target_col] = rf_model.predict(df_null[features])

    return df[target_col]

# Predictor columns used to impute the battery-capacity column
features_for_impute = [ '주행거리(km)', '보증기간(년)', '차량상태']

# Impute the training frame first, then the test frame. Each frame is
# imputed by a forest fit on that frame's own non-missing rows.
for frame in (train_df, test_df):
    frame['배터리용량'] = fill_missing_with_random_forest(
        frame, '배터리용량', features_for_impute
    )

# Show the training frame with missing values filled in, plus remaining null counts
print("Updated DataFrame with Imputed Battery Capacity:")
print(train_df.head())
print(train_df.isnull().sum())

Updated DataFrame with Imputed Battery Capacity:
           ID  제조사  모델  차량상태     배터리용량  구동방식  주행거리(km)  보증기간(년)  사고이력  연식(년)  \
0  TRAIN_0000    4  16     1  86.07700     0     13642        0     0      2   
1  TRAIN_0001    3  10     1  56.00000     1     10199        6     0      0   
2  TRAIN_0002    0  17     0  91.20000     0      2361        7     0      0   
3  TRAIN_0003    0  12     1  52.85864     0     21683        3     0      0   
4  TRAIN_0004    1  19     2  61.01800     0    178205        1     0      0   

   가격(백만원)  
0   159.66  
1    28.01  
2    66.27  
3    99.16  
4    62.02  
ID          0
제조사         0
모델          0
차량상태        0
배터리용량       0
구동방식        0
주행거리(km)    0
보증기간(년)     0
사고이력        0
연식(년)       0
가격(백만원)     0
dtype: int64


In [None]:
# Feature engineering
# Treat a mileage of 0 as missing so the ratio below never divides by zero.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on ``train_df[col]``: that chained form emits
# a FutureWarning and stops updating the frame in pandas 3.0.
train_df['주행거리(km)'] = train_df['주행거리(km)'].replace(0, np.nan)

# NOTE(review): this feature is computed from the price column, which is the
# prediction target, and is a constant 0 for every test row. The model
# therefore sees target information during training that it never gets at
# prediction time; consider removing it from the model inputs.
train_df['가격_주행거리ratio'] = train_df['가격(백만원)'] / train_df['주행거리(km)']
test_df['가격_주행거리ratio'] = 0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['주행거리(km)'].replace(0, np.nan, inplace=True)


### scaling

In [None]:
target = '가격(백만원)'
numerical_cols = ['배터리용량', '주행거리(km)', '보증기간(년)', '연식(년)', '가격_주행거리ratio']

# log1p-transform the mileage column in the training frame, then the test frame
for frame in (train_df, test_df):
    frame['주행거리(km)'] = np.log1p(frame['주행거리(km)'])

# Feature scaler: fit on the training rows only, then applied unchanged to the test rows
model_inputs = categorical_cols + numerical_cols
scaler = MinMaxScaler()
train_features = scaler.fit_transform(train_df[model_inputs])
test_features = scaler.transform(test_df[model_inputs])

# The target gets its own scaler so predictions can be mapped back to prices later
target_scaler = MinMaxScaler()
train_target = target_scaler.fit_transform(train_df[[target]])

# Hold out 20% of the training rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_target,
    test_size=0.2,
    random_state=42,
)


### LightGBM 모델 학습



In [None]:

#  Optuna
def objective(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    arrays. Training runs for at most 1000 boosting rounds and stops early
    once the validation RMSE has not improved for 50 rounds; unpromising
    trials are additionally pruned by Optuna through ``LightGBMPruningCallback``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to report intermediate RMSE.

    Returns
    -------
    float
        RMSE on the validation split, in the units of ``y_val`` (the
        min-max-scaled target).
    """
    # Suggest hyperparameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,  # keep per-trial LightGBM info logs out of the cell output
        'num_leaves': trial.suggest_int('num_leaves', 31, 63),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 1.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 0.9),
        'bagging_freq': 1,
    }
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train.ravel())
    val_data = lgb.Dataset(X_val, label=y_val.ravel(), reference=train_data)

    # Early stopping keeps a trial from running all 1000 rounds after the
    # validation RMSE has stopped improving, and makes ``predict`` below use
    # the best iteration rather than the last one.
    gbm = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1000,
        callbacks=[
            LightGBMPruningCallback(trial, "rmse"),
            lgb.early_stopping(stopping_rounds=50, verbose=False),
        ],
    )
    y_pred = gbm.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, y_pred))

# Search for the hyperparameters that minimise validation RMSE:
# 50 trials, run in parallel on all available cores (n_jobs=-1).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, n_jobs=-1)

# Report the winning configuration and its score
for label, value in (
    ("Best Parameters:", study.best_params),
    ("Best RMSE:", study.best_value),
):
    print(label, value)



[I 2024-12-08 06:05:05,021] A new study created in memory with name: no-name-fc0dcc14-4f02-4e82-8c4a-f0546a1d2035
[I 2024-12-08 06:05:14,000] Trial 6 finished with value: 0.013513947965745836 and parameters: {'num_leaves': 35, 'learning_rate': 0.04363167118120697, 'max_depth': 13, 'lambda_l1': 0.9369898490277732, 'lambda_l2': 0.09734407133269829, 'feature_fraction': 0.7870252746170974, 'bagging_fraction': 0.7137732548826172}. Best is trial 6 with value: 0.013513947965745836.
[I 2024-12-08 06:05:14,857] Trial 5 finished with value: 0.012462483832349698 and parameters: {'num_leaves': 60, 'learning_rate': 0.049473545888901384, 'max_depth': 12, 'lambda_l1': 0.7102129255546076, 'lambda_l2': 0.9378193254577419, 'feature_fraction': 0.7833425631678086, 'bagging_fraction': 0.7263518913582748}. Best is trial 5 with value: 0.012462483832349698.
[I 2024-12-08 06:05:15,629] Trial 2 finished with value: 0.012268211782178287 and parameters: {'num_leaves': 37, 'learning_rate': 0.03759296783332592, 'ma

Best Parameters: {'num_leaves': 40, 'learning_rate': 0.0446119695459899, 'max_depth': 15, 'lambda_l1': 0.015346158133627885, 'lambda_l2': 0.10503107890171282, 'feature_fraction': 0.8852103231452658, 'bagging_fraction': 0.745886611097825}
Best RMSE: 0.007274149725104183


In [None]:

# Train final model with best parameters
# ``study.best_params`` only contains the values Optuna sampled, so the fixed
# settings used during tuning are restored here. Without ``bagging_freq`` the
# tuned ``bagging_fraction`` would have no effect.
best_params = study.best_params
final_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'bagging_freq': 1,
    'verbosity': -1,
    **best_params,
}
train_data = lgb.Dataset(X_train, label=y_train.ravel())
val_data = lgb.Dataset(X_val, label=y_val.ravel(), reference=train_data)

num_boost_round = 1000
patience = 50


def _print_progress(env):
    """LightGBM callback: print the validation RMSE every 10 boosting rounds."""
    iteration = env.iteration + 1  # env.iteration is zero-based
    if iteration % 10 == 0:
        # Each evaluation entry is (dataset name, metric name, value, higher-is-better)
        current_rmse = env.evaluation_result_list[0][2]
        print(f"==============Iteration {iteration}, RMSE: {current_rmse}==============")


# Train once and let LightGBM stop after ``patience`` rounds without
# improvement, instead of retraining a fresh model from scratch for every
# candidate number of rounds.
final_model = lgb.train(
    final_params,
    train_data,
    valid_sets=[val_data],
    num_boost_round=num_boost_round,
    callbacks=[
        _print_progress,
        lgb.early_stopping(stopping_rounds=patience, verbose=False),
    ],
)

# Fall back to the full round count if no best iteration was recorded
best_iteration = final_model.best_iteration or num_boost_round
y_pred = final_model.predict(X_val, num_iteration=best_iteration)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
best_score = rmse
if best_iteration < num_boost_round:
    print(f"Early stopping at iteration {best_iteration}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 814
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 10
[LightGBM] [Info] Start training from score 0.349934
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 814
[LightGBM] [Info] Number of data points in the train set: 5997, number of used features: 10
[LightGBM] [Info] Start training from score 0.349934
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8

In [None]:
# Predict on test data
# Limit prediction to the trees up to ``best_iteration`` (the best validation
# round found during training) so rounds added after the validation RMSE
# stopped improving are not used.
y_pred_scaled = final_model.predict(test_features, num_iteration=best_iteration)
# Map the scaled predictions back to the original price units
y_pred_actual = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Save predictions to submission file
submission = pd.DataFrame({'ID': test_df['ID'], '가격(백만원)': y_pred_actual.flatten()})
submission.to_csv("/content/drive/MyDrive/prog_team/submission_v9.csv", index=False)

print("Submission file saved.")

Submission file saved.
