# In [None]:

# ✅ 2탄: Feature Engineering → 학습 → 저장 → 예측 샘플

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# 1️⃣ Load the dataset produced by the upstream pipeline
df = pd.read_csv('foreign_visitors_pipeline_ready.csv')

# 2️⃣ Separate the feature matrix (X) from the target (y).
#    The column order below is the order the model is trained on, so keep it
#    stable when editing this list.
feature_columns = [
    'country_code',
    'purpose_code',
    'year',
    'month',
    'is_peak',
    'is_holiday',
    'lag_1',
    'lag_3',
    'lag_6',
    'rolling_mean_3',
    'rolling_mean_6',
    'rolling_mean_12',
    'quarter',
]
X = df.loc[:, feature_columns]
y = df.loc[:, 'visitors_num']

# 3️⃣ Train / test split by calendar year: rows with year < 2024 are used for
#    training and rows with year >= 2024 are held out for evaluation.
#    (A row whose year is missing satisfies neither comparison and therefore
#    lands in neither split.)
train = df.loc[df['year'].lt(2024)]
test = df.loc[df['year'].ge(2024)]

# Reuse the feature columns chosen for X so both splits share the same layout.
X_train, y_train = train[X.columns], train['visitors_num']
X_test, y_test = test[X.columns], test['visitors_num']

# 4️⃣ Fit a random forest regressor (100 trees, fixed seed) on the training split
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# 5️⃣ Predict on the held-out split and report MAE / RMSE
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of the mean squared error

print(f'MAE: {mae:.2f}, RMSE: {rmse:.2f}')

# 6️⃣ Persist the fitted model to disk
model_path = 'foreign_visitors_model_rf.joblib'
joblib.dump(model, model_path)

# 7️⃣ Sample prediction: run the model on the first row of the test features.
#    head(1) keeps the result a one-row DataFrame, which is the 2-D input
#    shape that predict() is given elsewhere in this script.
sample_input = X_test.head(1)
sample_pred = model.predict(sample_input)
print(f"샘플 입력값 예측: {round(sample_pred[0])}명")

# Bare expression on the last line so a notebook renders the sample row.
sample_input
