In [1]:
from IPython.display import display, HTML
# Notebook-wide CSS overrides: widen the page container to 90%, use Consolas
# at 10pt for code input, make cell output bold, and shrink rendered
# DataFrame tables to 10px.
_NOTEBOOK_CSS = """
<style>
div.container{width:90% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:2px;}
div.CodeMirror {font-family:Consolas; font-size:10pt;}
div.text_cell_render.rendered_html{font-size:10pt;}
div.output {font-size:10pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:10pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:11pt;padding:4px;}
table.dataframe{font-size:10px;}
</style>
"""
display(HTML(_NOTEBOOK_CSS))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from prophet import Prophet
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# ✅ Keep Korean text legible in figures: use a font that ships Hangul glyphs
#    and render the minus sign with the ASCII hyphen instead of U+2212.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# ✅ Load the preprocessed foreign-arrivals dataset (UTF-8 CSV)
df = pd.read_csv(
    '../../jin/data/processed/외국인입국자_전처리완료_딥러닝용.csv',
    encoding='utf-8',
)

# ✅ User-input helper
def get_valid_input(column_name, allow_blank=False):
    """Prompt until the user types a value that exists in ``df[column_name]``.

    Matching ignores letter case and every space character, so
    ``" united states "`` matches a stored ``"United States"``.

    Args:
        column_name: Name of the column in the module-level ``df`` whose
            distinct values are the accepted answers.
        allow_blank: When True, an empty answer is accepted and returns
            ``None``. When False, the user is re-prompted until a match is
            entered.

    Returns:
        The matching value exactly as stored in the column, or ``None`` when
        ``allow_blank`` is True and the user just pressed Enter.
    """
    def _normalize(text):
        # str() keeps non-string cell values (numbers, etc.) from crashing
        # on .strip().
        return str(text).strip().lower().replace(' ', '')

    # Build the normalised lookup once instead of on every retry. Missing
    # values and blank strings are not selectable answers; the first
    # occurrence wins when two stored values normalise to the same key.
    candidates = {}
    for val in df[column_name].unique():
        if pd.isna(val):
            continue
        key = _normalize(val)
        if key:
            candidates.setdefault(key, val)

    # Only advertise "press Enter to skip" when skipping is actually allowed.
    if allow_blank:
        prompt = f"{column_name} 입력 (없으면 Enter): "
    else:
        prompt = f"{column_name} 입력: "

    while True:
        user_input = _normalize(input(prompt))
        if allow_blank and user_input == "":
            return None
        if user_input in candidates:
            return candidates[user_input]
        print(f"❌ 일치하는 {column_name} 없음. 다시 입력.")

# ✅ Ask for the country (required) and the visit purpose (optional)
country = get_valid_input('국적')
purpose_input = get_valid_input('목적', allow_blank=True)

# ✅ Keep only the selected country.
#    Take an explicit copy: columns are assigned on this frame below, and
#    writing to a boolean-mask slice triggers SettingWithCopyWarning.
df = df[df['국적'] == country].copy()

# ✅ Purposes to loop over: the one the user chose, or every purpose present
#    for this country when the purpose was left blank.
purpose_list = [purpose_input] if purpose_input is not None else df['목적'].unique()

# ✅ Label-encode the categorical columns shared by every purpose
le_season = LabelEncoder()
le_corona = LabelEncoder()
df['계절'] = le_season.fit_transform(df['계절'])
df['코로나기간'] = le_corona.fit_transform(df['코로나기간'])

# Year-month key formatted as 'YYYY-MM' (month zero-padded to two digits)
df['연월'] = df['연도'].astype(str) + '-' + df['월'].astype(str).str.zfill(2)

# ✅ Scaled-up indicator features:
#    '성수기' (peak season) = 1.5 for Jul/Aug/Dec, else 0
#    '명절' (holiday months) = 2.0 for Jan/Feb/Sep/Oct, else 0
df['성수기'] = df['월'].isin([7, 8, 12]).astype(int) * 1.5
df['명절'] = df['월'].isin([1, 2, 9, 10]).astype(int) * 2.0

# ✅ Y-axis tick formatter
def to_unit(x, pos):
    """Format a tick value for the arrivals axis.

    Values of 10,000 or more are shown in units of ten thousand with one
    decimal and the suffix '만'; smaller values are truncated to an integer
    with the suffix '명'. ``pos`` is accepted for the
    ``ticker.FuncFormatter`` callback signature and is not used.
    """
    in_ten_thousands = x >= 10000
    return f'{x / 10000:.1f}만' if in_ten_thousands else f'{int(x)}명'

# ✅ Per-purpose loop: for each purpose, fit an XGBoost regressor on lagged
#    features and a Prophet model with extra regressors, then plot both
#    against the actual arrivals.
for purpose in purpose_list:
    temp_df = df[df['목적'] == purpose]
    if temp_df.empty:
        print(f"⚠️ 데이터 없음: {purpose}")
        continue

    # ✅ Feature engineering
    # The lag features and the unshuffled split below are only meaningful on
    # chronologically ordered rows, so sort explicitly (no-op if already sorted).
    temp_df = temp_df.sort_values(['연도', '월']).copy()
    temp_df['lag_1'] = temp_df['입국자수'].shift(1)
    temp_df['lag_3'] = temp_df['입국자수'].shift(3)
    temp_df['lag_12'] = temp_df['입국자수'].shift(12)
    # Running total of *previous* months only; including the current month
    # would leak the target into its own feature row.
    temp_df['입국자수_cumsum'] = temp_df['입국자수'].cumsum().shift(1)
    temp_df = temp_df.dropna()

    # The 12-month lag drops the first year of rows. With fewer than 5 rows
    # left, the 80/20 split plus 2-fold CV cannot be formed, so skip instead
    # of crashing the whole loop.
    if len(temp_df) < 5:
        print(f"⚠️ 데이터 부족: {purpose}")
        continue

    features = [
        '연도', '월', '분기', '계절', '코로나기간', '성수기', '명절',
        'lag_1', 'lag_3', 'lag_12', '입국자수_cumsum'
    ]
    X = temp_df[features]
    y = temp_df['입국자수']
    ym = temp_df['연월'].reset_index(drop=True)

    # Chronological hold-out: the last 20% of rows form the test set.
    X_train_raw, X_test_raw, y_train, y_test, ym_train, ym_test = train_test_split(
        X, y, ym, test_size=0.2, random_state=42, shuffle=False
    )

    # Fit the scaler on the training rows only so test-set ranges do not
    # leak into preprocessing.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Single-point grid: effectively a fixed configuration refit on X_train.
    params = {'n_estimators': [100], 'learning_rate': [0.05], 'max_depth': [3]}
    grid = GridSearchCV(XGBRegressor(), params, cv=2)
    grid.fit(X_train, y_train)
    xgb_model = grid.best_estimator_

    y_pred = xgb_model.predict(X_test)
    results = pd.DataFrame({
        '실제값': y_test.values,
        '예측값': y_pred,
        '연월': ym_test.values
    })
    results['date'] = pd.to_datetime(results['연월'])
    temp_df['date'] = pd.to_datetime(temp_df['연월'])

    # ✅ Prophet
    prophet_df = temp_df[['연월', '입국자수', '월', '코로나기간', '성수기', '명절']].copy()
    prophet_df = prophet_df.rename(columns={'연월': 'ds', '입국자수': 'y'})
    prophet_df['ds'] = pd.to_datetime(prophet_df['ds'])

    m = Prophet(yearly_seasonality=True, seasonality_mode='multiplicative')
    m.add_regressor('코로나기간')
    m.add_regressor('성수기')
    m.add_regressor('명절')
    m.fit(prophet_df)

    # History dates are month starts ('YYYY-MM' parses to the 1st), so extend
    # with month-start frequency. 'M' would emit month-end dates, putting the
    # first "future" point in the same month as the last observation.
    future = m.make_future_dataframe(periods=12, freq='MS')
    future['월'] = future['ds'].dt.month

    # Keep the observed COVID flag for historical rows so the in-sample fit is
    # evaluated with the regressor values it was trained on. Future rows get 0.
    # NOTE(review): assumes label 0 is the "not COVID period" class — confirm
    # against the LabelEncoder classes.
    covid_by_date = prophet_df.drop_duplicates('ds').set_index('ds')['코로나기간']
    future['코로나기간'] = future['ds'].map(covid_by_date).fillna(0)
    future['성수기'] = future['월'].isin([7, 8, 12]).astype(int) * 1.5
    future['명절'] = future['월'].isin([1, 2, 9, 10]).astype(int) * 2.0

    forecast = m.predict(future)
    forecast['date'] = forecast['ds']

    # ✅ Merge past and forecast dates and put one x tick on each January
    all_dates = pd.concat([temp_df['date'], forecast['date']]).drop_duplicates().sort_values()
    year_ticks = all_dates[all_dates.dt.month == 1]

    # ✅ Comparison plot: actuals vs. XGBoost hold-out predictions vs. Prophet
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(temp_df['date'], temp_df['입국자수'], label='실제값', color='black')
    ax.plot(results['date'], results['예측값'], linestyle='--', label='XGBoost 예측')
    ax.plot(forecast['date'], forecast['yhat'], linestyle='-', label='Prophet 예측')

    ax.set_xticks(year_ticks)
    ax.set_xticklabels([d.strftime('%Y-%m') for d in year_ticks], rotation=45)
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(to_unit))

    ax.set_title(f"{country} - {purpose} 입국자수 예측 (Prophet & XGBoost)")
    ax.set_xlabel('연월')
    ax.set_ylabel('입국자수')
    ax.legend()
    plt.tight_layout()
    plt.show()
    # Release the figure so looping over many purposes does not accumulate
    # open figures in memory.
    plt.close(fig)
