In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from prophet import Prophet
import matplotlib.pyplot as plt

# ✅ 한글 깨짐 방지
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# 1️⃣ CSV 로드
df = pd.read_csv('../../jin/data/processed/외국인입국자_전처리완료_딥러닝용.csv', encoding='utf-8')
df = df.fillna(0)

# 2️⃣ 사용자 입력 검증 함수
def get_valid_input(column_name):
    unique_vals = df[column_name].unique()
    while True:
        user_input = input(f"{column_name} 입력: ").strip().lower().replace(' ', '')
        match = None
        for val in unique_vals:
            val_clean = val.strip().lower().replace(' ', '')
            if user_input == val_clean:
                match = val
                break
        if match:
            return match
        else:
            print(f"❌ 일치하는 {column_name} 없음. 다시 입력하세요.")

# 3️⃣ 국가 + 목적 입력
country = get_valid_input('국적')
purpose = get_valid_input('목적')

# 4️⃣ 조건 필터링
df = df[(df['국적'] == country) & (df['목적'] == purpose)]

# 5️⃣ Feature 엔지니어링: Lag, 누적, 분기, 계절
df['lag_1'] = df['입국자수'].shift(1)
df['lag_3'] = df['입국자수'].shift(3)
df['lag_12'] = df['입국자수'].shift(12)
df['입국자수_cumsum'] = df['입국자수'].cumsum()
df = df.dropna()

# 6️⃣ Feature 선택
features = ['연도', '월', '분기', '계절', 'lag_1', 'lag_3', 'lag_12', '입국자수_cumsum']
X = df[features]
y = df['입국자수']

# 7️⃣ 정규화
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 8️⃣ 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 9️⃣ XGBoost + 하이퍼파라미터 튜닝
params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}
grid = GridSearchCV(XGBRegressor(), params, cv=2)
grid.fit(X_train, y_train)
xgb_model = grid.best_estimator_

# 🔟 예측 + 평가
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print(f"✅ XGBoost MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# 1️⃣1️⃣ 중요도 그래프
plt.figure(figsize=(10, 6))
plt.barh(features, xgb_model.feature_importances_)
plt.title('XGBoost Feature 중요도')
plt.show()

# 1️⃣2️⃣ Prophet용 Long형 + 휴일
prophet_df = df[['연월', '입국자수']].copy()
prophet_df.rename(columns={'연월': 'ds', '입국자수': 'y'}, inplace=True)
prophet_df['ds'] = pd.to_datetime(prophet_df['ds'])

m = Prophet()
m.add_country_holidays(country_name='KR')  # 한국 휴일

m.fit(prophet_df)

future = m.make_future_dataframe(periods=12, freq='M')
forecast = m.predict(future)

fig = m.plot(forecast)
plt.title('Prophet 예측 (한국 휴일 포함)')
plt.show()

print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(12))


Importing plotly failed. Interactive plots will not work.


국적 입력: 미국
목적 입력: 
❌ 일치하는 목적 없음. 다시 입력하세요.
목적 입력: 관광


ValueError: could not convert string to float: '겨울'

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from prophet import Prophet
import matplotlib.pyplot as plt

# ✅ 한글 폰트 깨짐 방지
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# 1️⃣ CSV 로드
df = pd.read_csv('../../jin/data/processed/외국인입국자_전처리완료_딥러닝용.csv', encoding='utf-8')
df = df.fillna(0)

# 2️⃣ 사용자 입력 검증 함수
def get_valid_input(column_name, allow_blank=False):
    unique_vals = df[column_name].unique()
    while True:
        user_input = input(f"{column_name} 입력 (없으면 Enter): ").strip().lower().replace(' ', '')
        if allow_blank and user_input == "":
            return None
        match = None
        for val in unique_vals:
            val_clean = val.strip().lower().replace(' ', '')
            if user_input == val_clean:
                match = val
                break
        if match:
            return match
        else:
            print(f"❌ 일치하는 {column_name} 없음. 다시 입력하세요.")

# 3️⃣ 국가 입력 (필수)
country = get_valid_input('국적')

# 4️⃣ 목적 입력 (선택)
purpose = get_valid_input('목적', allow_blank=True)

# 5️⃣ 조건 필터링 (Wide 기준)
df = df[df['국적'] == country]
if purpose:
    df = df[df['목적'] == purpose]

if df.empty:
    raise ValueError("⚠️ 조건에 맞는 데이터가 없습니다. 프로그램을 종료합니다.")

# ✅ Wide → Long (Prophet & ML 동일)
date_cols = [col for col in df.columns if '년' in col and '월' in col]

long_df = df.melt(
    id_vars=['국적', '목적', '연도', '월', '분기', '계절', '코로나기간'],
    value_vars=date_cols,
    var_name='연월',
    value_name='입국자수'
)

# ✅ 연월 처리
long_df['연월'] = long_df['연월'].str.replace('년', '-').str.replace('월', '')
long_df['ds'] = pd.to_datetime(long_df['연월'] + '-01')
long_df['입국자수'] = long_df['입국자수'].astype(str).str.replace(',', '').astype(float)

# ✅ Feature Engineering
long_df['lag_1'] = long_df['입국자수'].shift(1)
long_df['lag_3'] = long_df['입국자수'].shift(3)
long_df['lag_12'] = long_df['입국자수'].shift(12)
long_df['입국자수_cumsum'] = long_df['입국자수'].cumsum()

long_df = long_df.dropna()

# ✅ Feature 선택
features = ['연도', '월', '분기', '계절', '코로나기간', 'lag_1', 'lag_3', 'lag_12', '입국자수_cumsum']
X = long_df[features]
y = long_df['입국자수']

# ✅ 정규화
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# ✅ 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ✅ XGBoost + GridSearchCV
params = {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]}
grid = GridSearchCV(XGBRegressor(), params, cv=2)
grid.fit(X_train, y_train)
xgb_model = grid.best_estimator_

# ✅ 예측 + 평가
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"✅ XGBoost MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# ✅ 중요도 그래프
plt.figure(figsize=(10, 6))
plt.barh(features, xgb_model.feature_importances_)
plt.title('XGBoost Feature 중요도')
plt.show()

# ✅ Prophet 모델
prophet_df = long_df[['ds', '입국자수']].copy()
prophet_df.rename(columns={'입국자수': 'y'}, inplace=True)

m = Prophet()
m.add_country_holidays(country_name='KR')
m.fit(prophet_df)

future = m.make_future_dataframe(periods=12, freq='M')
forecast = m.predict(future)

# ✅ Prophet 시각화
fig = m.plot(forecast)
plt.title('Prophet 예측 (한국 휴일 포함)')
plt.show()

# ✅ Prophet 결과
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(12))


국적 입력 (없으면 Enter): 영국
목적 입력 (없으면 Enter): 


ValueError: could not convert string to float: '겨울'

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from prophet import Prophet
import matplotlib.pyplot as plt

# ✅ 한글 폰트 깨짐 방지
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# 1️⃣ CSV 로드
df = pd.read_csv('외국인입국자_전처리완료_딥러닝용.csv', encoding='cp949')

# 2️⃣ 연월 컬럼 안전 필터링
if '연월' in df.columns:
    df = df[df['연월'].astype(str).str.contains('년|월|\\-')]

# 3️⃣ 사용자 입력 검증 함수
def get_valid_input(column_name, allow_blank=False):
    unique_vals = df[column_name].unique()
    while True:
        user_input = input(f"{column_name} 입력 (없으면 Enter): ").strip().lower().replace(' ', '')
        if allow_blank and user_input == "":
            return None
        match = None
        for val in unique_vals:
            val_clean = val.strip().lower().replace(' ', '')
            if user_input == val_clean:
                match = val
                break
        if match:
            return match
        else:
            print(f"❌ 일치하는 {column_name} 없음. 다시 입력하세요.")

# 4️⃣ 국가 입력 (필수)
country = get_valid_input('국적')

# 5️⃣ 목적 입력 (선택)
purpose = get_valid_input('목적', allow_blank=True)

# 6️⃣ 조건 필터링
df = df[df['국적'] == country]
if purpose:
    df = df[df['목적'] == purpose]

if df.empty:
    raise ValueError("⚠️ 조건에 맞는 데이터가 없습니다. 프로그램 종료.")

# 7️⃣ Feature Engineering
df['lag_1'] = df['입국자수'].shift(1)
df['lag_3'] = df['입국자수'].shift(3)
df['lag_12'] = df['입국자수'].shift(12)
df['입국자수_cumsum'] = df['입국자수'].cumsum()
df = df.dropna()

# 8️⃣ Feature 선택
features = ['연도', '월', '분기', '계절', '코로나기간', 'lag_1', 'lag_3', 'lag_12', '입국자수_cumsum']
X = df[features]
y = df['입국자수']

# 9️⃣ 정규화
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 🔟 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# 1️⃣1️⃣ XGBoost + GridSearchCV
params = {'n_estimators': [100], 'learning_rate': [0.05], 'max_depth': [3]}
grid = GridSearchCV(XGBRegressor(), params, cv=2)
grid.fit(X_train, y_train)
xgb_model = grid.best_estimator_

# 1️⃣2️⃣ 예측 + 평가
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"✅ XGBoost MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# 1️⃣3️⃣ Feature 중요도
plt.figure(figsize=(10, 6))
plt.barh(features, xgb_model.feature_importances_)
plt.title('XGBoost Feature 중요도')
plt.show()

# 1️⃣4️⃣ Prophet Long 변환 + 안전한 연월 변환
df['연월'] = df['연월'].astype(str).str.replace('년', '-').str.replace('월', '')
df['ds'] = pd.to_datetime(df['연월'] + '-01', errors='coerce')
df = df[df['ds'].notna()]  # NaT 제거

prophet_df = df[['ds', '입국자수']].rename(columns={'입국자수': 'y'})

m = Prophet()
m.add_country_holidays(country_name='KR')
m.fit(prophet_df)

future = m.make_future_dataframe(periods=12, freq='M')
forecast = m.predict(future)

fig = m.plot(forecast)
plt.title('Prophet 예측 (한국 휴일 포함)')
plt.show()

print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(12))
