In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:90% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:2px;}
div.CodeMirror {font-family:Consolas; font-size:10pt;}
div.text_cell_render.rendered_html{font-size:10pt;}
div.output {font-size:10pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:10pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:11pt;padding:4px;}
table.dataframe{font-size:10px;}
</style>
"""))

In [2]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 1️⃣ CSV 파일 읽기
df = pd.read_csv('목적별_국적별_입국자수_.csv')

# 2️⃣ 'visitors' → 숫자형으로 변환
df['visitors_num'] = df['visitors'].astype(str).str.replace(',', '').astype(float)

# 3️⃣ Lag Feature
df = df.sort_values(['country_code', 'purpose_code', 'year', 'month'])
df['lag_1'] = df.groupby(['country_code', 'purpose_code'])['visitors_num'].shift(1)
df['lag_3'] = df.groupby(['country_code', 'purpose_code'])['visitors_num'].shift(3)
df['lag_6'] = df.groupby(['country_code', 'purpose_code'])['visitors_num'].shift(6)

# 4️⃣ Rolling Mean
df['rolling_mean_3'] = df.groupby(['country_code', 'purpose_code'])['visitors_num'].shift(1).rolling(3).mean()
df['rolling_mean_6'] = df.groupby(['country_code', 'purpose_code'])['visitors_num'].shift(1).rolling(6).mean()
df['rolling_mean_12'] = df.groupby(['country_code', 'purpose_code'])['visitors_num'].shift(1).rolling(12).mean()

# 5️⃣ 분기 변수
df['quarter'] = ((df['month'] - 1) // 3) + 1

# 6️⃣ 명절 Dummy
df['is_holiday'] = df['month'].apply(lambda x: 1 if x in [1, 2, 9, 10] else 0)

# 7️⃣ 로그변환
df['visitors_log'] = np.log1p(df['visitors_num'])

# 8️⃣ 결측치 채우기
df['lag_1'] = df['lag_1'].fillna(df['visitors_num'])
df['lag_3'] = df['lag_3'].fillna(df['visitors_num'])
df['lag_6'] = df['lag_6'].fillna(df['visitors_num'])
df['rolling_mean_3'] = df['rolling_mean_3'].fillna(df['visitors_num'])
df['rolling_mean_6'] = df['rolling_mean_6'].fillna(df['visitors_num'])
df['rolling_mean_12'] = df['rolling_mean_12'].fillna(df['visitors_num'])

# ✅ 저장
df.to_csv('foreign_visitors_pipeline_ready.csv', index=False)
df


Unnamed: 0,country_kr,purpose_kr,ym,year,month,visitors,country_en,purpose_en,country_code,purpose_code,...,lag_1,rolling_mean_3,visitors_num,lag_3,lag_6,rolling_mean_6,rolling_mean_12,quarter,is_holiday,visitors_log
11550,아르헨티나,상용,2019-01,2019,1,2,Argentina,Business,0,1,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,1,1.098612
11551,아르헨티나,상용,2019-02,2019,2,2,Argentina,Business,0,1,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,1,1.098612
11552,아르헨티나,상용,2019-03,2019,3,3,Argentina,Business,0,1,...,2.0,3.0,3.0,3.0,3.0,3.0,3.0,1,0,1.386294
11553,아르헨티나,상용,2019-04,2019,4,8,Argentina,Business,0,1,...,3.0,2.333333,8.0,2.0,8.0,8.0,8.0,2,0,2.197225
11554,아르헨티나,상용,2019-05,2019,5,4,Argentina,Business,0,1,...,8.0,4.333333,4.0,2.0,4.0,4.0,4.0,2,0,1.609438
11555,아르헨티나,상용,2019-06,2019,6,2,Argentina,Business,0,1,...,4.0,5.0,2.0,3.0,2.0,2.0,2.0,2,0,1.098612
11556,아르헨티나,상용,2019-07,2019,7,1,Argentina,Business,0,1,...,2.0,4.666667,1.0,8.0,2.0,3.5,1.0,3,0,0.693147
11557,아르헨티나,상용,2019-08,2019,8,3,Argentina,Business,0,1,...,1.0,2.333333,3.0,4.0,2.0,3.333333,3.0,3,0,1.386294
11558,아르헨티나,상용,2019-09,2019,9,1,Argentina,Business,0,1,...,3.0,2.0,1.0,2.0,3.0,3.5,1.0,3,1,0.693147
11559,아르헨티나,상용,2019-10,2019,10,1,Argentina,Business,0,1,...,1.0,1.666667,1.0,1.0,8.0,3.166667,1.0,4,1,0.693147


TypeError: 'DataFrame' object is not callable

In [3]:

# ✅ 2탄: Feature Engineering → 학습 → 저장 → 예측 샘플

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# 1️⃣ 파이프라인 출력 데이터 로드
df = pd.read_csv('foreign_visitors_pipeline_ready.csv')

# 2️⃣ X, y 분리
X = df[['country_code', 'purpose_code', 'year', 'month',
        'is_peak', 'is_holiday',
        'lag_1', 'lag_3', 'lag_6',
        'rolling_mean_3', 'rolling_mean_6', 'rolling_mean_12',
        'quarter']]
y = df['visitors_num']

# 3️⃣ Train / Test Split
train = df[df['year'] < 2024]
test = df[df['year'] >= 2024]

X_train = train[X.columns]
y_train = train['visitors_num']
X_test = test[X.columns]
y_test = test['visitors_num']

# 4️⃣ RandomForest 모델 학습
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 5️⃣ 예측 & 평가
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print(f'MAE: {mae:.2f}, RMSE: {rmse:.2f}')

# 6️⃣ 모델 저장
joblib.dump(model, 'foreign_visitors_model_rf.joblib')

# 7️⃣ 샘플 예측
sample_input = X_test.iloc[0:1]
sample_pred = model.predict(sample_input)
print(f"샘플 입력값 예측: {round(sample_pred[0])}명")

sample_input


MAE: 1071.66, RMSE: 7452.41
샘플 입력값 예측: 2명


Unnamed: 0,country_code,purpose_code,year,month,is_peak,is_holiday,lag_1,lag_3,lag_6,rolling_mean_3,rolling_mean_6,rolling_mean_12,quarter
60,0,1,2024,1,0,1,1.0,1.0,1.0,1.333333,1.333333,1.166667,1
