In [3]:
import pandas as pd

In [57]:
df=pd.read_csv("data/travel_ko.csv",index_col=0)

In [58]:
# '검색일자' 컬럼을 datetime 형식으로 변환
df['검색일자'] = pd.to_datetime(df['검색일자'], format='%Y-%m')

# 연도, 월 컬럼 생성
df['연도'] = df['검색일자'].dt.year
df['월'] = df['검색일자'].dt.month

In [59]:
from sklearn.preprocessing import LabelEncoder

In [60]:
le_country=LabelEncoder()
le_continent=LabelEncoder()
df['국가_encoded']=le_country.fit_transform(df['국가'])
df['대륙_encoded']=le_continent.fit_transform(df['대륙'])

In [61]:
df[df['국가']=='대만']

Unnamed: 0,검색일자,국가,대륙,총검색량,연도,월,국가_encoded,대륙_encoded
0,2019-07-01,대만,아시아,223094,2019,7,3,1
14,2019-08-01,대만,아시아,219715,2019,8,3,1
28,2019-09-01,대만,아시아,208974,2019,9,3,1
42,2019-10-01,대만,아시아,247156,2019,10,3,1
55,2019-11-01,대만,아시아,293831,2019,11,3,1
...,...,...,...,...,...,...,...,...
908,2024-10-01,대만,아시아,486287,2024,10,3,1
916,2024-11-01,대만,아시아,299853,2024,11,3,1
924,2024-12-01,대만,아시아,330340,2024,12,3,1
936,2025-01-01,대만,아시아,203313,2025,1,3,1


In [62]:
X=df[['국가_encoded','대륙_encoded','연도','월']]
y=df['총검색량']

In [63]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [66]:
X_train[X_train['국가_encoded']==3]

Unnamed: 0,국가_encoded,대륙_encoded,연도,월
465,3,1,2021,8
292,3,1,2020,12
916,3,1,2024,11
118,3,1,2020,3
55,3,1,2019,11
...,...,...,...,...
504,3,1,2021,10
871,3,1,2024,6
99,3,1,2020,2
614,3,1,2022,6


In [67]:
from xgboost import XGBRegressor
xgb_model=XGBRegressor(n_estimators=80,learning_rate=0.05,random_state=42)
xgb_model.fit(X_train,y_train)

In [68]:
y_pred=xgb_model.predict(X_test)

In [69]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9304010272026062

In [71]:
# 2025년, 2026년 데이터를 생성하여 예측
future_years = [2025, 2026]
future_months = list(range(1, 13))  # 1월~12월

# 기존 데이터에서 국적 리스트 추출
countries = df['국가'].unique()
continents=df['대륙'].unique()

# 예측을 위한 데이터 생성
future_data = []
for year in future_years:
    for month in future_months:
        for country in countries:
            continent = df[df['국가'] == country]['대륙'].iloc[0]  # 국가에 해당하는 대륙 찾기
            country_encoded = le_country.transform([country])[0]
            continent_encoded = le_continent.transform([continent])[0]
            future_data.append([year, month, country_encoded, continent_encoded])

# 데이터프레임 변환
future_df = pd.DataFrame(future_data, columns=['연도', '월'] + list([countries])+list([continents]))
future_df = pd.DataFrame(future_data, columns=['연도', '월', '국가_encoded', '대륙_encoded'])
future_df = future_df.astype(float)
expected_features = xgb_model.get_booster().feature_names
future_df = future_df[expected_features]

# XGBoost 모델을 사용하여 미래 입국자 수 예측
future_predictions_xgb = xgb_model.predict(future_df)

# 결과 데이터프레임 생성
future_df['검색량'] = future_predictions_xgb

In [72]:
future_df

Unnamed: 0,국가_encoded,대륙_encoded,연도,월,검색량
0,3.0,1.0,2025.0,1.0,201001.218750
1,4.0,1.0,2025.0,1.0,-3854.477051
2,5.0,1.0,2025.0,1.0,-3854.477051
3,9.0,1.0,2025.0,1.0,29576.939453
4,10.0,3.0,2025.0,1.0,3781.870605
...,...,...,...,...,...
1171,14.0,0.0,2026.0,12.0,98201.257812
1172,35.0,1.0,2026.0,12.0,75339.632812
1173,34.0,1.0,2026.0,12.0,75339.632812
1174,24.0,1.0,2026.0,12.0,15732.022461


In [None]:
#각 국적지역-국적 컬럼을 원래처럼 변환
future_df['국가'] = future_df[countries].idxmax(axis=1)

# 년, 월, 국적지역, 예측 입국자수 컬럼만 선택
future_df = future_df[['년', '월', '국적지역', '입국자수']]
#국적지역_을 제거
future_df['국적지역'] = future_df['국적지역'].str.replace('국적지역_', '')
#국적지역별로 정렬
future_df = future_df.sort_values(by=['국적지역', '년', '월'])
#예측입국자수 정수로 변환
future_df['입국자수'] = future_df['입국자수'].astype(int).reset_index(drop=True)

future_df

KeyError: "None of [Index(['대만', '라오스', '러시아', '몽골', '미국', '베트남', '스위스', '싱가포르', '인도네시아', '일본',\n       '조지아', '중국', '태국', '필리핀', '스페인', '터키', '피지', '이탈리아', '포르투갈', '뉴질랜드',\n       '몰타', '아랍에미리트', '아르헨티나', '이스라엘', '팔라우', '모로코', '벨기에', '이집트', '캐나다',\n       '그리스', '인도', '탄자니아', '프랑스', '체코', '오스트리아', '영국여행', '호주', '볼리비아',\n       '말레이시아', '콜롬비아', '영국', '케냐', '쿠바', '에스토니아', '브라질', '캄보디아', '카타르', '요르단',\n       '네팔'],\n      dtype='object')] are in the [columns]"