In [210]:
import pandas as pd

In [244]:
df=pd.read_csv("data/travel_ko.csv")

In [245]:
df

Unnamed: 0,검색일자,국가,대륙,총검색량
0,2019-07,괌,오세아니아,53503
1,2019-07,대만,아시아,223094
2,2019-07,라오스,아시아,3911
3,2019-07,러시아,아시아,162850
4,2019-07,몽골,아시아,66430
...,...,...,...,...
950,2025-02,인도네시아,아시아,31679
951,2025-02,일본,아시아,224782
952,2025-02,중국,아시아,52809
953,2025-02,태국,아시아,24522


In [246]:
df[df['국가']=='영국']

Unnamed: 0,검색일자,국가,대륙,총검색량
238,2020-09,영국,유럽,1006
344,2021-02,영국,유럽,360
388,2021-04,영국,유럽,257


In [247]:
# '검색일자' 컬럼을 datetime 형식으로 변환
df['검색일자'] = pd.to_datetime(df['검색일자'], format='%Y-%m')

# 연도, 월 컬럼 생성
df['연도'] = df['검색일자'].dt.year
df['월'] = df['검색일자'].dt.month

In [248]:
from sklearn.preprocessing import LabelEncoder

In [249]:
le_country=LabelEncoder()
le_continent=LabelEncoder()
df['국가_encoded']=le_country.fit_transform(df['국가'])
df['대륙_encoded']=le_continent.fit_transform(df['대륙'])

In [250]:
df[df['총검색량']==0]

Unnamed: 0,검색일자,국가,대륙,총검색량,연도,월,국가_encoded,대륙_encoded
934,2025-01-01,괌,오세아니아,0,2025,1,0,4
935,2025-01-01,네팔,아시아,0,2025,1,2,2
937,2025-01-01,미국,북아메리카,0,2025,1,11,1
940,2025-01-01,인도,아시아,0,2025,1,28,2
945,2025-01-01,프랑스,유럽,0,2025,1,45,5


In [251]:
X=df[['국가_encoded','대륙_encoded','연도','월']]
y=df['총검색량']

In [252]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [253]:
from xgboost import XGBRegressor
xgb_model=XGBRegressor(n_estimators=80,learning_rate=0.05,random_state=42)
xgb_model.fit(X_train,y_train)

In [254]:
y_pred=xgb_model.predict(X_test)

In [255]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8799415230751038

In [256]:
# 2025년, 2026년 데이터를 생성하여 예측
future_years = [2025, 2026]
future_months = list(range(1, 13))  # 1월~12월

# 기존 데이터에서 국적 리스트 추출
countries = df['국가'].unique()
continents=df['대륙'].unique()

# 예측을 위한 데이터 생성
future_data = []
for year in future_years:
    for month in future_months:
        for country in countries:
            continent = df[df['국가'] == country]['대륙'].iloc[0]  # 국가에 해당하는 대륙 찾기
            country_encoded = le_country.transform([country])[0]
            continent_encoded = le_continent.transform([continent])[0]
            future_data.append([year, month, country_encoded, continent_encoded])

# 데이터프레임 변환
future_df = pd.DataFrame(future_data, columns=['연도', '월'] + list([countries])+list([continents]))
future_df = pd.DataFrame(future_data, columns=['연도', '월', '국가_encoded', '대륙_encoded'])
future_df = future_df.astype(float)
expected_features = xgb_model.get_booster().feature_names
future_df = future_df[expected_features]

# XGBoost 모델을 사용하여 미래 입국자 수 예측
future_predictions_xgb = xgb_model.predict(future_df)

# 결과 데이터프레임 생성
future_df['검색량'] = future_predictions_xgb

In [257]:
future_df['국가_encoded'] = future_df['국가_encoded'].astype(int)
future_df['대륙_encoded'] = future_df['대륙_encoded'].astype(int)
future_df['검색량'] = future_df['검색량'].astype(int)

In [258]:
future_df

Unnamed: 0,국가_encoded,대륙_encoded,연도,월,검색량
0,0,4,2025.0,1.0,-6248
1,4,2,2025.0,1.0,199116
2,5,2,2025.0,1.0,-18457
3,6,2,2025.0,1.0,-30934
4,10,2,2025.0,1.0,10728
...,...,...,...,...,...
1171,15,0,2026.0,12.0,56999
1172,35,2,2026.0,12.0,59569
1173,34,2,2026.0,12.0,59569
1174,24,2,2026.0,12.0,-14201


In [259]:
#각 국가, 대륙 컬럼을 원래처럼 변환
future_df['국가'] = le_country.inverse_transform(future_df['국가_encoded'])
future_df['대륙'] = le_continent.inverse_transform(future_df['대륙_encoded'])

# 연도, 월, 국가, 대륙, 예측 입국자수 컬럼만 선택
future_df = future_df[['연도', '월', '국가', '대륙', '검색량']]

#국가_을 제거
future_df['국가'] = future_df['국가'].str.replace('국가_', '')

#대륙_을 제거
future_df['대륙'] = future_df['대륙'].str.replace('대륙_', '')

#국가, 대륙별로 정렬
future_df = future_df.sort_values(by=['연도','월','검색량'],ascending=[True,True,False])

#예측총검색량 정수로 변환
future_df['검색량'] = future_df['검색량'].astype(int).reset_index(drop=True)
future_df['연도'] = future_df['연도'].astype(int).reset_index(drop=True)
future_df['월'] = future_df['월'].astype(int).reset_index(drop=True)

future_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df['국가'] = future_df['국가'].str.replace('국가_', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_df['대륙'] = future_df['대륙'].str.replace('대륙_', '')


Unnamed: 0,연도,월,국가,대륙,검색량
9,2025,1,일본,아시아,95857
1,2025,1,대만,아시아,199116
11,2025,1,중국,아시아,93500
20,2025,1,뉴질랜드,오세아니아,55149
5,2025,1,베트남,아시아,126278
...,...,...,...,...,...
1157,2026,12,그리스,유럽,-8801
1150,2026,12,아르헨티나,남아메리카,15567
1129,2026,12,라오스,아시아,133974
1130,2026,12,러시아,아시아,116537


In [260]:
print(future_df.duplicated().sum())  # 중복된 행 확인
print(future_df.isnull().sum())  # 결측치 확인

0
연도     0
월      0
국가     0
대륙     0
검색량    0
dtype: int64


In [261]:
df[df['연도']==2025]

Unnamed: 0,검색일자,국가,대륙,총검색량,연도,월,국가_encoded,대륙_encoded
934,2025-01-01,괌,오세아니아,0,2025,1,0,4
935,2025-01-01,네팔,아시아,0,2025,1,2,2
936,2025-01-01,대만,아시아,203313,2025,1,4,2
937,2025-01-01,미국,북아메리카,0,2025,1,11,1
938,2025-01-01,베트남,아시아,138505,2025,1,12,2
939,2025-01-01,싱가포르,아시아,112100,2025,1,18,2
940,2025-01-01,인도,아시아,0,2025,1,28,2
941,2025-01-01,인도네시아,아시아,72125,2025,1,29,2
942,2025-01-01,일본,아시아,660452,2025,1,30,2
943,2025-01-01,중국,아시아,164907,2025,1,32,2


In [262]:
future_df.loc[(future_df['대륙']=='유럽')&(future_df['연도']==2025)&(future_df['월']==2)]

Unnamed: 0,연도,월,국가,대륙,검색량
55,2025,2,스위스,유럽,71870
64,2025,2,스페인,유럽,32992
76,2025,2,벨기에,유럽,2652
59,2025,2,조지아,유럽,47754
65,2025,2,터키,유럽,13934
68,2025,2,포르투갈,유럽,5754
82,2025,2,프랑스,유럽,-12490
83,2025,2,체코,유럽,-14229
67,2025,2,이탈리아,유럽,5755
84,2025,2,오스트리아,유럽,-14229


In [263]:
# 모델저장
import joblib
joblib.dump(xgb_model, 'model/xgb_model.pkl')

['model/xgb_model.pkl']

In [264]:
# 예축한 데이터 저장
future_df.to_csv('data/xgb_travel.csv', index=False)

In [265]:
future_df

Unnamed: 0,연도,월,국가,대륙,검색량
9,2025,1,일본,아시아,95857
1,2025,1,대만,아시아,199116
11,2025,1,중국,아시아,93500
20,2025,1,뉴질랜드,오세아니아,55149
5,2025,1,베트남,아시아,126278
...,...,...,...,...,...
1157,2026,12,그리스,유럽,-8801
1150,2026,12,아르헨티나,남아메리카,15567
1129,2026,12,라오스,아시아,133974
1130,2026,12,러시아,아시아,116537
