# Tourist Data Processing
---
## 1. 데이터 다운로드
https://datalab.visitkorea.or.kr/datalab/portal/tst/getEntcnyFrgnCustForm.do
> 방한 외래관광객(국적별) > 목적별/국적별 > 월 > 2010. 01 ~ 2020. 05

---
## 2. 데이터 전처리

In [1]:
import pandas as pd 
# Load the kto_201901 workbook: header=1 uses the second sheet row as the
# column names, usecols='A:G' keeps columns A through G, and skipfooter=4
# drops the last four rows of the sheet.
kto_201901 = pd.read_excel('./files/kto_201901.xlsx', 
                          header=1,
                          usecols='A:G',
                          skipfooter=4)
# Preview the first five rows.
kto_201901.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계
0,아시아주,765082,10837,1423,14087,125521,916950
1,일본,198805,2233,127,785,4576,206526
2,대만,86393,74,22,180,1285,87954
3,홍콩,34653,59,2,90,1092,35896
4,마카오,2506,2,0,17,45,2570


In [2]:
# Add the '기준년월' (reference year-month) column; every row gets '2019-01'.
kto_201901['기준년월']='2019-01'
kto_201901.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월
0,아시아주,765082,10837,1423,14087,125521,916950,2019-01
1,일본,198805,2233,127,785,4576,206526,2019-01
2,대만,86393,74,22,180,1285,87954,2019-01
3,홍콩,34653,59,2,90,1092,35896,2019-01
4,마카오,2506,2,0,17,45,2570,2019-01


In [3]:
# Goal: drop the continent rows and keep only the nationality rows.
# First list the distinct '국적' values to see which labels are present.
kto_201901['국적'].unique()

array(['아시아주', '일본', '대만', '홍콩', '마카오', '태국', '말레이시아', '필리핀', '인도네시아',
       '싱가포르', '미얀마', '베트남', '인도', '스리랑카', '파키스탄', '방글라데시', '캄보디아', '몽골',
       '중국', '이란', '이스라엘', '터키', '우즈베키스탄', '카자흐스탄', 'GCC', '아시아 기타', '미주',
       '미국', '캐나다', '멕시코', '브라질', '미주 기타', '구주', '영국', '독일', '프랑스',
       '네덜란드', '스웨덴', '스위스', '이탈리아', '덴마크', '노르웨이', '벨기에', '오스트리아', '스페인',
       '그리스', '포르투갈', '핀란드', '아일랜드', '우크라이나', '러시아', '크로아티아', '루마니아',
       '불가리아', '폴란드', '구주 기타', '대양주', '오스트레일리아', '뉴질랜드', '대양주 기타',
       '아프리카주', '남아프리카공화국', '아프리카 기타', '기타대륙', '국적미상', '교포소계', '교포'],
      dtype=object)

In [4]:
# Labels in '국적' that mark continent-level rows rather than nationalities.
continents = [
    '아시아주', '미주', '구주', '대양주',
    '아프리카주', '기타대륙', '교포소계',
]
# Boolean mask: True for every row whose '국적' is not one of those labels.
condition = ~kto_201901['국적'].isin(continents)
kto_201901_country = kto_201901[condition]
kto_201901_country.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월
1,일본,198805,2233,127,785,4576,206526,2019-01
2,대만,86393,74,22,180,1285,87954,2019-01
3,홍콩,34653,59,2,90,1092,35896,2019-01
4,마카오,2506,2,0,17,45,2570,2019-01
5,태국,34004,37,199,96,6998,41334,2019-01


In [5]:
# Reset the index; drop=True discards the old index instead of keeping it
# as a column.
kto_201901_country_newindex=kto_201901_country.reset_index(drop=True)
kto_201901_country_newindex.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월
0,일본,198805,2233,127,785,4576,206526,2019-01
1,대만,86393,74,22,180,1285,87954,2019-01
2,홍콩,34653,59,2,90,1092,35896,2019-01
3,마카오,2506,2,0,17,45,2570,2019-01
4,태국,34004,37,199,96,6998,41334,2019-01


In [6]:
# Add the '대륙' (continent) column.
# Labels are assigned purely by row position, so each repeat count below has
# to match the number of consecutive rows that belong to that continent.
continents = (
    ['아시아'] * 25
    + ['아메리카'] * 5
    + ['유럽'] * 23
    + ['오세아니아'] * 3
    + ['아프리카'] * 2
    + ['기타대륙']
    + ['교포']
)
kto_201901_country_newindex['대륙'] = continents
kto_201901_country_newindex.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월,대륙
0,일본,198805,2233,127,785,4576,206526,2019-01,아시아
1,대만,86393,74,22,180,1285,87954,2019-01,아시아
2,홍콩,34653,59,2,90,1092,35896,2019-01,아시아
3,마카오,2506,2,0,17,45,2570,2019-01,아시아
4,태국,34004,37,199,96,6998,41334,2019-01,아시아


In [7]:
# Create the tourist-share column: '관광' as a percentage of '계',
# rounded to one decimal place.
kto_201901_country_newindex['관광객비율(%)'] = (
    kto_201901_country_newindex['관광']
    .div(kto_201901_country_newindex['계'])
    .mul(100)
    .round(1)
)
kto_201901_country_newindex.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월,대륙,관광객비율(%)
0,일본,198805,2233,127,785,4576,206526,2019-01,아시아,96.3
1,대만,86393,74,22,180,1285,87954,2019-01,아시아,98.2
2,홍콩,34653,59,2,90,1092,35896,2019-01,아시아,96.5
3,마카오,2506,2,0,17,45,2570,2019-01,아시아,97.5
4,태국,34004,37,199,96,6998,41334,2019-01,아시아,82.3


In [8]:
# Sort by '관광객비율(%)' in descending order.
kto_201901_country_newindex.sort_values(by='관광객비율(%)', ascending=False)

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월,대륙,관광객비율(%)
1,대만,86393,74,22,180,1285,87954,2019-01,아시아,98.2
3,마카오,2506,2,0,17,45,2570,2019-01,아시아,97.5
2,홍콩,34653,59,2,90,1092,35896,2019-01,아시아,96.5
0,일본,198805,2233,127,785,4576,206526,2019-01,아시아,96.3
55,대양주 기타,555,3,4,0,52,614,2019-01,오세아니아,90.4
19,이스라엘,727,12,0,9,57,805,2019-01,아시아,90.3
53,오스트레일리아,12795,93,17,34,1346,14285,2019-01,오세아니아,89.6
23,GCC,1550,37,14,72,103,1776,2019-01,아시아,87.3
5,말레이시아,19043,95,7,99,2821,22065,2019-01,아시아,86.3
34,스웨덴,844,29,6,18,85,982,2019-01,유럽,85.9


In [9]:
# Build a pivot table: the mean of '관광객비율(%)' for each '대륙' value.
kto_201901_country_newindex.pivot_table(values = '관광객비율(%)',
                                       index = '대륙',
                                       aggfunc = 'mean')

Unnamed: 0_level_0,관광객비율(%)
대륙,Unnamed: 1_level_1
교포,0.0
기타대륙,61.1
아메리카,68.2
아시아,59.624
아프리카,32.7
오세아니아,84.833333
유럽,63.826087


In [10]:
# Filter the rows whose nationality is '중국' (China).
condition = (kto_201901_country_newindex['국적'] == '중국')
kto_201901_country_newindex[condition]

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월,대륙,관광객비율(%)
17,중국,320113,2993,138,8793,60777,392814,2019-01,아시아,81.5


In [11]:
# Total number of tourists ('관광') visiting Korea in January, summed over
# all nationality rows.
# Series.sum() is vectorized; the builtin sum() walks the Series one element
# at a time in Python.
tourist_sum = kto_201901_country_newindex['관광'].sum()
tourist_sum

884293

In [12]:
# Create the overall-share column: each nationality's '관광' as a percentage
# of tourist_sum, rounded to one decimal place.
kto_201901_country_newindex['전체비율(%)'] = (
    kto_201901_country_newindex['관광'].div(tourist_sum).mul(100).round(1)
)
kto_201901_country_newindex.head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월,대륙,관광객비율(%),전체비율(%)
0,일본,198805,2233,127,785,4576,206526,2019-01,아시아,96.3,22.5
1,대만,86393,74,22,180,1285,87954,2019-01,아시아,98.2,9.8
2,홍콩,34653,59,2,90,1092,35896,2019-01,아시아,96.5,3.9
3,마카오,2506,2,0,17,45,2570,2019-01,아시아,97.5,0.3
4,태국,34004,37,199,96,6998,41334,2019-01,아시아,82.3,3.8


In [13]:
# Sort by '전체비율(%)' in descending order and show the first five rows.
kto_201901_country_newindex.sort_values('전체비율(%)', ascending=False).head()

Unnamed: 0,국적,관광,상용,공용,유학/연수,기타,계,기준년월,대륙,관광객비율(%),전체비율(%)
17,중국,320113,2993,138,8793,60777,392814,2019-01,아시아,81.5,36.2
0,일본,198805,2233,127,785,4576,206526,2019-01,아시아,96.3,22.5
1,대만,86393,74,22,180,1285,87954,2019-01,아시아,98.2,9.8
25,미국,42989,418,2578,229,16523,62737,2019-01,아메리카,68.5,4.9
2,홍콩,34653,59,2,90,1092,35896,2019-01,아시아,96.5,3.9


---
## 3. 전처리 과정을 함수로 만들기

In [14]:
def create_kto_data(yy, mm):
    """Load one monthly workbook and return its per-nationality rows.

    Reads ``./files/kto_{yy}{mm}.xlsx``, tags every row with the reference
    year-month, removes the continent subtotal rows, and adds a continent
    label plus two percentage columns.

    Args:
        yy: Year, as an int or str (e.g. ``2019``).
        mm: Month, as an int or str. It is zero-padded to two digits, so
            both ``1`` and ``'01'`` refer to the same file.

    Returns:
        A DataFrame with a fresh 0-based index holding the columns read from
        the sheet plus '기준년월', '대륙', '관광객비율(%)' and '전체비율(%)'.

    Raises:
        FileNotFoundError: If the workbook for that month does not exist.
        ValueError: If a nationality row appears before any continent
            subtotal row, so no continent can be assigned to it.
    """
    # Subtotal row label in the sheet -> label stored in the '대륙' column.
    subtotal_to_continent = {
        '아시아주': '아시아',
        '미주': '아메리카',
        '구주': '유럽',
        '대양주': '오세아니아',
        '아프리카주': '아프리카',
        '기타대륙': '기타대륙',
        '교포소계': '교포',
    }

    mm = str(mm).zfill(2)
    d_pth = f'./files/kto_{yy}{mm}.xlsx'                              # set file path
    df = pd.read_excel(d_pth, header=1, skipfooter=4, usecols='A:G')  # load file
    df['기준년월'] = f'{yy}-{mm}'                                      # add '기준년월' column

    # Each subtotal row precedes the nationality rows of its continent, so
    # forward-filling the mapped subtotal label assigns a continent to every
    # following row without relying on hard-coded row counts.
    is_subtotal = df['국적'].isin(list(subtotal_to_continent))
    df['대륙'] = df['국적'].where(is_subtotal).map(subtotal_to_continent).ffill()

    # Remove the continent subtotal rows and renumber the remaining rows.
    df_country = df[~is_subtotal].reset_index(drop=True)
    if df_country['대륙'].isna().any():
        raise ValueError(
            f'{d_pth}: nationality rows found before any continent subtotal row'
        )

    # Share of tourists within each nationality's own total, in percent.
    df_country['관광객비율(%)'] = (
        df_country['관광'] / df_country['계'] * 100
    ).round(1)

    # Share of each nationality in the month's total tourist count, in percent.
    tourist_sum = df_country['관광'].sum()
    df_country['전체비율(%)'] = (df_country['관광'] / tourist_sum * 100).round(1)

    return df_country

---
## 4. 전체 데이터 파일을 불러와서 하나로 합치기

In [15]:
# Load every monthly workbook from 2010-01 through 2020-12 and stack the
# results into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. With
# a bare `except: pass` that AttributeError would be swallowed and df would
# silently stay empty.
frames = []
for yy in range(2010, 2021):
    for mm in range(1, 13):
        try:
            tmp = create_kto_data(yy, str(mm).zfill(2))
        except FileNotFoundError:
            # No workbook for this month: skip it. Any other error is a real
            # problem and is allowed to propagate.
            continue
        frames.append(tmp)
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [16]:
# Print a summary of the combined frame: index range, columns, non-null
# counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   국적        7500 non-null   object 
 1   관광        7500 non-null   int64  
 2   상용        7500 non-null   int64  
 3   공용        7500 non-null   int64  
 4   유학/연수     7500 non-null   int64  
 5   기타        7500 non-null   int64  
 6   계         7500 non-null   int64  
 7   기준년월      7500 non-null   object 
 8   대륙        7500 non-null   object 
 9   관광객비율(%)  7500 non-null   float64
 10  전체비율(%)   7500 non-null   float64
dtypes: float64(2), int64(6), object(3)
memory usage: 644.7+ KB


In [17]:
# Write the combined frame to a single workbook; index=False leaves the row
# index out of the file.
df.to_excel('./files/kto_total.xlsx',index=False)

---
## 5. 국가별 데이터를 구분하여 저장하기

In [18]:
import os

# Save one workbook per nationality under ./files/country/.
# Create the output directory first so to_excel does not fail when it is
# missing; exist_ok=True makes re-running the cell harmless.
os.makedirs('./files/country', exist_ok=True)
cnts = df['국적'].unique()
for cnt in cnts:
    condition = (df['국적'] == cnt)   # rows belonging to this nationality
    df_cnt = df[condition]
    d_pth = f'./files/country/{cnt}.xlsx'
    df_cnt.to_excel(d_pth, index=False)