In [81]:
import pandas as pd
import numpy as np
import re
from prophet import Prophet
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px


In [82]:
files = {"China": "china_tourist.csv", "Korea": "korean_tourist.csv", "Thailand": "thailand_tourist.csv", "Philippines": "philippines_tourist.csv", "Vietnam": "vietnam_tourist.csv"}

year_month = re.compile(r'(\d+)_(\w+?)_')

df = pd.DataFrame(columns = ["time", "variable", "value", "country"])

for c in files:
    
    year = "0"

    headers = []
    data = []   

    for line in open("foreign_tourist/" + files[c], 'r'):
        fields = [x.strip().replace(",", "") for x in line.split(";")]

        if headers == []:
            headers = fields
        else:
            match_year_month = year_month.match(fields[0])

            if match_year_month:
                year = match_year_month.group(1)
                month = match_year_month.group(2)
                
                time = year + " " + month
                data.append([time] + fields[1:])
            else:
                time = year + " " + fields[0].replace("_", "").replace(".", "")
                #print (c, line)
                data.append([time] + [int(x) for x in fields[1:]])
                #print (data)
    temp_df = pd.DataFrame(data, columns = headers)
    temp_df["time"] = pd.to_datetime(temp_df["time"], format='%Y %b')

    temp_df = pd.melt(temp_df, id_vars=['time'], value_vars=[x for x in temp_df.columns if x != "time"])
    temp_df["country"] = c

    df = pd.concat([df, temp_df])

In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3600 entries, 0 to 719
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   time      3600 non-null   datetime64[ns]
 1   variable  3600 non-null   object        
 2   value     3600 non-null   object        
 3   country   3600 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 140.6+ KB


In [84]:
df.head()

Unnamed: 0,time,variable,value,country
0,2006-01-01,Total,68854,China
1,2006-02-01,Total,55842,China
2,2006-03-01,Total,70491,China
3,2006-04-01,Total,73932,China
4,2006-05-01,Total,64298,China


In [85]:
df.tail()

Unnamed: 0,time,variable,value,country
715,2020-08-01,Others,1104,Vietnam
716,2020-09-01,Others,2674,Vietnam
717,2020-10-01,Others,6056,Vietnam
718,2020-11-01,Others,14623,Vietnam
719,2020-12-01,Others,15651,Vietnam


In [86]:
temp_df

Unnamed: 0,time,variable,value,country
0,2006-01-01,Total,1643,Vietnam
1,2006-02-01,Total,2080,Vietnam
2,2006-03-01,Total,2246,Vietnam
3,2006-04-01,Total,2459,Vietnam
4,2006-05-01,Total,1918,Vietnam
...,...,...,...,...
715,2020-08-01,Others,1104,Vietnam
716,2020-09-01,Others,2674,Vietnam
717,2020-10-01,Others,6056,Vietnam
718,2020-11-01,Others,14623,Vietnam


In [87]:
temp_df = df[(df["variable"] == "Tourist") & (df["time"] >= "2007")].sort_values(["time", "country"])
fig = px.line(temp_df, x="time", y="value", color='country')
fig.update_layout(height=800, width=1600, title_text="Tourists by country over years",  xaxis_title="time", yaxis_title="value", font=dict(size=22))

fig.show()