In [3]:
import pandas as pd
import numpy as np

from prophet import Prophet
import matplotlib.pyplot as plt 
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [189]:
accomodation = pd.read_csv("accomodation_data.csv")
prefectures = pd.read_csv("Prefecture_Japanese_Area.tsv", sep="\t", thousands=",")

In [1]:
accomodation.head()

NameError: name 'accomodation' is not defined

In [191]:
accomodation.drop(["地域コード", "注記"], inplace=True, axis=1)
accomodation.rename(columns = {"時点": "time", "地域": "location", "延べ宿泊者数（総数）【人泊】": "accomodations"}, inplace=True)

In [192]:
accomodation.head()

Unnamed: 0,time,location,accomodations
0,2007年1月,全国,22136270
1,2007年2月,全国,23346870
2,2007年3月,全国,26997200
3,2007年4月,全国,23729790
4,2007年5月,全国,25636730


In [193]:
accomodation["time"] = accomodation["time"].str.replace('年','-').str.replace('月','')
accomodation = accomodation[~accomodation["time"].str.endswith("-")]
accomodation["time"] = pd.to_datetime(accomodation["time"].str.replace('年','-').str.replace('月',''), format='%Y-%m') 

In [194]:
accomodation

Unnamed: 0,time,location,accomodations
0,2007-01-01,全国,22136270
1,2007-02-01,全国,23346870
2,2007-03-01,全国,26997200
3,2007-04-01,全国,23729790
4,2007-05-01,全国,25636730
...,...,...,...
8059,2020-08-01,沖縄県,849500
8060,2020-09-01,沖縄県,884730
8061,2020-10-01,沖縄県,1285240
8062,2020-11-01,沖縄県,1415910


In [195]:
prefectures.head()

Unnamed: 0,Prefecture,Japanese,Area
0,Aichi,愛知県,5153.81
1,Akita,秋田県,11612.11
2,Aomori,青森県,9606.26
3,Chiba,千葉県,5156.15
4,Ehime,愛媛県,5676.44


In [196]:
assert(set(accomodation.location.to_list()) == set(prefectures.Japanese.to_list()))

In [197]:
accomodation = accomodation.merge(prefectures, left_on="location", right_on="Japanese")

In [198]:
accomodation.head()

Unnamed: 0,time,location,accomodations,Prefecture,Japanese,Area
0,2007-01-01,全国,22136270,Japan,全国,377915.0
1,2007-02-01,全国,23346870,Japan,全国,377915.0
2,2007-03-01,全国,26997200,Japan,全国,377915.0
3,2007-04-01,全国,23729790,Japan,全国,377915.0
4,2007-05-01,全国,25636730,Japan,全国,377915.0


In [199]:
accomodation.drop(["location", "Japanese"], axis=1, inplace=True)
accomodation.rename(columns = {"Prefecture": "location"}, inplace=True)

In [200]:
accomodation.to_csv("accomodation_processed.tsv", sep="\t", index=False)

In [4]:
accomodation = pd.read_csv("accomodation_processed.tsv", sep="\t")

In [5]:
accomodation.head()

Unnamed: 0,time,accomodations,location,Area
0,2007-01-01,22136270,Japan,377915.0
1,2007-02-01,23346870,Japan,377915.0
2,2007-03-01,26997200,Japan,377915.0
3,2007-04-01,23729790,Japan,377915.0
4,2007-05-01,25636730,Japan,377915.0


In [6]:
prophet_japan = accomodation[accomodation["location"] == "Japan"][["time", "accomodations"]]
prophet_japan.columns = ["ds", "y"]
prophet_japan["ds"] = pd.to_datetime(prophet_japan["ds"])

In [7]:
prophet_japan

Unnamed: 0,ds,y
0,2007-01-01,22136270
1,2007-02-01,23346870
2,2007-03-01,26997200
3,2007-04-01,23729790
4,2007-05-01,25636730
...,...,...
163,2020-08-01,26149030
164,2020-09-01,26020820
165,2020-10-01,32412890
166,2020-11-01,34501310


In [9]:
prophet_japan_2020 = prophet_japan[(prophet_japan["ds"] >= "2020-01-01")]
prophet_japan_before_2020 = prophet_japan[(prophet_japan["ds"] < "2020-01-01")]

In [10]:
m_japan = Prophet()
m_japan.fit(prophet_japan_before_2020)
future = pd.DataFrame(prophet_japan["ds"], columns=["ds"])
prediction = m_japan.predict(future)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


In [11]:
prediction.tail()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
163,2020-08-01,51521550.0,61563780.0,66077380.0,51359230.0,51678650.0,12353810.0,12353810.0,12353810.0,12353810.0,12353810.0,12353810.0,0.0,0.0,0.0,63875360.0
164,2020-09-01,51739400.0,49081410.0,53513280.0,51541560.0,51947010.0,-516313.6,-516313.6,-516313.6,-516313.6,-516313.6,-516313.6,0.0,0.0,0.0,51223090.0
165,2020-10-01,51950220.0,51058260.0,55311560.0,51717680.0,52205640.0,1205144.0,1205144.0,1205144.0,1205144.0,1205144.0,1205144.0,0.0,0.0,0.0,53155360.0
166,2020-11-01,52168070.0,49525130.0,54137850.0,51900350.0,52461410.0,-433148.8,-433148.8,-433148.8,-433148.8,-433148.8,-433148.8,0.0,0.0,0.0,51734920.0
167,2020-12-01,52378890.0,47350670.0,51804340.0,52068510.0,52710280.0,-2854831.0,-2854831.0,-2854831.0,-2854831.0,-2854831.0,-2854831.0,0.0,0.0,0.0,49524060.0


In [12]:
def plot_prophet(original, prediction, title):
    trace1 = go.Scatter(
    x=original["ds"],
    y=original["y"],
    mode='markers',
    name = "Actual",
    marker_color='rgba(152, 0, 0, .8)',
    marker={'size': 15}
    )

    trace2 = go.Scatter(
        x=prediction["ds"],
        y=prediction["yhat"],
        name = "Prophet's fit and prediction",
        marker_color='rgb(129, 119, 185, 1)'
    )

    trace3 = go.Scatter(
        x=prediction["ds"],
        y=prediction["yhat_upper"],
        mode='lines',
            marker=dict(color="#444"),
            line=dict(width=0),
            showlegend=False
    )

    trace4 = go.Scatter(
        x=prediction["ds"],
        y=prediction["yhat_lower"],
        marker=dict(color="#444"),
            line=dict(width=0),
            mode='lines',
            fillcolor='rgba(78, 119, 185, 0.4)',
            fill='tonexty',
            showlegend=False
    )



    fig = make_subplots()
    
    fig.add_trace(trace2)
    fig.add_trace(trace3)
    fig.add_trace(trace4)
    fig.add_trace(trace1)
    #fig.write_image("fig1.png")
    fig.update_layout(height=800, width=1600, title_text=title,  xaxis_title="Time", yaxis_title="Accomodations", font=dict(size=22))
    return fig

In [13]:
fig_1 = plot_prophet(prophet_japan, prediction, "Accomodation counts in Japan between 2007 and 2020")
fig_1.show()

In [14]:
def potential_loss(prediction, actual, time):
    temp_df = prediction[prediction["ds"] >= time][["ds", "yhat"]]
    temp_df = temp_df.merge(prophet_japan[prophet_japan["ds"] >= time][["ds", "y"]], left_on="ds", right_on="ds")
    temp_df["diff"] = temp_df["yhat"] - temp_df["y"]

    return temp_df["diff"].sum()

In [15]:

potential_loss(prediction, prophet_japan, "2020-02-01")

305851470.88618195

In [16]:
accomodation.head()

Unnamed: 0,time,accomodations,location,Area
0,2007-01-01,22136270,Japan,377915.0
1,2007-02-01,23346870,Japan,377915.0
2,2007-03-01,26997200,Japan,377915.0
3,2007-04-01,23729790,Japan,377915.0
4,2007-05-01,25636730,Japan,377915.0


In [17]:
prophet_japan.head()

Unnamed: 0,ds,y
0,2007-01-01,22136270
1,2007-02-01,23346870
2,2007-03-01,26997200
3,2007-04-01,23729790
4,2007-05-01,25636730


In [21]:
prophet_japan['previous_year'] = prophet_japan.groupby([prophet_japan['ds'].dt.month, prophet_japan['ds'].dt.day])['y'].shift()

In [25]:
prophet_japan[(prophet_japan["ds"] > "2008") & (prophet_japan["ds"] < "2010")]

Unnamed: 0,ds,y,previous_year
13,2008-02-01,23734590,23346870.0
14,2008-03-01,27229850,26997200.0
15,2008-04-01,23642670,23729790.0
16,2008-05-01,26128970,25636730.0
17,2008-06-01,23402130,23664790.0
18,2008-07-01,26565860,25362320.0
19,2008-08-01,33652720,33935790.0
20,2008-09-01,24941780,25650700.0
21,2008-10-01,27605970,27430380.0
22,2008-11-01,26884920,27087720.0


In [28]:
def yoy(df):
    df['previous_year'] = df.groupby([df['ds'].dt.month, df['ds'].dt.day])['y'].shift()
    df["yoy"] = 100 * (df["y"] - df["previous_year"]) / df["previous_year"]
    return df

In [30]:
df = yoy(prophet_japan)

In [32]:
df[(df["ds"] > "2019")]

Unnamed: 0,ds,y,previous_year,yoy
145,2019-02-01,43539370,38899650.0,11.927408
146,2019-03-01,51147600,45644860.0,12.055552
147,2019-04-01,50718730,42509270.0,19.312164
148,2019-05-01,51402690,44445160.0,15.65419
149,2019-06-01,45810390,41406270.0,10.63636
150,2019-07-01,51780530,47142860.0,9.837481
151,2019-08-01,63234040,59715640.0,5.891924
152,2019-09-01,48761240,44439890.0,9.724034
153,2019-10-01,50052850,46582920.0,7.448932
154,2019-11-01,49659370,45282650.0,9.665335


In [74]:
okinawa_df = accomodation[accomodation["location"] == "Okinawa"][["time", "accomodations"]]
okinawa_df.columns = ["ds", "y"]
okinawa_df["ds"] = pd.to_datetime(okinawa_df["ds"])

In [76]:
okinawa_df = yoy(okinawa_df)

In [77]:
okinawa_df[(okinawa_df["ds"] > "2019")]

Unnamed: 0,ds,y,previous_year,yoy
8041,2019-02-01,2411270,1932180.0,24.795309
8042,2019-03-01,2831050,2369460.0,19.48081
8043,2019-04-01,2784430,2105270.0,32.259995
8044,2019-05-01,2721360,2058330.0,32.212036
8045,2019-06-01,2695380,2193470.0,22.882009
8046,2019-07-01,3269120,2489970.0,31.291542
8047,2019-08-01,3523720,2900190.0,21.499626
8048,2019-09-01,2707780,2444030.0,10.791602
8049,2019-10-01,2713450,2365710.0,14.699181
8050,2019-11-01,2461550,2124010.0,15.891639


In [78]:
hokkaido_df = accomodation[accomodation["location"] == "Hokkaidō"][["time", "accomodations"]]
hokkaido_df.columns = ["ds", "y"]
hokkaido_df["ds"] = pd.to_datetime(hokkaido_df["ds"])
hokkaido_df = yoy(hokkaido_df)
hokkaido_df[(hokkaido_df["ds"] > "2019")]

Unnamed: 0,ds,y,previous_year,yoy
313,2019-02-01,3211400,3084420.0,4.116819
314,2019-03-01,2878620,2740050.0,5.057207
315,2019-04-01,2415820,2135820.0,13.109719
316,2019-05-01,3016680,2731360.0,10.446078
317,2019-06-01,3086700,3095940.0,-0.298455
318,2019-07-01,3634090,3626480.0,0.209845
319,2019-08-01,3916250,3952410.0,-0.914885
320,2019-09-01,3247260,2547810.0,27.452989
321,2019-10-01,3046530,2886370.0,5.548838
322,2019-11-01,2458660,2505860.0,-1.883585


In [73]:
import plotly.express as px

df = accomodation[accomodation["location"].isin(["Tokyo",  "Osaka", "Hokkaidō",  "Okinawa", "Kyoto"])].sort_values("time")
fig = px.line(df, x="time", y="accomodations", color='location')
fig.update_layout(height=800, width=1600, title_text="Accomodations of the top six most visited locations in Japan",  xaxis_title="Time", yaxis_title="Accomodations", font=dict(size=22))

#fig.show()
#fig.write_html("newplot.html")
fig.write_image("fig1.png")

In [66]:
df[(df["time"] < "2008") &  (df["location"] == "Chiba")]

Unnamed: 0,time,accomodations,location,Area


In [45]:
df = accomodation[accomodation["location"].isin(["Tokyo", "Japan"])]
df

Unnamed: 0,time,accomodations,location,Area
0,2007-01-01,22136270,Japan,377915.00
1,2007-02-01,23346870,Japan,377915.00
2,2007-03-01,26997200,Japan,377915.00
3,2007-04-01,23729790,Japan,377915.00
4,2007-05-01,25636730,Japan,377915.00
...,...,...,...,...
2347,2020-08-01,1685530,Tokyo,2187.08
2348,2020-09-01,1776060,Tokyo,2187.08
2349,2020-10-01,2670360,Tokyo,2187.08
2350,2020-11-01,3047500,Tokyo,2187.08


In [48]:
temp_df = pd.pivot_table(df, values='accomodations', index=['time'], columns=['location'])
temp_df

location,Japan,Tokyo
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2007-01-01,22136270,2693010
2007-02-01,23346870,2909700
2007-03-01,26997200,3305360
2007-04-01,23729790,3212150
2007-05-01,25636730,3140590
...,...,...
2020-08-01,26149030,1685530
2020-09-01,26020820,1776060
2020-10-01,32412890,2670360
2020-11-01,34501310,3047500


In [49]:
temp_df["percent"] = 100 * temp_df["Tokyo"] / temp_df["Japan"]

In [51]:
temp_df["percent"].mean()

11.610283269000146

In [72]:
accomodation[(accomodation["time"]  >= "2019-01-01") & (accomodation["time"]  < "2020-01-01") ].groupby("location").sum(["accomodations"]).nlargest(11, "accomodations")

Unnamed: 0_level_0,accomodations,Area
location,Unnamed: 1_level_1,Unnamed: 2_level_1
Japan,595921480,4534980.0
Tokyo,78981720,26244.96
Osaka,47427520,22718.16
Hokkaidō,36983420,1001429.64
Okinawa,32865650,27255.6
Kyoto,30749560,55355.16
Chiba,29229110,61873.8
Kanagawa,23883900,28985.04
Shizuoka,23429450,87943.32
Fukuoka,20420370,59652.12
