In [1]:
import os
# Move the working directory four levels up so that the `scripts.python`
# imports below and the `os.getcwd() + "/data/..."` paths used later resolve
# (presumably this lands on the repository root — confirm).
# NOTE(review): a relative chdir is not idempotent; re-running this cell in
# the same kernel moves up another four levels.
os.chdir("../../../../")
import pandas as pd

# NOTE(review): star imports hide where helpers such as load_pdf,
# remove_separator, get_adf_df, cross_corr_df and grangers_causation_matrix
# come from; consider importing them by name.
from scripts.python.ts_utils import *
from scripts.python.utils import *
from scripts.python.PdfParse import *

In [2]:
# Root folder of the Solomon Islands tourism data and the sub-folder that
# holds the scraped PDF reports.
solomon_folder = os.getcwd() + "/data/tourism/solomon/"
scraping_folder = solomon_folder + "scraping/"

# os.listdir returns entries in arbitrary, platform-dependent order. The list
# is sorted so that positional indexing into it (and into lists filtered from
# it) picks the same file on every machine.
# NOTE(review): re-check that index-based picks further down
# (e.g. solomon_2020s[1]) still select the intended report.
solomon_pdfs = sorted(scraping_folder + file
                      for file in os.listdir(scraping_folder) if ".pdf" in file)

## Official Statistics
### Parsing

In [3]:
# Keep only the PDFs whose full path contains "2020".
solomon_2020s = list(filter(lambda path: "2020" in path, solomon_pdfs))

In [4]:
# Parse the table from the second 2020 PDF. load_pdf is a project helper;
# presumably it extracts the table matching "Table 3" on page 8 — confirm.
raw_table = load_pdf(filepath=solomon_2020s[1],
                     search_string="Table 3",
                     table_page=8)

# Keep the first 13 rows, drop columns that are entirely empty, then drop
# rows that have fewer than 3 non-missing values.
df = (raw_table
      .iloc[:13, :]
      .dropna(how="all", axis=1)
      .dropna(thresh=3, axis=0))
df.head(5)

Unnamed: 0,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NUMBER,NaN.6,NaN.7,NaN.8,NaN.9,NaN.10
0,January,1602,1235,1600,1383,1590,"1,415 1,259",1655,1514,2081,1750,1857
1,February,1422,1209,1658,1591,1544,"1,523 1,465",1707,1635,1855,1891,1471
2,March,1249,1766,1861,1677,2184,"1,816 1,675",1675,1732,2360,2378,752
3,April,1499,1820,1736,1839,2021,"1,514 1,750",1799,2013,2250,2106,2
4,May,1393,1137,1760,1968,1857,"1,462 1,681",1896,1851,2003,2434,23


In [5]:
# Number of years covered by the table: every column except the month column
# (-1) is one year, plus one extra year (+1) because one parsed column holds
# two years' values fused together (assumes exactly one such column with
# exactly two values — confirm against the PDF).
yr_range = len(df.columns) - 1 + 1
# The table is assumed to end in 2020; build the matching list of years.
yr_lst = list(range(2020 - yr_range + 1, 2020 + 1))

# The fused column is identified by its header: a string label longer than
# four characters that is not "Month" (the other headers are not strings).
fused_candidates = [label for label in df.columns
                    if isinstance(label, str) and len(label) > 4 and label != "Month"]
if not fused_candidates:
    # Previously the search loop silently fell through to the last column.
    raise ValueError(
        "Could not find the fused two-year column among the parsed headers: "
        f"{df.columns.to_list()}")

colname = fused_candidates[0]
col_idx = df.columns.to_list().index(colname)

In [6]:
## Expand the fused column and keep the column sequence by
## building separate before / split / after dataframes.
df_precol, df_postcol = df.iloc[:, :col_idx], df.iloc[:, col_idx + 1:]

# Number of year columns on each side of the fused column
# (the first column before it is the month label, hence the -1).
n_pre_years = col_idx - 1
n_post_years = len(df_postcol.columns)

# Left part: month label followed by the earliest years.
df_precol.columns = ["Month"] + yr_lst[:n_pre_years]
# Right part: the latest years. Slicing from an explicit start index (rather
# than yr_lst[-n:]) stays correct when there are no trailing columns.
df_postcol.columns = yr_lst[len(yr_lst) - n_post_years:]

# Split the fused cell on the space between its values, one column per value.
splitted = df.iloc[:, col_idx].str.split(" ", expand=True)
# The split columns take whatever years lie between the left and right parts.
# The previous slice, yr_lst[col_idx-1: -col_idx+1], was only right when the
# number of trailing columns happened to equal col_idx - 1.
splitted.columns = yr_lst[n_pre_years: len(yr_lst) - n_post_years]

In [7]:
# Stitch the three pieces back together in their original left-to-right
# order, then run the project helper remove_separator on the result
# (presumably it strips thousands separators such as "1,415" — confirm).
temp_df = remove_separator(
    pd.concat([df_precol, splitted, df_postcol], axis=1))

# Every column after the first (the month label) is cast to int in one call.
year_cols = temp_df.columns[1:]
temp_df = temp_df.astype({year_col: int for year_col in year_cols})

temp_df.head(5)

Unnamed: 0,Month,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,January,1602,1235,1600,1383,1590,1415,1259,1655,1514,2081,1750,1857
1,February,1422,1209,1658,1591,1544,1523,1465,1707,1635,1855,1891,1471
2,March,1249,1766,1861,1677,2184,1816,1675,1675,1732,2360,2378,752
3,April,1499,1820,1736,1839,2021,1514,1750,1799,2013,2250,2106,2
4,May,1393,1137,1760,1968,1857,1462,1681,1896,1851,2003,2434,23


In [8]:
# Transpose so that each row is a year and each column is a month.
transposed = temp_df.T
# After transposing, the first row carries the month labels; use it as the header.
transposed.columns = transposed.iloc[0].to_list()

# Drop the now-redundant "Month" row and expose the years (the old column
# labels, now the index) as a regular "Year" column.
without_header_row = transposed.drop(index="Month")
temp_df_tr = without_header_row.reset_index().rename(columns={"index": "Year"})

In [9]:
import calendar

# Map full month names ("January" ... "December") to their month number.
# calendar.month_name[0] is an empty string and is skipped.
month_dict = {name: number
              for number, name in enumerate(calendar.month_name) if name}

# Reshape to long format: one row per (Year, column label) pair.
temp_df_tr = temp_df_tr.melt(id_vars="Year")
temp_df_tr["month"] = temp_df_tr["variable"].map(month_dict)
# Labels that are not month names map to NaN and are dropped here.
temp_df_tr = temp_df_tr.dropna()
# Those NaNs force the column to float; cast back so the month is stored as
# 1..12 rather than 1.0..12.0.
temp_df_tr["month"] = temp_df_tr["month"].astype(int)
# First day of each month, built as "<year>-<month>-01" and parsed in one go.
temp_df_tr["date"] = pd.to_datetime(
    temp_df_tr["Year"].astype(str) + "-" + temp_df_tr["month"].astype(str) + "-01")
temp_df_tr.head(5)

Unnamed: 0,Year,variable,value,month,date
0,2009,January,1602,1.0,2009-01-01
1,2010,January,1235,1.0,2010-01-01
2,2011,January,1600,1.0,2011-01-01
3,2012,January,1383,1.0,2012-01-01
4,2013,January,1590,1.0,2013-01-01


In [10]:
# Final tidy layout: lower-case column names, month label column removed,
# columns in date / year / month / total order.
renamed = temp_df_tr.rename(columns={"value": "total", "Year": "year"})
temp_df_tr = renamed.drop(columns=["variable"])[["date", "year", "month", "total"]]

# Persist the monthly series for later use.
monthly_csv_path = solomon_folder + "intermediate/solomon_monthly_visitor.csv"
temp_df_tr.to_csv(monthly_csv_path, encoding="utf-8")

In [11]:
# Run the project helper get_adf_df on the monthly "total" series. Its output
# reports a test statistic, p-value, lags used and critical values
# (presumably an Augmented Dickey-Fuller unit-root test — confirm in ts_utils).
get_adf_df(temp_df_tr, ["total"])

Unnamed: 0,Test Statistic,p-value,# Lags Used,Number of Observations Used,Critical Value (1%),Critical Value (5%),Critical Value (10%)
total,-2.393654,0.143535,14.0,129.0,-3.482088,-2.884219,-2.578864


In [12]:
# Take the last two year columns (2019 and 2020) without the final row
# (presumably a non-month summary row — confirm), then stack them into one
# long "Total" column ordered year by year.
df_19_20 = (
    temp_df.iloc[:-1, -2:]
    .unstack()
    .reset_index()
    .rename(columns={0: "Total"})
)
df_19_20.head(5)

Unnamed: 0,level_0,level_1,Total
0,2019,0,1750
1,2019,1,1891
2,2019,2,2378
3,2019,3,2106
4,2019,4,2434


## Aviation Statistics

In [13]:
# Load the aviation workbook (per-country daily seats and flight counts,
# judging by the columns displayed below) from the data folder under the
# current working directory.
aviation_path = os.getcwd() + "/data/tourism/aviation_seats_flights_pic.xlsx"
aviation = pd.read_excel(aviation_path)
aviation.head(5)

Unnamed: 0,Country,ISO,Region,Date,Aircraft_type,Seats_arrivals_domestic,Seats_arrivals_interregional,Seats_arrivals_intraregional,Seats_arrivals_intl,Seats_arrivals_total,Available_seat_kilometers,Number_of_flights_domestic,Number_of_flights_interregional,Number_of_flights_intraregional,Number_of_flights_intl,Number_of_flights_total
0,Fiji,FJ,East Asia & Pacific,2019-01-01,passenger,839,273,3480,3753,4592,14304160.0,8,1,10,11,19
1,Fiji,FJ,East Asia & Pacific,2019-01-02,passenger,974,313,3471,3784,4758,14956100.0,8,1,10,11,19
2,Fiji,FJ,East Asia & Pacific,2019-01-03,passenger,1190,443,3675,4118,5308,15921430.0,10,2,12,14,24
3,Fiji,FJ,East Asia & Pacific,2019-01-04,passenger,831,586,3159,3745,4576,14573340.0,7,2,12,14,21
4,Fiji,FJ,East Asia & Pacific,2019-01-05,passenger,744,273,4752,5025,5769,17734490.0,7,1,12,13,20


In [24]:
# Measures to aggregate per month.
avi_cols = ["Number_of_flights_intl", "Number_of_flights_total",
            "Seats_arrivals_intl", "Seats_arrivals_total"]

# Passenger flights for the Solomon Islands (ISO code "SB") only.
is_sb_passenger = (aviation.ISO == "SB") & (aviation.Aircraft_type == "passenger")
sb_avi = aviation.loc[is_sb_passenger, ["Date"] + avi_cols].reset_index(drop=True)
# Make sure the join key is datetime *before* merging, so it matches `dates`.
sb_avi["Date"] = pd.to_datetime(sb_avi["Date"])

# Full daily calendar for 2019-2020; days without a record get NaN after the
# left merge and therefore contribute nothing to the monthly sums.
dates = pd.DataFrame({"Date": pd.date_range(start="2019-01-01",
                                            end="2020-12-31")})
sb_avi = dates.merge(sb_avi, how="left", on="Date")

sb_avi["Month"], sb_avi["Year"] = sb_avi["Date"].dt.month, sb_avi["Date"].dt.year

# Sum only the measure columns. Summing the whole frame would also try to sum
# the datetime "Date" column, which raises on pandas >= 2.0 and was silently
# dropped on older versions.
sb_avi_19_20 = (sb_avi.groupby(by=["Year", "Month"])[avi_cols]
                .sum()
                .reset_index())

In [15]:
# The two frames are glued side by side purely by row position, so both must
# hold one row per month in the same (year, month) order. Fail loudly if the
# row counts differ instead of silently producing NaN-padded or shifted rows.
if len(sb_avi_19_20) != len(df_19_20):
    raise ValueError(
        f"Cannot align aviation data ({len(sb_avi_19_20)} rows) with official "
        f"statistics ({len(df_19_20)} rows): row counts differ.")

sb_merged = pd.concat([sb_avi_19_20, df_19_20], axis=1).drop(["level_0", "level_1"], axis=1)
# Project helper; presumably strips thousands separators — confirm.
sb_merged = remove_separator(sb_merged)
sb_merged["Total"] = sb_merged["Total"].astype(float)

In [16]:
# Summary statistics for the official monthly totals and the monthly
# international seat arrivals.
sb_merged[["Total", "Seats_arrivals_intl"]].describe()

Unnamed: 0,Total,Seats_arrivals_intl
count,24.0,24.0
mean,1389.458333,6357.375
std,1160.284784,4291.692772
min,2.0,510.0
25%,49.25,1239.0
50%,1803.5,9278.0
75%,2446.5,9734.5
max,3053.0,11255.0


In [20]:
# Persist the merged monthly frame (aviation measures + official totals).
sb_merged.to_csv(solomon_folder + "intermediate/solomon_merged.csv", encoding="utf-8")

## EDA

### Correlation and Cross-correlation

In [28]:
from scipy.stats import pearsonr

# Pearson correlation of the official monthly totals with total and with
# international seat arrivals; the p-values are not used.
corr_seats, _ = pearsonr(sb_merged["Seats_arrivals_total"], sb_merged["Total"])
corr_seats_intl, _ = pearsonr(
    sb_merged["Seats_arrivals_intl"], sb_merged["Total"])

# The data here is for the Solomon Islands (ISO "SB"); the message previously
# said "VU's". The two sentences are one string so that the second line does
# not start with the separator space print() inserts between arguments.
print(f"Pearson Correlation between FlightRadar's Seats Arrival and SB's census data is{corr_seats: .4f}.\n"
      f"Pearson Correlation between FlightRadar's # of International Seats Arrival and SB's census data is{corr_seats_intl: .4f}.")

Pearson Correlation between FlightRadar's Seats Arrival and VU's census data is 0.9521.
 Pearson Correlation between FlightRadar's # of International Seats Arrival and VU's census data is 0.9503.


In [29]:
# Cross-correlation between international seat arrivals and official totals,
# one coefficient per lag (project helper; which series is shifted for a
# positive lag is defined inside cross_corr_df — TODO confirm).
sb_cc = cross_corr_df(sb_merged, "Seats_arrivals_intl", "Total")
sb_cc.head(5)

Unnamed: 0,lag,cross_corr_coef
0,0,0.950256
1,1,0.915791
2,2,0.791296
3,3,0.683567
4,4,0.496011


### Stationarity

In [30]:
# Columns checked for stationarity; reused by the differencing and Granger
# cells below. This call runs get_adf_df on the undifferenced series.
incl_cols = ["Total", "Seats_arrivals_intl"]
get_adf_df(sb_merged, incl_cols)

Unnamed: 0,Test Statistic,p-value,# Lags Used,Number of Observations Used,Critical Value (1%),Critical Value (5%),Critical Value (10%)
Total,-0.451626,0.901116,0.0,23.0,-3.752928,-2.9985,-2.638967
Seats_arrivals_intl,-0.559959,0.879775,9.0,14.0,-4.012034,-3.104184,-2.690987


In [31]:
# Difference the series once (sb_diff) and twice (sb_diff2).
# Note that the stationarity check below is run on the TWICE-differenced
# frame, sb_diff2, not on sb_diff.
sb_diff = sb_merged.diff().dropna()
sb_diff2 = sb_diff.diff().dropna()
get_adf_df(sb_diff2, incl_cols)

Unnamed: 0,Test Statistic,p-value,# Lags Used,Number of Observations Used,Critical Value (1%),Critical Value (5%),Critical Value (10%)
Total,-10.592363,6.478633999999999e-19,0.0,21.0,-3.788386,-3.013098,-2.646397
Seats_arrivals_intl,-12.078784,2.257563e-22,0.0,21.0,-3.788386,-3.013098,-2.646397


Both `Total` and `Seats_arrivals_intl` are stationary after differencing twice: the ADF test on `sb_diff2` rejects a unit root at the 1% level for both series.

### Granger Causality

In [33]:
# Pairwise Granger-causality matrix with up to 6 lags (project helper; the
# cells are presumably p-values for "column _x Granger-causes row _y" —
# confirm in ts_utils).
# NOTE(review): this uses the once-differenced sb_diff, whereas the ADF check
# above was run on sb_diff2 — confirm which differencing order is intended.
grangers_causation_matrix(sb_diff, incl_cols, maxlag=6)

Unnamed: 0,Total_x,Seats_arrivals_intl_x
Total_y,1.0,0.0067
Seats_arrivals_intl_y,0.0,1.0
