In [1]:
import os
os.chdir("../../")
import math
import re
import numpy as np
import pandas as pd
import pdfplumber
import tabula
import PyPDF2
import warnings
warnings.filterwarnings("ignore")

In [2]:
pdf_path = os.getcwd() + "/data/tourism/vanuatu/2013-TM-06-June_News.pdf"

## Tonga 
### Table 1

In [6]:
tonga_lsts = os.listdir("data/tourism/tonga")
filepaths = list()
for path in tonga_lsts:
    folder_path = os.getcwd() + "/data/tourism/tonga/"
    if "Dec" in path:
        filepaths.append(folder_path + path)
    elif "2021" and "Bulletin" in path:
        filepaths.append(folder_path + path)
    else:
        pass

['/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Statistical-Bulletin-on-International-Arrivals-and-Departures-2021.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Migration-Report-Dec-2017.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Migration-December-Report-2019.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-Report-2014.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-Report-2015.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Migration-December-Report-2020.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-Report-2013.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-Migration-Report-Dec-2016.pdf',
 '/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-2012.pdf']

In [4]:
def locate_table(filepath: str,
                 search_string: str,
                 ignore_case=False):

    search_lst = list()
    reader = PyPDF2.PdfReader(filepath)

    for page_num, page in enumerate(reader.pages):
        try:
            page_text = page.extract_text()
            hits = None
            if ignore_case == False:
                hits = re.search(search_string, page_text.lower())
            else:
                hits = re.search(
                    search_string, page_text.lower(), re.IGNORECASE)

            if hits:
                search_lst.append(page_num+1)
        except:
            pass
    return {"table_loc": search_lst}


def load_pdf(filepath: str,
             search_string: str,
             table_page: int,
             table_seq=0):

    table_loc = locate_table(filepath, search_string,
                             ignore_case=True)["table_loc"]
    if len(table_loc) != 0:
        table_page = table_loc[-1]
        dfs = tabula.read_pdf(filepath, pages=table_page, stream=True)
        if len(dfs) > 1:
            print(f"The page has {len(dfs)} tables.")
            df = dfs[table_seq]

        else:
            df = dfs[0]
            df.columns = df.iloc[0, :].to_list()
    else:
        dfs = tabula.read_pdf(filepath, pages="all", stream=True)
        df = dfs[table_page]
        df.columns = df.iloc[0, :].to_list()

    df = df.iloc[1:].reset_index().drop("index", axis=1)

    return df


def split_time(df: pd.DataFrame,
               time_var: str):

    year_idx, month_idx = list(), list()
    for idx in df.index:
        if (str(df[time_var][idx]).isdigit() == True):
            year_idx.append(idx)
        else:
            month_idx.append(idx)

    latest_year_idx = max(year_idx)

    return latest_year_idx, year_idx, month_idx


def detect_year(series: pd.Series):
    nacheck = pd.isna(series)
    start_year = int(series[nacheck == False][0])
    return start_year


def generate_time(df: pd.DataFrame,
                  start_year: int):

    years = [start_year + idx // 12 for idx in df.index]
    df["Year"] = years

    return df


def remove_separator(df: pd.DataFrame):

    colnames = df.columns
    for col in colnames:
        try:
            if df[col].dtype == "O":
                df[col] = (df[col].str.replace(",", "")
                                  .str.replace("-", "")
                                  .str.replace("(", "")
                                  .str.replace(")", "")
                                  .str.replace(" ", ""))
        except:
            print(col, "might have an error.")

    return df


def separate_data(df: pd.DataFrame,
                  var: str,
                  split_rule: str):

    splited_lst = var.split(split_rule)
    var_number = len(splited_lst)

    obj = dict()
    for i in range(var_number):
        obj[str(splited_lst[i])] = []

    for i in df[var]:
        elems = i.split(" ")
        length = len(elems)
        if length == var_number:
            idx, var = 0, list(obj.keys())
            while idx < length:
                key, val = var[idx], elems[idx]
                obj[key].append(val)
                idx += 1

        elif length < var_number:
            idx, var = 0, list(obj.keys())
            while idx < length and len(elems) != 0:
                key, val = var[idx], elems[idx].split(" ")[0]
                obj[key].append(val)
                elems = i.replace(val, "").strip()
                idx += 1
            else:
                key, val = var[idx], 0
                obj[key].append(val)
                idx += 1

        else:
            idx, var = 0, list(obj.keys())
            while idx < var_number:
                key, val = var[idx], elems[idx]
                obj[key].append(val)
                idx += 1
            else:
                key, val = var[-1], elems[idx]
                prev_val = obj[key][-1]
                obj[key][-1] = prev_val + val

    for i in range(var_number):
        df[str(splited_lst[i])] = obj[list(obj.keys())[i]]

    return df


def check_quality(df: pd.DataFrame,
                  exclude_vars: list,
                  sum_var: str):

    new_df = df.iloc[:, ~df.columns.isin(exclude_vars)]
    checked_vars = new_df.columns[~new_df.columns.isin([sum_var])].to_list()

    for idx in new_df.index:
        row_sum = 0
        for var in checked_vars:
            val = new_df[var][idx]
            if math.isnan(float(val)) != True:
                row_sum += float(val)
            else:
                row_sum += 0
        if float(new_df[sum_var][idx]) == row_sum:
            pass
        else:
            return False

    return True

In [7]:
months = pd.DataFrame()

for file in filepaths[:-1]:
    print(file)

    df = load_pdf(file, "Monthly Arrival and Departure", table_page=-5)
    latest_year, year_idx, month_idx = split_time(df, "Period")
    month = df.iloc[month_idx, 0:4]
    start_year = detect_year(df.iloc[month_idx].iloc[0])

    month = (month.dropna(how="all").reset_index()
             .drop("index", axis=1))

    print(f"The file starts from {start_year}.")

    month = separate_data(month, "Air Ship", " ").drop("Air Ship", axis=1)
    month = remove_separator(month)
    month = month.replace(r'^\s*$', 0, regex=True)

    if check_quality(month, ["Period", "Year"], "Total") == False:
        name = file.split("/")[-1].split(".")[0]
        print("  ", name, "could go wrong!")

    generate_time(month, start_year)
    months = pd.concat([months, month], axis=0)
    
months = (months[["Year", "Period", "Air", "Ship", "Yacht", "Total"]]
          .drop_duplicates()
          .sort_values(by="Year")
          .reset_index()
          .drop("index", axis=1))

months

/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Statistical-Bulletin-on-International-Arrivals-and-Departures-2021.pdf
The file starts from 2018.
   Statistical-Bulletin-on-International-Arrivals-and-Departures-2021 could go wrong!
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Migration-Report-Dec-2017.pdf
The file starts from 2013.
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Migration-December-Report-2019.pdf
The file starts from 2018.
   Migration-December-Report-2019 could go wrong!
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-Report-2014.pdf


Got stderr: Dec 08, 2022 3:40:26 PM org.apache.pdfbox.pdmodel.PDDocument importPage
Dec 08, 2022 3:40:26 PM org.apache.pdfbox.pdmodel.PDDocument importPage



The file starts from 2010.
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-Report-2015.pdf


Got stderr: Dec 08, 2022 3:40:33 PM org.apache.pdfbox.pdmodel.PDDocument importPage
Dec 08, 2022 3:40:33 PM org.apache.pdfbox.pdmodel.PDDocument importPage



The file starts from 2010.
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/Migration-December-Report-2020.pdf
The file starts from 2018.
   Migration-December-Report-2020 could go wrong!
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-December-Migration-Report-2013.pdf


Got stderr: Dec 08, 2022 3:40:43 PM org.apache.pdfbox.pdmodel.PDDocument importPage
Dec 08, 2022 3:40:43 PM org.apache.pdfbox.pdmodel.PDDocument importPage



The file starts from 2010.
/Users/czhang/Desktop/pacific-observatory/data/tourism/tonga/12-Migration-Report-Dec-2016.pdf


Got stderr: Dec 08, 2022 3:40:51 PM org.apache.pdfbox.pdmodel.PDDocument importPage
Dec 08, 2022 3:40:51 PM org.apache.pdfbox.pdmodel.PDDocument importPage



The file starts from 2011.


Unnamed: 0,Year,Period,Air,Ship,Yacht,Total
0,2010,May,3670,5303,177,9150
1,2010,January,3158,646,4,3808
2,2010,February,2379,0,5,2384
3,2010,March,3134,853,5,3992
4,2010,April,2818,2802,30,5650
...,...,...,...,...,...,...
154,2021,Apr,2,0,0,2
155,2021,May,1,8,,18
156,2021,Feb,4,1,0,41
157,2021,Oct,8,,,8


## Vanuatu

In [7]:
pdf_path = os.getcwd() + "/data/tourism/vanuatu/2014-TM-12-December-News.pdf"

### Visitor Arrivals by Purpose of Visit

In [8]:
df = load_pdf(pdf_path, "Visitor Arrivals by Purpose of Visit", 6)
df.columns = df.iloc[0]

df = df.dropna(thresh=4, axis=1).replace("-", 0)
df = df.iloc[3:].reset_index().drop("index", axis=1)

splited = df["Conferences Stop Over"].str.split(" ", n=1, expand=True)
splited.columns = ["Conference", "Stopover"]
df = pd.concat([df, splited], axis=1)
df = remove_separator(df)

In [9]:
vu_lsts = os.listdir("data/tourism/vanuatu")
dec_lst = [file for file in vu_lsts if "Dec" in file]

In [10]:
error_dict = {
    "file": [],
    "reason": []
}


for file in dec_lst:
    if ".pdf" in file and "2010" not in file:

        print(f"{file} has started")
        filepath = os.getcwd() + "/data/tourism/vanuatu/" + file

        df = load_pdf(filepath, "Visitor Arrivals by Purpose of Visit", 6)
        df.columns = df.iloc[0]

        df = df.dropna(thresh=4, axis=1).replace("-", 0)
        df = df.iloc[3:].reset_index().drop("index", axis=1)

        try:
            col_lst = df.columns.to_list()
            stored_splited = ["Business, Stop",
                              "Cruiseship Other", "Conferences Stop Over"]

            for idx, val in enumerate(col_lst):
                if type(val) == str and val in stored_splited:
                    if val == "Business, Stop":
                        separate_data(df, "Business, Stop", ",")

                    elif val == "Conferences Stop Over":
                        splited = df[val].str.split(" ", n=1, expand=True)

                        if len(splited.columns) == 2:
                            splited.columns = [
                                val.split(" ")[0], val.split(" ")[-1]]
                            df = pd.concat([df, splited], axis=1)

                        else:
                            print(f"{file} has incompatible column.")
                            error_dict["file"].append(file)
                            error_dict["reason"].append("Incompatible Column")

                    else:
                        splited = df[val].str.split(" ", n=2, expand=True)

                        if len(splited.columns) == 2:
                            splited.columns = val.split(" ")
                            df = pd.concat([df, splited], axis=1)
                        else:
                            print("Incompatible Column")
                            error_dict["file"].append(file)
                            error_dict["reason"].append("Incompatible Column")

            df = remove_separator(df)

            try:
                df = df.drop(["Conferences Stop Over",
                             "Année", "Mois"], axis=1)

                if "Holidays" in df.columns:
                    df["Holidays"] = df["Holidays"].str.replace(" ", "")
                    saved_path = os.getcwd() + "/data/tourism/vanuatu/temp/" + \
                        file.split(".")[0] + ".csv"
                    df.to_csv(saved_path, encoding="utf-8")

                else:
                    print("  Holidays column not found.")
                    error_dict["file"].append(file)
                    error_dict["reason"].append("Holidays column not found.")

            except:
                print(f"  {file} does not find Année or Mois column.")
                error_dict["file"].append(file)
                error_dict["reason"].append("Année or Mois column not found.")

        except:
            error_dict["file"].append(file)
            error_dict["reason"].append("Column Error")

Tou12_December_News_2005.pdf has started
nan might have an error.
nan might have an error.
Other might have an error.
nan might have an error.
nan might have an error.
Other might have an error.
  Tou12_December_News_2005.pdf does not find Année or Mois column.
Tou12_December_News_2004.pdf has started
nan might have an error.
nan might have an error.
Other might have an error.
nan might have an error.
nan might have an error.
Other might have an error.
  Tou12_December_News_2004.pdf does not find Année or Mois column.
Tou12_December_News_2007.pdf has started
nan might have an error.
nan might have an error.
nan might have an error.
nan might have an error.
  Tou12_December_News_2007.pdf does not find Année or Mois column.
IAS_12_December_2015.pdf has started
2012-TM-12-December_News.pdf has started
  2012-TM-12-December_News.pdf does not find Année or Mois column.
IAS_12_December_2016.pdf has started
IVA_12_December_2021.pdf has started
2014-TM-12-December-News.pdf has started
2011-TM-

In [11]:
check_lst = os.listdir(os.getcwd() + "/data/tourism/vanuatu/temp")
check_lst = [os.getcwd() + "/data/tourism/vanuatu/temp/" + file for file in check_lst]

for file in check_lst:
    if ".DS_Store" not in file:
        df = pd.read_csv(file).drop("Unnamed: 0", axis=1)
        df = remove_separator(df)
        if check_quality(df, ["Year", "Month"], "Visitors"):
            df.to_csv(file, encoding="utf-8")
        else:
            print(f"{file} fails to pass the quality check.")

/Users/czhang/Desktop/pacific-observatory/data/tourism/vanuatu/temp/IVA_12_Dec_2020.csv fails to pass the quality check.
/Users/czhang/Desktop/pacific-observatory/data/tourism/vanuatu/temp/IVA_12_December_2021.csv fails to pass the quality check.


### Visitor Arrivals by Usual Country of Residence

In [12]:
bycountry_err_dict = {
    "file": [],
    "reason": []
}

for file in dec_lst:
    filepath = "/Users/czhang/Desktop/pacific-observatory/data/tourism/vanuatu/" + file
    print(file, locate_table(
        filepath, "Visitor Arrivals by Usual Country of Residence", ignore_case=True))
    try:
        df = load_pdf(
            filepath, "Visitor Arrivals by Usual Country of Residence", 2)
        df = df.iloc[:, :-2].dropna(thresh=4, axis=1)

        headers, row1 = df.columns.to_list(), df.iloc[0].to_list()
        newheader = list()
        for header, row in zip(headers, row1):
            if type(header) != str:
                newheader.append(str(row))
            else:
                newheader.append(str(header))

        newheader[-1] = "Total"
        newheader[newheader.index("Countries")], newheader[newheader.index(
            "nan")] = "Other PIC", "Europe"

        df.columns = newheader
        df = df.iloc[2:].reset_index().drop("index", axis=1)
        df = remove_separator(df)
        if check_quality(df, ["Month", "Year"], "Total"):
            print(f"  {file} pass the quality check.")
            saved_path = "/Users/czhang/Desktop/pacific-observatory/data/tourism/vanuatu/byorigin/" + \
                file.split(".")[0] + ".csv"
            df.to_csv(saved_path, encoding="utf-8")
        else:
            print(f"  {file} could have column errors")
            saved_path = "/Users/czhang/Desktop/pacific-observatory/data/tourism/vanuatu/byorigin/" + \
                file.split(".")[0] + ".csv"
            df.to_csv(saved_path, encoding="utf-8")
            bycountry_err_dict["file"].append(file)
            bycountry_err_dict["reason"].append("Column Error")
    except:
        print(f"  {file} has an error.")
        bycountry_err_dict["file"].append(file)
        bycountry_err_dict["reason"].append("Missing Error")

Tou12_December_News_2005.pdf {'table_loc': []}
  Tou12_December_News_2005.pdf has an error.
Tou12_December_News_2004.pdf {'table_loc': []}
  Tou12_December_News_2004.pdf has an error.
Tou12_December_News_2007.pdf {'table_loc': [3]}
  Tou12_December_News_2007.pdf has an error.
IAS_12_December_2015.pdf {'table_loc': [9]}
  IAS_12_December_2015.pdf pass the quality check.
2012-TM-12-December_News.pdf {'table_loc': [8]}
  2012-TM-12-December_News.pdf could have column errors
IAS_12_December_2016.pdf {'table_loc': [9]}
  IAS_12_December_2016.pdf pass the quality check.
IVA_12_December_2021.pdf {'table_loc': [6]}
  IVA_12_December_2021.pdf has an error.
2014-TM-12-December-News.pdf {'table_loc': [7]}
  2014-TM-12-December-News.pdf pass the quality check.
2011-TM-12-December_News.pdf {'table_loc': [6]}
  2011-TM-12-December_News.pdf could have column errors
2013-TM-12-December_News.pdf {'table_loc': [7]}
  2013-TM-12-December_News.pdf could have column errors
Tou12_December_2006.pdf {'table_l

In [13]:
pd.DataFrame(bycountry_err_dict)

Unnamed: 0,file,reason
0,Tou12_December_News_2005.pdf,Missing Error
1,Tou12_December_News_2004.pdf,Missing Error
2,Tou12_December_News_2007.pdf,Missing Error
3,2012-TM-12-December_News.pdf,Column Error
4,IVA_12_December_2021.pdf,Missing Error
5,2011-TM-12-December_News.pdf,Column Error
6,2013-TM-12-December_News.pdf,Column Error
7,Tou12_December_2006.pdf,Missing Error
8,TM12_December_2009_News.pdf,Column Error
9,TM12_December_2008_News.pdf,Missing Error


In [32]:
temp_lst = os.listdir("data/tourism/vanuatu/byorigin/")
temp_lst = [os.getcwd() + "/data/tourism/vanuatu/byorigin/" +
            file for file in temp_lst]

test = pd.read_csv(temp_lst[4])
test

Unnamed: 0.1,Unnamed: 0,Year,Month,Australie,New Zealand,Caledonia,Other PIC,Europe,North America,Japon,Countries,Total
0,0,2009.0,,64909,12607,9155,3708,4890,2549,642,2216,100675
1,1,2010.0,,58760,11927,11410,4719,4888,2395,517,2564,97180
2,2,2011.0,,57843,11399,11376,3397,5265,1922,630,2128,93960
3,3,2012.0,,65405,14430,13138,4313,5491,2094,705,2585,108161
4,4,2013.0,,65776,15068,12515,4874,5544,2614,659,3059,110109
5,5,2009.0,Dec,5631,688,1197,374,365,189,49,201,8693
6,6,2010.0,Dec,5995,746,1410,696,373,158,51,204,9633
7,7,2011.0,Dec,5785,759,1460,333,340,109,52,152,8990
8,8,2012.0,Jan,5653,757,1458,247,341,105,49,145,8755
9,9,,Feb,2387,354,814,403,367,130,59,186,4700


In [None]:
split_time()

## PDFMINER

In [5]:
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


output_string = StringIO()
with open(pdf_path, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        
output_string.getvalue()