## Delay Report

In [1]:
# Imports
import json
import os
import random
import time
from datetime import datetime, timedelta
from io import StringIO
from pathlib import Path
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm
# from requests_html import HTMLSession, AsyncHTMLSession

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# pd.options.display.max_rows = None

In [3]:
from delay_report import write_json, read_config

In [4]:
# Load the settings workbook: each row contributes one 'Field' -> 'Value' entry.
config = {
    row['Field']: row['Value']
    for row in pd.read_excel('data/Configurations.xlsx').to_dict('records')
}

# Lookup from a 'Fwd Agent' value to the carrier whose portal should be queried.
carrier_mapping = {
    row['Fwd Agent']: row['Carrier']
    for row in pd.read_excel('data/Carrier Mapping.xlsx').to_dict('records')
}

# Disabled: UN/LOCODE -> port name mapping built from the UNLOCODE CSV parts.
# # Prepare base information
# # UNLOCODE to port name mapping
# port_mapping = (
#     pd.concat([pd.read_csv(p, usecols=[1, 2, 4, 5], engine='python', names=[
#               'country', 'port', 'name', 'subdiv']) for p in Path('data').glob("*UNLOCODE CodeListPart*")])
#     .query('port == port')
#     .assign(
#         uncode=lambda x: x.country.str.cat(x.port),
#         full_name=lambda x: np.where(
#             x.subdiv.notnull(), x.name.str.cat(x.subdiv, sep=", "), x.name)
#     )
#     .drop_duplicates('uncode')
#     .set_index('uncode')
#     .to_dict('index')
# )

# Open the vessel delay tracking workbook; individual sheets are parsed later.
xl = pd.ExcelFile('Vessel Delay Tracking.xlsx')

### HAPAG

In [None]:
# Sheets are named by date (dd.mm.YYYY). Parse every sheet name and keep the
# ones that are valid dates, indexed by their ORIGINAL name.
sheet_dates = pd.Series(
    pd.to_datetime(xl.sheet_names, errors='coerce', format='%d.%m.%Y'),
    index=xl.sheet_names,
).dropna()
if sheet_dates.empty:
    raise ValueError(
        "No sheet named like 'dd.mm.YYYY' found in the vessel delay tracking workbook"
    )

# Look the sheet up by its original name rather than a re-formatted date:
# a sheet called '1.2.2021' would otherwise be requested as '01.02.2021'.
latest_sheet_name = sheet_dates.idxmax()

# Forwarding agents that the carrier mapping assigns to HAPAG.
hapag_agents = [agent for agent, carrier in carrier_mapping.items() if carrier == 'HAPAG']

hapag_sheet = (
    xl.parse(latest_sheet_name, parse_dates=True)
    # isin() instead of a string-built query: no dependence on how the agent
    # names happen to render inside an expression string.
    .loc[lambda df: df['Fwd Agent'].isin(hapag_agents)]
    # Normalise the agent names to the carrier name.
    .replace({'Fwd Agent': carrier_mapping})
)

In [None]:
# Lookup from 'Port Code' to 'Port Name', read from the Hapag port code workbook.
port_mapping = {
    row['Port Code']: row['Port Name']
    for row in pd.read_excel('data/Hapag Port Code Mapping.xlsx').to_dict('records')
}

In [None]:
# Get port name
delay_sheet = hapag_sheet.assign(pol_name=lambda x: x['Port of Loading'],
                                 pol_code=lambda x: x['Port of Loading'],
                                 pod_name=lambda x: x['Port of discharge'],
                                 pod_code=lambda x: x['Port of discharge']).copy()

interval = (2,5)
session = requests.Session()

In [None]:
# Translate the values in both name columns through port_mapping; values without
# an entry in the mapping are left as they are.
for name_column in ('pol_name', 'pod_name'):
    delay_sheet[name_column] = delay_sheet[name_column].replace(port_mapping)

In [None]:
# One row per distinct (loading port, discharge port) pair, sorted by name,
# keeping only rows where all four name/code fields are present.
key = ['pol_name', 'pod_name']
route_columns = key + ['pol_code', 'pod_code']
reduced_df = (
    delay_sheet
    .drop_duplicates(subset=key)
    .loc[:, route_columns]
    .sort_values(by=key)
    .dropna()
)

In [None]:
def get_schedules(pol_name: str, pod_name: str, pol_code: str, pod_code: str,
                  timeout: float = 30):
    """Request the Hapag-Lloyd interactive schedule page for one port pair.

    The search window starts on the first day of the current month and spans
    six weeks (``weeksAfterStart=6``).

    Args:
        pol_name: Port of loading name, sent as the ``sn`` parameter.
        pod_name: Port of discharge name, sent as the ``en`` parameter.
        pol_code: Port of loading code, sent as the ``sl`` parameter.
        pod_code: Port of discharge code, sent as the ``el`` parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The response object returned by the shared module-level ``session``.
    """
    first_day = datetime.today().replace(day=1).strftime('%Y-%m-%d')
    # Percent-encode every value (spaces as %20, commas as %2C, and also '&',
    # '/', '#', non-ASCII, ...) so a port name can never corrupt the query string.
    query = urlencode(
        [
            ('sn', pol_name),
            ('sl', pol_code),
            ('sp', 3000),  # NOTE(review): meaning of sp=3000 is not documented here
            ('en', pod_name),
            ('el', pod_code),
            ('ep', ''),
            ('exportHaulage', 'MH'),
            ('importHaulage', 'MH'),
            ('departureDate', first_day),
            ('weeksAfterStart', 6),
            ('reefer', 'N'),
        ],
        quote_via=quote,
        safe='',
    )
    url = f'https://www.hapag-lloyd.com/en/online-business/schedules/interactive-schedule.html?{query}'
    return session.get(url, timeout=timeout)

In [None]:
# Trial request for the first route in reduced_df.
first_route = {
    column: reduced_df[column].iloc[0]
    for column in ('pol_name', 'pod_name', 'pol_code', 'pod_code')
}
r = get_schedules(**first_route)

### TODO: Use Selenium with a headless browser driver (e.g. headless Chrome/Firefox; PhantomJS is no longer maintained)

In [None]:
# Collect one schedule page per route, reading from a local file cache when
# possible. NOTE: despite its name, response_jsons holds raw HTML text.
response_jsons = []
for row in tqdm(reduced_df.itertuples(), total=len(reduced_df)):
    response_filename = f'Hapag {row.pol_name}-{row.pod_name}.html'
    cache_path = Path(response_filename)

    # Already downloaded: reuse the cached page (no directory rescan per row).
    if cache_path.is_file():
        response_jsons.append(cache_path.read_text(encoding='utf-8'))
        continue

    response = get_schedules(row.pol_name, row.pod_name, row.pol_code, row.pod_code)
    response_jsons.append(response.text)
    # Cache only successful, non-empty pages. An HTTP error page written to disk
    # would be served from the cache forever and never re-fetched.
    if response.ok and response.text:
        # Explicit UTF-8 so the page round-trips regardless of the OS default.
        cache_path.write_text(response.text, encoding='utf-8')
    # Random pause between requests to avoid hammering the server.
    time.sleep(random.randint(*interval))

In [None]:
# Parse the HTML tables out of every collected page. The text is wrapped in
# StringIO because passing literal HTML strings to pd.read_html is deprecated
# (pandas >= 2.1); file-like input works on old and new versions alike.
response_intermediate = [pd.read_html(StringIO(html)) for html in response_jsons]

In [None]:
# Preview the start of the last collected page. This reads from response_jsons
# instead of the loop variable `response`, which is undefined on a fresh kernel
# when every route is served from the cache.
response_jsons[-1][:1000] if response_jsons else ''