## Delay Report

In [1]:
# Imports
import json
import os
import random
import time
from datetime import datetime, timedelta
from io import StringIO
from pathlib import Path
from urllib.parse import quote, urlencode

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm
# from requests_html import HTMLSession, AsyncHTMLSession

In [74]:
# Lift pandas' display limits so frames are shown with every column and every row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [3]:
from delay_report import write_json, read_config

In [4]:
# Read configurations: every row of the sheet contributes one Field -> Value entry
config_rows = pd.read_excel('data/Configurations.xlsx').to_dict('index')
config = {row['Field']: row['Value'] for row in config_rows.values()}

# Used to map Fwd Agent column to the respective carrier portals
carrier_rows = pd.read_excel('data/Carrier Mapping.xlsx').to_dict('index')
carrier_mapping = {row['Fwd Agent']: row['Carrier'] for row in carrier_rows.values()}

# Read the vessel delay tracking file
xl = pd.ExcelFile('Vessel Delay Tracking.xlsx')

### ONE

In [15]:
# Bounds handed to random.randint to pick the pause between portal requests
interval = (2, 5)
port_id = {}
session = requests.Session()

# Forwarding agents that the carrier mapping assigns to ONE
one_agents = [agent for agent, carrier in carrier_mapping.items() if carrier == 'ONE']
# Columns removed from the tracking sheet before schedule data is merged back in
stale_columns = ['updated_etd', 'updated_eta', 'No. of days delayed ETD',
                 'No. of days delayed ETA', 'Reason of Delay']

# Default sheet of the workbook, restricted to ONE shipments, with the agent
# replaced by its carrier name and the stale columns dropped
delay_sheet = xl.parse()
delay_sheet = delay_sheet.query(f"`Fwd Agent` in {one_agents}")
delay_sheet = delay_sheet.replace({'Fwd Agent': carrier_mapping})
delay_sheet = delay_sheet.drop(stale_columns, axis=1).copy()

# Port Code -> Port Name lookup for ONE
port_rows = pd.read_excel('data/Port Code Mapping - ONE.xlsx').to_dict('index')
port_mapping = {row['Port Code']: row['Port Name'] for row in port_rows.values()}

# pol_name / pod_name start out as copies of the raw port columns
delay_sheet = delay_sheet.assign(pol_name=delay_sheet['Port of Loading'],
                                 pod_name=delay_sheet['Port of discharge'])

# NOTE(review): only pod_name is run through port_mapping here; the HAPAG
# section maps both pol_name and pod_name - confirm this asymmetry is intended.
delay_sheet['pod_name'] = delay_sheet['pod_name'].replace(port_mapping)

In [18]:
# Distinct (pol_name, pod_name) routes, sorted, without rows missing either port
key = ['pol_name', 'pod_name']
reduced_df = (
    delay_sheet
    .drop_duplicates(subset=key)
    .loc[:, key]
    .sort_values(by=key)
    .dropna()
)

In [35]:
def get_schedules(pol_name: str, pod_name: str, today: datetime = None):
    """Request schedules between two ports from the ONE e-commerce endpoint.

    The requested window runs from the first day of the current month to the
    25th of the following month.

    NOTE(review): a later cell defines another ``get_schedules`` (Hapag-Lloyd)
    that shadows this one once it has been executed.

    Args:
        pol_name: value sent as the ``por_cd`` form field.
        pod_name: value sent as the ``del_cd`` form field.
        today: reference date for the window; defaults to ``datetime.today()``.

    Returns:
        The response object returned by ``requests.post``.
    """
    url = "https://ecomm.one-line.com/ecom/CUP_HOM_3001GS.do"

    # Read the clock once so both ends of the window derive from the same instant.
    if today is None:
        today = datetime.today()
    first_day = today.replace(day=1)

    # `replace(month=today.month + 1)` raises ValueError in December (month 13),
    # so roll over into January of the next year explicitly.
    if today.month == 12:
        next_year, next_month = today.year + 1, 1
    else:
        next_year, next_month = today.year, today.month + 1
    last_day = first_day.replace(year=next_year, month=next_month, day=25)

    # urlencode escapes spaces, commas, '&' etc. in the port values, matching the
    # declared application/x-www-form-urlencoded content type.
    payload = urlencode([
        ('f_cmd', '3'),
        ('por_cd', pol_name),
        ('del_cd', pod_name),
        ('rcv_term_cd', 'Y'),
        ('de_term_cd', 'Y'),
        ('frm_dt', first_day.strftime('%Y-%m-%d')),
        ('to_dt', last_day.strftime('%Y-%m-%d')),
        ('ts_ind', ''),
        ('skd_tp', 'L'),
    ])
    headers = {
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://ecomm.one-line.com',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://ecomm.one-line.com/ecom/CUP_HOM_3001.do?sessLocale=en',
        'Accept-Language': 'en-GB,en;q=0.9',
    }

    return requests.post(url, headers=headers, data=payload)

# Collect one schedule payload per route. A route whose JSON file already exists
# in the working directory is loaded from disk instead of being requested again.
response_jsons = []
for row in tqdm(reduced_df.itertuples(), total=len(reduced_df)):
    response_filename = f'ONE {row.pol_name}-{row.pod_name}.json'

    # Direct existence check instead of re-listing the directory on every iteration
    if Path(response_filename).exists():
        with open(response_filename, 'r') as f:
            response_jsons.append(json.load(f))
        continue

    response = get_schedules(row.pol_name, row.pod_name)
    # Parse the body once; each .json() call re-parses the response content
    schedule_payload = response.json()
    response_jsons.append(schedule_payload)
    if len(schedule_payload):
        # Only non-empty payloads are cached, so empty ones are re-requested next run
        write_json(schedule_payload, response_filename)
    # Random pause between live requests
    time.sleep(random.randint(*interval))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [72]:
def get_relevant_fields(response, i):
    """Pick the fields of interest out of the i-th entry of ``response['list']``.

    Args:
        response: mapping with a ``'list'`` sequence of schedule entries.
        i: position of the entry to read.

    Returns:
        dict with keys ``pol_code``, ``pod_code``, ``Voyage``, ``Vessel``,
        ``updated_etd`` and ``updated_eta``.

    Raises:
        IndexError: when ``n1stVslNm`` contains no whitespace to split on.
    """
    entry = response['list'][i]

    # Only the first five characters of the yard codes are kept
    pol_code = entry['polYdCd'][:5]
    pod_code = entry['lstPodYdCd'][:5]

    # Split the name field at its last run of whitespace.
    # NOTE(review): 'Voyage' receives everything before the final token and
    # 'Vessel' the final token; if the field reads "<vessel name> <voyage no>"
    # these two labels are swapped - confirm against a real payload.
    name_parts = entry['n1stVslNm'].rsplit(maxsplit=1)
    leading_part = name_parts[0]
    trailing_token = name_parts[1]

    # NOTE(review): updated_eta is read from 'polEtaDt' - confirm this is the
    # arrival date the report wants.
    return {
        'pol_code': pol_code,
        'pod_code': pod_code,
        'Voyage': leading_part,
        'Vessel': trailing_token,
        'updated_etd': entry['polEtdDt'],
        'updated_eta': entry['polEtaDt'],
    }

# Flatten every entry of every non-empty payload into one row of response_df
schedule_rows = []
for schedule_payload in response_jsons:
    if not len(schedule_payload):
        continue
    for entry_idx in range(len(schedule_payload['list'])):
        schedule_rows.append(get_relevant_fields(schedule_payload, entry_idx))

response_df = pd.DataFrame(schedule_rows)

In [94]:
# Create reverse mapping from port_code to name
# NOTE(review): this is the ONE section, yet the workbook read here is the COSCO
# port mapping keyed on its 'First Name' column - confirm this is the intended file.
port_id_reversed = {row['First Name']: row['Port Name']
                    for row in pd.read_excel('data/Port Code Mapping - COSCO.xlsx')
                                 .to_dict('index').values()}

merge_key = ['pol_name', 'pod_name', 'Vessel', 'Voyage']
schedule_columns = ['updated_eta', 'updated_etd']

if len(response_df):
    response_df['pol_name'] = response_df.pol_code.map(port_id_reversed)
    response_df['pod_name'] = response_df.pod_code.map(port_id_reversed)

    # One row per route/vessel/voyage: the row that sorts first on updated_eta
    response_df = response_df.sort_values('updated_eta').drop_duplicates(merge_key)

    # Left-merge so every tracking row is kept; reset_index/set_index carries the
    # original row labels through the merge.
    delay_sheet = (delay_sheet.reset_index()
                   .merge(response_df[merge_key + schedule_columns],
                          on=merge_key, how='left')
                   .set_index('index'))
else:
    # No schedules came back: keep response_df as an empty frame with the
    # expected columns ...
    response_df = pd.DataFrame({
        'pol_name': [], 'pod_name': [],
        'Vessel': [], 'Voyage': [],
        'updated_eta': [], 'updated_etd': []})
    # ... and give delay_sheet the same two columns the merge branch adds, so the
    # sheet has the same columns whichever branch ran (resolves the old TODO that
    # suspected this branch should act on delay_sheet).
    delay_sheet = delay_sheet.assign(**{column: np.nan for column in schedule_columns})

### HAPAG

In [36]:
# Interpret the sheet names as dd.mm.YYYY dates (names that do not parse are
# coerced to NaT) and select the sheet carrying the latest date.
# NOTE(review): the ONE section reads the workbook's default sheet via
# xl.parse() instead - confirm the two sections are meant to differ.
sheet_dates = pd.to_datetime(xl.sheet_names, errors='coerce', format='%d.%m.%Y')
latest_sheet_name = sheet_dates.max().date().strftime('%d.%m.%Y')

# Forwarding agents that the carrier mapping assigns to HAPAG
hapag_agents = [agent for agent, carrier in carrier_mapping.items() if carrier == 'HAPAG']

hapag_sheet = xl.parse(latest_sheet_name, parse_dates=True)
hapag_sheet = hapag_sheet.query(f"`Fwd Agent` in {hapag_agents}")
hapag_sheet = hapag_sheet.replace({'Fwd Agent': carrier_mapping})

In [38]:
# Port Code -> Port Name lookup for HAPAG
hapag_port_rows = pd.read_excel('data/Port Code Mapping - HAPAG.xlsx').to_dict('index')
port_mapping = {row['Port Code']: row['Port Name'] for row in hapag_port_rows.values()}

In [39]:
# Get port name
delay_sheet = hapag_sheet.assign(pol_name=lambda x: x['Port of Loading'],
                                 pol_code=lambda x: x['Port of Loading'],
                                 pod_name=lambda x: x['Port of discharge'],
                                 pod_code=lambda x: x['Port of discharge']).copy()

interval = (2,5)
session = requests.Session()

In [40]:
# Swap the raw port values in both name columns for their port_mapping entries
for name_column in ('pol_name', 'pod_name'):
    delay_sheet[name_column] = delay_sheet[name_column].replace(port_mapping)

In [41]:
# Distinct (pol_name, pod_name) routes together with their codes, sorted by
# route, without rows that have any of the four values missing
key = ['pol_name', 'pod_name']
reduced_df = (
    delay_sheet
    .drop_duplicates(subset=key)
    .loc[:, key + ['pol_code', 'pod_code']]
    .sort_values(by=key)
    .dropna()
)

In [42]:
def get_schedules(pol_name: str, pod_name: str, pol_code: str, pod_code: str):
    """Fetch the Hapag-Lloyd interactive-schedule page for one route.

    The departure date sent is the first day of the current month.

    NOTE(review): this reuses the name of the ONE ``get_schedules`` defined in an
    earlier cell and shadows it once this cell has been executed.

    Args:
        pol_name: start location name (``sn`` query parameter).
        pod_name: end location name (``en`` query parameter).
        pol_code: start location code (``sl`` query parameter).
        pod_code: end location code (``el`` query parameter).

    Returns:
        The response object returned by ``session.get``.
    """
    # Percent-encode every reserved character (safe='' also escapes '/'), not just
    # commas and spaces, so values such as "A&B" cannot corrupt the query string.
    start_name = quote(pol_name, safe='')
    end_name = quote(pod_name, safe='')
    start_code = quote(str(pol_code), safe='')
    end_code = quote(str(pod_code), safe='')

    first_day = datetime.today().replace(day=1).strftime('%Y-%m-%d')
    url = f'https://www.hapag-lloyd.com/en/online-business/schedules/interactive-schedule.html?sn={start_name}&sl={start_code}&sp=3000&en={end_name}&el={end_code}&ep=&exportHaulage=MH&importHaulage=MH&departureDate={first_day}&weeksAfterStart=6&reefer=N'
    return session.get(url)

In [43]:
# Trial request for the first route in reduced_df
first_route_args = [reduced_df[column].iloc[0]
                    for column in ('pol_name', 'pod_name', 'pol_code', 'pod_code')]
r = get_schedules(*first_route_args)

In [44]:
# Display the raw body of the trial response.
# NOTE(review): the captured output below starts with an HTML page that appears to
# consist of <script> content rather than schedule tables - confirm before parsing it.
r.text

'<!DOCTYPE html>\r\n<html><head>\r\n<meta http-equiv="Pragma" content="no-cache"/>\r\n<meta http-equiv="Expires" content="-1"/>\r\n<meta http-equiv="CacheControl" content="no-cache"/>\r\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\r\n<link rel="shortcut icon" href="data:;base64,iVBORw0KGgo="/>\r\n\r\n<script>\r\n\r\n(function(){\r\nwindow["bobcmn"] = "111110111110102000000022000000052000000002a4b927ad200000096300000000300000000300000006/TSPD/300000008TSPD_101300000005https3000000b0081ecde62cab2000197d8704303c1d94cd372d9e5d00f6ef0afb7063dcd798e7c871f08391a8222f0898e6784a0a280029cee1825eeb770adc44313c824db5de966df9d4e07c4e804401eab2e34c3910c383370e9d1c9bbe200000000200000000";\r\n\r\nwindow.NVDX=!!window.NVDX;try{(function(){(function(){})();var OL=74;try{var sL,_L,IL=O(709)?0:1,JL=O(821)?0:1;for(var Lo=(O(232),0);Lo<_L;++Lo)IL+=O(588)?1:2,JL+=(O(764),3);sL=IL+JL;window.I_===sL&&(window.I_=++sL)}catch(oo){window.I_=sL}var Oo=!0;function io(L){var z=50;!L||document

### Next step: use Selenium with a headless browser (or the PhantomJS driver)

In [None]:
# Download (or load from disk) the Hapag-Lloyd schedule page for every route.
# NOTE: despite its name, response_jsons holds raw HTML strings in this section.
response_jsons = []
for row in tqdm(reduced_df.itertuples(), total=len(reduced_df)):
    response_filename = f'Hapag {row.pol_name}-{row.pod_name}.html'

    # Direct existence check instead of re-listing the directory on every iteration
    if Path(response_filename).exists():
        # NOTE(review): pages cached by an earlier run were written with the
        # platform default encoding; delete them if they fail to decode as UTF-8.
        with open(response_filename, 'r', encoding='utf-8') as f:
            response_jsons.append(f.read())
        continue

    response = get_schedules(row.pol_name, row.pod_name, row.pol_code, row.pod_code)
    # Decode the body once; every .text access decodes the content again
    page_html = response.text
    response_jsons.append(page_html)
    if len(page_html):
        # Explicit UTF-8 so writing does not depend on the platform default encoding
        with open(response_filename, 'w', encoding='utf-8') as f:
            f.write(page_html)
    # Random pause between live requests
    time.sleep(random.randint(*interval))

In [None]:
# Parse each downloaded page into the list of tables pandas finds in it.
# The HTML text is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases.
response_intermediate = [pd.read_html(StringIO(page_html)) for page_html in response_jsons]

In [None]:
# Display the body of the most recent live response.
# NOTE(review): `response` is only assigned inside the download loops when a route
# was actually requested; if every route was served from cached files it is
# undefined or left over from an earlier cell - confirm before relying on it.
response.text