## Delay Report

In [1]:
# Imports
import pandas as pd
import numpy as np
import random
import os
import json
import requests
import time

from tqdm.auto import tqdm
from pathlib import Path
from datetime import datetime, timedelta

In [2]:
from delay_report import MSCExtractor, G2Extractor, DelayReport, write_json, read_config

In [3]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Row display limit is left at its default; uncomment to show every row as well
# pd.set_option('display.max_rows', None)

In [4]:
# General report configuration
config = json.loads(Path("data/config.json").read_text())

# Translation table from the carrier names found in the tracking sheet to the
# names BigSchedules uses and supports
carrier_mapping = json.loads(Path("data/carrier_mapping.json").read_text())

# BigSchedules login details
bs_login = json.loads(Path("data/bigschedules_login.json").read_text())
    
# # Prepare base information
# # UNLOCODE to port name mapping
# port_mapping = (
#     pd.concat([pd.read_csv(p, usecols=[1, 2, 4, 5], engine='python', names=[
#               'country', 'port', 'name', 'subdiv']) for p in Path('data').glob("*UNLOCODE CodeListPart*")])
#     .query('port == port')
#     .assign(
#         uncode=lambda x: x.country.str.cat(x.port),
#         full_name=lambda x: np.where(
#             x.subdiv.notnull(), x.name.str.cat(x.subdiv, sep=", "), x.name)
#     )
#     .drop_duplicates('uncode')
#     .set_index('uncode')
#     .to_dict('index')
# )

# Open the vessel delay tracking workbook; individual sheets are parsed from it later
xl = pd.ExcelFile('Vessel Delay Tracking.xlsx')

In [5]:
# Interpret every sheet name as a dd.mm.yyyy date; names that are not dates become NaT.
# The Series is indexed by the ORIGINAL sheet names so the newest sheet can be selected
# by its real name (re-formatting the date would break on names such as '1.8.2020').
sheet_dates = pd.Series(
    pd.to_datetime(xl.sheet_names, errors='coerce', format='%d.%m.%Y'),
    index=xl.sheet_names,
)
if sheet_dates.isna().all():
    raise ValueError("No sheet in the tracking workbook is named as a dd.mm.yyyy date")
latest_sheet_name = sheet_dates.idxmax()

# Every spelling in the carrier mapping that resolves to OOCL
oocl_aliases = [alias for alias, carrier in carrier_mapping.items() if carrier == 'OOCL']

# Keep only the OOCL lines of the newest sheet, then normalise the carrier names
oocl_sheet = (
    xl.parse(latest_sheet_name, parse_dates=True)
    .loc[lambda sheet: sheet['Fwd Agent'].isin(oocl_aliases)]
    .replace({'Fwd Agent': carrier_mapping})
)

In [10]:
# Lines without a BOL date take their ETD date as the updated ETD
no_bol_yet = oocl_sheet['BOL Date'].isnull()
oocl_sheet.loc[no_bol_yet, 'updated_etd'] = oocl_sheet['ETD Date']


In [6]:
# Port code -> port name lookup, one entry per row of the OOCL mapping workbook
port_code_rows = pd.read_excel('data/OOCL Port Code Mapping.xlsx').to_dict('index')
port_mapping = {row['Port Code']: row['Port Name'] for row in port_code_rows.values()}

In [7]:
# Translate the loading/discharge port codes into port names
# (codes missing from port_mapping become None)
port_name_columns = {
    'pol_name': lambda sheet: sheet['Port of Loading'].apply(lambda code: port_mapping.get(code)),
    'pod_name': lambda sheet: sheet['Port of discharge'].apply(lambda code: port_mapping.get(code)),
}
oocl_sheet = oocl_sheet.assign(**port_name_columns)

In [8]:
# Shared HTTP session used by the request helpers in this notebook
session = requests.Session()

In [9]:
def query_id(port: str, timeout: float = 30):
    """
    Searches OOCL's sailing-schedule autocomplete endpoint for a port name.

    port: the port name to search for, e.g. 'Busan, South Korea'.
    timeout: seconds to wait for the server before the request is abandoned.

    Returns the raw response of the GET request made through the shared session.
    """
    from urllib.parse import quote

    # Percent-encode the name so characters such as '&', '#' or '+' cannot
    # truncate or alter the query string
    url = ("https://www.oocl.com/_catalogs/masterpage/AutoCompleteSailingSchedule.aspx"
           f"?type=sailingSchedule&Pars={quote(str(port), safe='')}")
    headers = {
        'Sec-Fetch-User': '?1',
        'Accept-Encoding': 'gzip, deflate, br',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Accept-Language': 'en-GB,en;q=0.9',
        'Upgrade-Insecure-Requests': "1",
        'Cache-Control': 'max-age=0',
        # NOTE(review): hard-coded cookie value; it may stop being accepted by the server
        'Cookie': 'BIGipServerOOCL=2251415186.20480.0000'
    }
    # Without a timeout a stalled connection would block the notebook indefinitely
    return session.get(url, headers=headers, timeout=timeout)

In [10]:
# Every distinct loading-port name followed by every distinct discharge-port name
oocl_locations = [*oocl_sheet.pol_name.unique(), *oocl_sheet.pod_name.unique()]

In [11]:
oocl_locations

['Brisbane, Queensland, Australia',
 'Sydney, New South Wales, Australia',
 'Melbourne, Victoria, Australia',
 'Bangkok, Krung Thep Mahanakhon, Thailand',
 'Ho Chi Minh (Cat Lai), Ho Chi Minh, Vietnam',
 'Busan, South Korea',
 'Nagoya, Aichi, Japan',
 'Hai Phong, Hai Phong, Vietnam',
 'Incheon, Inchon, South Korea']

In [12]:
def get_id(response):
    """
    Extracts the LocationID of the first search result from a location lookup response.

    response: an object whose .json() returns the decoded body, expected to look like
        {'data': {'results': [{'LocationID': ...}, ...]}}.

    Returns the first result's LocationID, or None when the body has no results
    (missing 'data'/'results', an empty result list, or a body that is not a JSON object).
    """
    body = response.json()
    if not isinstance(body, dict):
        return None
    results = (body.get('data') or {}).get('results') or []
    if not results:
        return None
    return results[0].get('LocationID')

In [13]:
# Try the lookup on the first location in the list
response = query_id(oocl_locations[0])

In [14]:
get_id(response)

'461802935875046'

In [None]:
def get_schedules(pol_locationID: str, pod_locationID: str, pol_name: str, pod_name: str,
                  timeout: float = 30):
    """
    Posts a hub-to-hub route search to OOCL's schedule service for one POL/POD pair.

    pol_locationID / pod_locationID: location IDs of the loading and discharge ports,
        sent as 'originId' and 'destinationId'.
    pol_name / pod_name: the matching port names, sent as 'origin' and 'destination'.
    timeout: seconds to wait for the server before the request is abandoned.

    The search starts from tomorrow's date and covers "+2" weeks of sailings.
    Returns the raw response of the POST request made through the shared session.
    """
    url = "http://moc.oocl.com/nj_prs_wss/mocss/secured/supportData/nsso/searchHubToHubRoute"
    # Content-Length is deliberately not set here: it is derived from the actual body
    # when the request is prepared, and a fixed value would not match the payload.
    headers = {
        'Host': 'moc.oocl.com',
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/plain, */*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
        'Content-Type': 'application/json',
        'Origin': 'http://moc.oocl.com',
        'Referer': 'http://moc.oocl.com/nj_prs_wss/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-GB,en;q=0.9',
        # NOTE(review): hard-coded cookie values; they may stop being accepted by the server
        'Cookie': 'BIGipServeriris4-wss=1613880978.61451.0000; BIGipServerpool_ir4moc=590470802.20480.0000',
    }

    # Computed once so 'date' and 'displayDate' cannot differ if the call straddles midnight
    search_date = (datetime.today() + timedelta(days=1)).strftime('%Y-%m-%d')

    payload = {
        "date": search_date,
        "displayDate": search_date,
        "transhipment_Port": None,
        "port_of_Load": None,
        "port_of_Discharge": None,
        "sailing": "sailing.from",
        "weeks": "2",
        "transhipment_PortId": None,
        "service": None,
        "port_of_LoadId": None,
        "port_of_DischargeId": None,
        "origin_Haulage": "cy",
        "destination_Haulage": "cy",
        "cargo_Nature": "dry",
        "originId": str(pol_locationID),
        "originCountryCode": "",
        "destinationCountryCode": "",
        "destinationId": str(pod_locationID),
        "origin": str(pol_name),
        "destination": str(pod_name),
        "weeksSymbol": "+",
    }

    # json= serialises the payload as a JSON body (matching the Content-Type header);
    # data= would form-encode it and silently drop the None fields.
    return session.post(url, headers=headers, json=payload, timeout=timeout)

In [None]:
# Try the schedule search on the first route of the OOCL sheet. get_schedules needs
# both location IDs and both port names; calling it with no arguments raises TypeError.
sample_route = oocl_sheet.iloc[0]
schedules_response = get_schedules(
    get_id(query_id(sample_route.pol_name)),
    get_id(query_id(sample_route.pod_name)),
    sample_route.pol_name,
    sample_route.pod_name,
)
schedules_response

In [None]:
class OOCLExtractor:
    """
    Prepares the OOCL lines of the vessel delay sheet for schedule lookups.

    Implemented so far: the port name -> OOCL location ID lookup (get_locationID) and the
    reduction of the sheet to distinct routes (prepare). call_api and extract are
    placeholders that currently do nothing.
    """

    # Cache file for the port name -> location ID lookup. The same name is used for the
    # existence check, the read and the write, so a finished lookup is actually reused.
    LOCATION_ID_CACHE = 'OOCL locationID.json'

    # Seconds to wait for the lookup endpoint before a request is abandoned
    REQUEST_TIMEOUT = 30

    # Headers sent with each location lookup; they mirror the stand-alone lookup in this
    # notebook. NOTE(review): the cookie value is hard-coded and may stop being accepted.
    LOCATION_QUERY_HEADERS = {
        'Sec-Fetch-User': '?1',
        'Accept-Encoding': 'gzip, deflate, br',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Dest': 'document',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Accept-Language': 'en-GB,en;q=0.9',
        'Upgrade-Insecure-Requests': "1",
        'Cache-Control': 'max-age=0',
        'Cookie': 'BIGipServerOOCL=2251415186.20480.0000',
    }

    def __init__(self, main_delay_sheet: pd.DataFrame, interval: tuple,
                 port_mapping_path: str = '../../data/OOCL Port Code Mapping.xlsx'):
        """
        main_delay_sheet: the full delay sheet; only rows whose `Fwd Agent` is 'OOCL' are kept.
        interval: stored as self.interval and not used by the implemented methods yet
            (presumably the bounds of a random pause between API calls; confirm when
            call_api is implemented).
        port_mapping_path: Excel workbook with 'Port Code' and 'Port Name' columns.
        """
        # Keep the OOCL lines and drop the updated-date, delay-count and reason columns
        self.delay_sheet = (main_delay_sheet.query("`Fwd Agent` in ['OOCL']")
                            .drop(['updated_etd', 'updated_eta', 'No. of days delayed ETD',
                                   'No. of days delayed ETA', 'Reason of Delay'], axis=1)
                            .copy())

        # Port code -> OOCL port name, one entry per row of the mapping workbook
        port_rows = pd.read_excel(port_mapping_path).to_dict('index')
        self.port_mapping = {row['Port Code']: row['Port Name'] for row in port_rows.values()}

        # Add the port names; codes missing from the mapping become None
        self.delay_sheet = self.delay_sheet.assign(
            pol_name=lambda x: x['Port of Loading'].apply(lambda y: self.port_mapping.get(y)),
            pod_name=lambda x: x['Port of discharge'].apply(lambda y: self.port_mapping.get(y)),
        ).copy()

        self.interval = interval
        self.session = requests.Session()

    @staticmethod
    def _extract_location_id(response):
        """
        Returns the LocationID of the first result in a location lookup response, or None
        when the body has no results. Expected body: {'data': {'results': [{'LocationID': ...}]}}.
        """
        body = response.json()
        if not isinstance(body, dict):
            return None
        results = (body.get('data') or {}).get('results') or []
        if not results:
            return None
        return results[0].get('LocationID')

    def get_locationID(self):
        """
        Populates self.msc_port_id with a port name -> OOCL location ID mapping.

        If the cache file is present in the working directory it is loaded through
        read_config and no request is made. Otherwise every distinct, non-null POL/POD name
        is looked up on the OOCL autocomplete endpoint and the mapping is written to the
        cache file. Names for which no location ID was found are written to
        'msc_exceptions.txt'.

        The lookup is sent with the headers in LOCATION_QUERY_HEADERS, including a cookie.
        """
        if os.path.exists(self.LOCATION_ID_CACHE):
            # presumably sets self.msc_port_id from the JSON file; verify against read_config
            read_config(self, 'msc_port_id', self.LOCATION_ID_CACHE)
            return

        from urllib.parse import quote

        # Distinct names in first-seen order: unmapped ports (None) are skipped and a port
        # that is both a POL and a POD is only queried once
        locations = list(
            pd.concat([self.delay_sheet.pol_name, self.delay_sheet.pod_name]).dropna().unique()
        )

        self.msc_port_id = {}
        for location in tqdm(locations):
            # Percent-encode the name so it cannot break the query string
            url = ("https://www.oocl.com/_catalogs/masterpage/AutoCompleteSailingSchedule.aspx"
                   f"?type=sailingSchedule&Pars={quote(str(location), safe='')}")
            response = self.session.get(url, headers=self.LOCATION_QUERY_HEADERS,
                                        timeout=self.REQUEST_TIMEOUT)
            self.msc_port_id[location] = self._extract_location_id(response)
        write_json(self.msc_port_id, self.LOCATION_ID_CACHE)

        # Port names with no location ID
        # NOTE(review): the file name carries an 'msc' prefix although this is the OOCL
        # extractor; kept unchanged in case another step reads it - confirm.
        exception_cases = [name for name, location_id in self.msc_port_id.items()
                           if location_id is None]
        write_json(exception_cases, 'msc_exceptions.txt')

    def prepare(self):
        """
        Builds self.reduced_df: one row per distinct (pol_name, pod_name) pair of
        self.delay_sheet, sorted by those columns, with the location IDs from
        self.msc_port_id added as pol_code and pod_code.

        Requires self.msc_port_id, i.e. get_locationID must have run first. Pairs whose
        port has no location ID are kept, with a missing code.
        """
        key = ['pol_name', 'pod_name']
        routes = self.delay_sheet[key].drop_duplicates().sort_values(key)
        # assign() builds a new frame, so the codes are never written into a slice
        self.reduced_df = routes.assign(
            pol_code=routes.pol_name.map(self.msc_port_id),
            pod_code=routes.pod_name.map(self.msc_port_id),
        )

    def call_api(self):
        """
        Placeholder: not implemented yet, does nothing and returns None.
        """
        # TODO: for every row of self.reduced_df request the schedules for that POL/POD
        # pair, reuse a saved JSON response when one exists, and pause for a random number
        # of seconds drawn from self.interval between requests.

    def extract(self):
        """
        Placeholder: not implemented yet, does nothing and returns None.
        """
        # TODO: flatten the schedule responses into a frame with vessel, voyage,
        # updated_etd and updated_eta, then merge it back into self.delay_sheet on
        # pol_name / pod_name / Vessel / Voyage.


In [None]:
os.getcwd()

In [None]:
# Move two directories up before building the report
# NOTE(review): presumably the directory the report code expects to run from; confirm
os.chdir('../..')
# Delay report skeleton
delay_report = DelayReport()
# The steps run in this fixed order.
# NOTE(review): what each step does is inferred from its name only; confirm against
# delay_report.DelayReport before reordering or removing a call.
delay_report.run_bs()
delay_report.run_msc()
delay_report.run_g2()
delay_report.calculate_deltas()
delay_report.output()