## Delay Report
### Overview
The delay report script finds the `updated_eta` and `updated_etd` of the vessels listed in "Vessel Delay Tracking.XLSX". It does so by querying the underlying BigSchedules API and the MSC Web API, and by reading a static G2 Schedules Excel document. None of the API interactions use Rio Tinto credentials, so that the requests cannot be traced back to Rio Tinto.

The script is written in a modular way to ease maintenance and improve code quality. Configuration files are stored in a `data` subdirectory. The script expects a `Vessel Delay Tracking.XLSX` file and `g2_filename` (the G2 Schedule Excel file) in the same directory.

### Features
1. Avoids detection
    - Uses API calls instead of Selenium, which is easily detectable
    - Uses randomised timing for API requests
2. Modular
    - If one component breaks, you can always disable it without affecting the other modules

In [2]:
# Imports
import pandas as pd
import numpy as np
import random
import os
import json
import requests
import time

from tqdm.auto import tqdm
from pathlib import Path
from datetime import datetime

In [1]:
from delay_report import MSCExtractor, G2Extractor, DelayReport, write_json, read_config

In [None]:
# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None)

In [None]:
# Read configuration file
# (JSON files are opened explicitly as UTF-8 rather than with the platform's default encoding)
with open("data/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Used to map carrier names to the ones BigSchedule uses and supports
with open("data/carrier_mapping.json", "r", encoding="utf-8") as f:
    carrier_mapping = json.load(f)

# Bigschedule login
with open("data/bigschedules_login.json", "r", encoding="utf-8") as f:
    bs_login = json.load(f)

# Prepare base information
# UNLOCODE to port name mapping: {uncode: {'country', 'port', 'name', 'subdiv', 'full_name'}}
# keep_default_na=False / na_values=[''] treats only empty fields as missing. With the pandas
# defaults the literal string "NA" is read as NaN, which silently loses Namibia's country code
# (and any "NA" subdivision code), so those UNLOCODEs would never make it into the mapping.
port_mapping = (
    pd.concat([pd.read_csv(p, usecols=[1, 2, 4, 5], engine='python',
                           names=['country', 'port', 'name', 'subdiv'],
                           keep_default_na=False, na_values=[''])
               for p in Path('data').glob("*UNLOCODE CodeListPart*")])
    # Rows without a location code cannot form a UNLOCODE; drop them
    .dropna(subset=['port'])
    .assign(
        # UNLOCODE = country code + location code
        uncode=lambda x: x['country'].str.cat(x['port']),
        # "<name>, <subdivision>" when a subdivision is present, otherwise just the name
        full_name=lambda x: np.where(
            x['subdiv'].notnull(), x['name'].str.cat(x['subdiv'], sep=", "), x['name'])
    )
    .drop_duplicates('uncode')
    .set_index('uncode')
    .to_dict('index')
)

# Read the vessel delay tracking file
xl = pd.ExcelFile('Vessel Delay Tracking.xlsx')

In [None]:
# Find the most recent sheet. Sheet names are expected to be dates in dd.mm.yyyy format; any
# other sheet name is coerced to NaT and ignored.
sheet_dates = pd.Series(
    pd.to_datetime(xl.sheet_names, errors='coerce', format='%d.%m.%Y'),
    index=xl.sheet_names,
)
if sheet_dates.isna().all():
    raise ValueError("No sheet named as a dd.mm.yyyy date was found in the workbook")

# Use the sheet's own name rather than re-formatting the parsed date: a name written without
# zero padding (e.g. "1.8.2020") parses fine but would not survive a strftime round trip.
latest_sheet = sheet_dates.idxmax()

bigschedules_sheet = (
    xl.parse(latest_sheet, parse_dates=True)
    # Keep only the carriers present in carrier_mapping ...
    .loc[lambda df: df['Fwd Agent'].isin(list(carrier_mapping))]
    # ... and rename them using that mapping
    .replace({'Fwd Agent': carrier_mapping})
)

In [None]:
# Get port name
def lookup_port_name(uncode):
    """Return the port name for a UNLOCODE, or None when the code is not in port_mapping."""
    entry = port_mapping.get(uncode)
    # An unknown (or missing) UNLOCODE yields None instead of raising TypeError on None['name'],
    # so a single unmapped port does not abort the whole cell.
    return entry['name'] if entry is not None else None


bigschedules_sheet = bigschedules_sheet.assign(
    pol_name=lambda x: x['Port of Loading'].apply(lookup_port_name),
    pod_name=lambda x: x['Port of discharge'].apply(lookup_port_name),
)

In [None]:
# Display the rows whose forwarding agent is HAMBURG
bigschedules_sheet[bigschedules_sheet['Fwd Agent'].eq('HAMBURG')]

In [None]:
class BSExtractor:
    """
    Extracts information from the BigSchedules Sailing Schedules Web API (not Vessel Tracking Web API).

    NOTE: work in progress. Only the constructor does real work; `prepare`, `call_api` and `extract`
    contain nothing but commented-out scaffolding (which still references MSC-specific names such as
    `self.msc_port_id`), so calling them currently does nothing and returns None.

    Attributes
    ----------
    delay_sheet:
        The lines of the main delay sheet whose `Fwd Agent` is neither MSC nor G2OCEAN, without the
        result columns, plus `pol_name` / `pod_name` (port names looked up from the UNLOCODEs;
        None when a code is unknown).
    port_mapping:
        dict keyed by UNLOCODE; each value is a dict with `country`, `port`, `name`, `subdiv` and
        `full_name`.
    interval:
        Stored as given; the call_api scaffolding unpacks it into random.randint(*interval) to
        sleep between requests.
    session:
        A requests.Session.

    Methods
    -------
    prepare:
        A single query to the BigSchedules Web API can provide information to multiple lines on the delay_sheet.
        Further filters self.delay_sheet to a smaller list of searches needed to fulfill all the lines on the
            delay_sheet. This reduces the total number of calls made to the BigSchedules Web API and prevents
            duplication of API calls.

    call_api:
        Makes calls to the BigSchedules Web API, using information from the prepare method as parameters in the
        API request. Also saves the API responses into a subdirectory "responses/<today_date>".

    extract:
        Extracts information from the JSON responses from the call_api method and assembles the final dataframe.
    """

    def __init__(self, main_delay_sheet: pd.DataFrame, interval: tuple):
        """
        Parameters
        ----------
        main_delay_sheet:
            The complete delay sheet. Must contain `Fwd Agent`, `Port of Loading`, `Port of discharge`
            and the five result columns dropped below (a KeyError is raised by pandas otherwise).
        interval:
            Stored on the instance as self.interval.
        """
        # Get the BigSchedules delay sheet: everything that is not MSC / G2OCEAN, without the
        # result columns
        self.delay_sheet = (
            main_delay_sheet.query(f"`Fwd Agent` not in {['MSC', 'G2OCEAN']}")
            .drop(['updated_etd', 'updated_eta', 'No. of days delayed ETD',
                   'No. of days delayed ETA', 'Reason of Delay'], axis=1)
            .copy()
        )

        # Get the BigSchedules-specific port names from the UNLOCODEs
        self.port_mapping = self._load_port_mapping()

        # Get port name. The name string is stored (not the whole mapping entry): the columns are
        # meant to be used as de-duplication / merge keys, which a dict value cannot be.
        self.delay_sheet = self.delay_sheet.assign(
            pol_name=lambda x: x['Port of Loading'].apply(self._port_name),
            pod_name=lambda x: x['Port of discharge'].apply(self._port_name),
        ).copy()

        self.interval = interval
        self.session = requests.Session()

    @staticmethod
    def _load_port_mapping(data_dir='data') -> dict:
        """
        Build the UNLOCODE -> port information mapping from the "*UNLOCODE CodeListPart*" files
        found in `data_dir`.

        Returns
        -------
        dict keyed by UNLOCODE (country code + location code); each value is a dict with the keys
        `country`, `port`, `name`, `subdiv` and `full_name`.

        Raises
        ------
        ValueError (from pd.concat) when no matching file exists in `data_dir`.
        """
        frames = [
            pd.read_csv(p, usecols=[1, 2, 4, 5], engine='python',
                        names=['country', 'port', 'name', 'subdiv'],
                        # Only empty fields are missing values. With the pandas defaults the literal
                        # string "NA" becomes NaN, which loses Namibia's country code (and any "NA"
                        # subdivision code).
                        keep_default_na=False, na_values=[''])
            for p in Path(data_dir).glob("*UNLOCODE CodeListPart*")
        ]
        return (
            pd.concat(frames)
            # Rows without a location code cannot form a UNLOCODE; drop them
            .dropna(subset=['port'])
            .assign(
                uncode=lambda x: x['country'].str.cat(x['port']),
                # "<name>, <subdivision>" when a subdivision is present, otherwise just the name
                full_name=lambda x: np.where(
                    x['subdiv'].notnull(), x['name'].str.cat(x['subdiv'], sep=", "), x['name'])
            )
            .drop_duplicates('uncode')
            .set_index('uncode')
            .to_dict('index')
        )

    def _port_name(self, uncode):
        """Return the port name for a UNLOCODE, or None when the code is not in self.port_mapping."""
        entry = self.port_mapping.get(uncode)
        return entry['name'] if entry is not None else None

    def prepare(self):
        """
        Further filters self.delay_sheet to a smaller list of searches needed to fulfill all the lines on the
            delay_sheet.

        Not implemented yet: the body is commented-out scaffolding, so this currently does nothing.
        """
        # TODO: adapt to BigSchedules -- self.msc_port_id is not defined on this class.
        # # Further filter by POL-Vessel-Voyage to get ETD, POD-Vessel-Voyage to get ETA
        # key = ['pol_name', 'pod_name']
        # self.reduced_df = self.delay_sheet.drop_duplicates(key)[key].sort_values(key)
        #
        # self.reduced_df['pol_code'] = self.reduced_df.pol_name.map(self.msc_port_id)
        # self.reduced_df['pod_code'] = self.reduced_df.pod_name.map(self.msc_port_id)
        #
        # # Unable to handle those with no pod_id in BigSchedules Web; dropping these lines
        # self.reduced_df.dropna(inplace=True)

    def call_api(self):
        """
        Makes calls to the BigSchedules Web API, using information from the prepare method as parameters in the
        API request. Also saves the API responses into a subdirectory "responses/<today_date>".

        Not implemented yet: the body is commented-out scaffolding, so this currently does nothing.
        """
        # TODO: the URL below is hard-coded to a single carrier/vessel and ignores the etd/pol/pod
        #       arguments; the Referer and response file names are still the MSC ones.
        # NOTE(review): the scaffolding used to embed a hard-coded browser session cookie in the
        #       'Cookie' header. It is replaced by a placeholder here; the cookie should be taken
        #       from the session's first response instead of being committed to source.
        # def get_schedules(etd: str, pol: str, pod: str):
        #     url = f"https://www.bigschedules.com//api/vesselSchedule/list?DISABLE_ART=true&_=2020081917&carrierId=18&language=en-US&scac=HLCU&vesselGid=V000005557&vesselName=CHRISTA+SCHULTE"
        #     headers = {
        #         'Accept': 'application/json',
        #         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        #         'Content-Type': 'application/json',
        #         'Sec-Fetch-Site': 'same-origin',
        #         'Sec-Fetch-Mode': 'cors',
        #         'Sec-Fetch-Dest': 'empty',
        #         'Referer': 'https://www.msc.com/search-schedules',
        #         'Accept-Language': 'en-GB,en;q=0.9',
        #         'Cookie': '<session cookie obtained at runtime>'
        #     }
        #     response = self.session.get(url, headers=headers)
        #     return response
        #
        # self.response_jsons = []
        # first_day = datetime.today().replace(day=1).strftime('%Y-%m-%d')
        #
        # for row in tqdm(self.reduced_df.itertuples(), total=len(self.reduced_df)):
        #     response_filename = f'MSC {int(row.pol_code)}-{int(row.pod_code)}.json'
        #     if response_filename not in os.listdir():
        #         response = get_schedules(first_day, int(row.pol_code), int(row.pod_code))
        #         self.response_jsons.append(response.json())
        #         write_json(response.json(), response_filename)
        #         time.sleep(random.randint(*self.interval))
        #     else:
        #         with open(response_filename, 'r') as f:
        #             self.response_jsons.append(json.load(f))

    def extract(self):
        """
        Extracts information from the JSON responses from the call_api method and assembles the final dataframe.

        Not implemented yet: the body is commented-out scaffolding, so this currently does nothing.
        """
        # TODO: adapt to the BigSchedules response layout -- the field names below and
        #       self.msc_port_id come from the MSC scaffolding.
        # def get_relevant_fields(response, i):
        #     return {
        #         'pol_code': response[0]['Sailings'][i]['PortOfLoadId'],
        #         'pod_code': response[0]['Sailings'][i]['PortOfDischargeId'],
        #         'Voyage': response[0]['Sailings'][i]['VoyageNum'],
        #         'Vessel': response[0]['Sailings'][i]['VesselName'],
        #         'updated_etd': response[0]['Sailings'][i]['NextETD'],
        #         'updated_eta': response[0]['Sailings'][i]['ArrivalDate']
        #     }
        #
        # self.response_df = pd.DataFrame(([get_relevant_fields(response, i)
        #                              for response in self.response_jsons
        #                              for i in range(len(response[0]['Sailings']))
        #                              if len(response)
        #                             ]))
        #
        # # Create reverse mapping from port_code to name
        # msc_port_id_reversed = {v:k for k,v in self.msc_port_id.items()}
        #
        # # Add additional columns to response_df
        # self.response_df['pol_name'] = self.response_df.pol_code.map(msc_port_id_reversed)
        # self.response_df['pod_name'] = self.response_df.pod_code.map(msc_port_id_reversed)
        #
        # # Merge results back to original dataframe
        # merge_key = ['pol_name', 'pod_name', 'Vessel', 'Voyage']
        # self.delay_sheet = (self.delay_sheet.reset_index().
        #                     merge(self.response_df[merge_key + ['updated_eta', 'updated_etd']],
        #                           on=merge_key, how='left')
        #                     .set_index('index')
        #                     .copy())
        # self.delay_sheet.updated_eta = pd.to_datetime(self.delay_sheet.updated_eta.str[:10])
        # self.delay_sheet.updated_etd = pd.to_datetime(self.delay_sheet.updated_etd.str[:10])


### To-do
1. Figure out how to obtain the cookie from the first response and reuse it in the headers of subsequent requests.

In [None]:
# Show the current working directory (the next cell changes it with a relative path)
os.getcwd()

In [None]:
# NOTE(review): this is a relative chdir -- every re-run of this cell moves the working directory
# up another two levels, so run it exactly once per kernel session.
os.chdir('../..')
# Delay report skeleton
# DelayReport comes from the delay_report module and is not visible here; judging by the method
# names the steps are: BigSchedules, MSC, G2, then the delay deltas, then writing the output
# -- TODO confirm against delay_report.py.
delay_report = DelayReport()
delay_report.run_bs()
delay_report.run_msc()
delay_report.run_g2()
delay_report.calculate_deltas()
delay_report.output()