In [1]:
# Imports
import pandas as pd
import numpy as np
import random
import logging
import os
import json
import requests

from pathlib import Path
from datetime import datetime

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None
# To lift the row limit as well: pd.options.display.max_rows = None

In [None]:
# Read configuration file
with open("data/config.json", "r") as f:
    config = json.load(f)

# Used to map carrier names to the ones BigSchedules uses and supports
with open("data/carrier_mapping.json", "r") as f:
    carrier_mapping = json.load(f)

# BigSchedules login
# NOTE(review): DelayReport.__init__ further down reads "data/bigschedules_login.json"
# (with an extra "s") - confirm which of the two file names actually exists.
with open("data/bigschedule_login.json", "r") as f:
    bs_login = json.load(f)

# Prepare base information
# UNLOCODE to port name mapping: {uncode: {country, port, name, subdiv, full_name}}
unlocode_frames = [
    pd.read_csv(
        p,
        usecols=[1, 2, 4, 5],
        engine='python',
        names=['country', 'port', 'name', 'subdiv'],
        # Only empty cells are missing values. With the pandas defaults the
        # literal strings "NA" (e.g. the country code of Namibia), "nan",
        # "null", ... would be read as NaN and those ports would lose their code.
        keep_default_na=False,
        na_values=[''],
    )
    # Sorted so that drop_duplicates below keeps the same row on every run;
    # glob() itself does not promise any particular order.
    for p in sorted(Path('data').glob("*UNLOCODE CodeListPart*"))
]

port_mapping = (
    pd.concat(unlocode_frames)
    # NaN != NaN, so this drops the rows that have no port code
    .query('port == port')
    .assign(
        uncode=lambda x: x['country'].str.cat(x['port']),
        # "name, subdiv" when a subdivision is present, otherwise just the name
        full_name=lambda x: np.where(
            x['subdiv'].notnull(), x['name'].str.cat(x['subdiv'], sep=", "), x['name'])
    )
    .drop_duplicates('uncode')
    .set_index('uncode')
    .to_dict('index')
)

In [None]:
# Open the vessel delay tracking workbook; individual sheets are parsed from it later
tracking_workbook_path = 'Vessel Delay Tracking.xlsx'
xl = pd.ExcelFile(tracking_workbook_path)

### BSExtractor

In [None]:
# Sheet names are dates in dd.mm.YYYY form; names that do not parse become NaT
# and are ignored by max(), so this is the name of the most recent dated sheet.
latest_sheet_name = (
    pd.to_datetime(xl.sheet_names, errors='coerce', format='%d.%m.%Y')
    .max()
    .date()
    .strftime('%d.%m.%Y')
)

# Carriers that have a non-empty BigSchedules name in the mapping
bs_supported_carriers = [
    carrier for carrier, bs_name in carrier_mapping.items() if bs_name != ''
]

bigschedules_sheet = (
    xl.parse(latest_sheet_name, parse_dates=True)
    # isin() instead of interpolating the list's repr into a query string:
    # same rows, but carrier names never pass through the query parser
    .loc[lambda sheet: sheet['Fwd Agent'].isin(bs_supported_carriers)]
    # Rename the carriers to the names used in carrier_mapping's values
    .replace({'Fwd Agent': carrier_mapping})
)

In [None]:
# Get port name
# A UN/LOCODE that is missing from port_mapping (or a blank cell) yields a
# missing name instead of raising "'NoneType' object is not subscriptable".
bigschedules_sheet = bigschedules_sheet.assign(
    pol_name=lambda x: x['Port of Loading'].apply(
        lambda y: port_mapping.get(y, {}).get('name')),
    pod_name=lambda x: x['Port of discharge'].apply(
        lambda y: port_mapping.get(y, {}).get('name')),
)

In [None]:
# Determine which searches need to be made (separation of concerns amongst BigSchedules, MSC & G2)

In [None]:
# Make the searches on the BigSchedules portal
'''
Takes in a dataframe listing vessels, their carriers, POL and POD.
Shrinks the above dataframe to vessels & their carriers. Uses this new dataframe for querying BigSchedules.
Outputs an updated dataframe of vessels & their carriers with 2 additional columns updated_eta and updated_etd.
'''

### MSCExtractor

In [None]:
# Make the searches on the MSC portal
# Sheet names are dates (dd.mm.YYYY); anything that does not parse is coerced to NaT
sheet_dates = pd.to_datetime(xl.sheet_names, errors='coerce', format='%d.%m.%Y')
latest_sheet_name = sheet_dates.max().date().strftime('%d.%m.%Y')

# Latest sheet -> rows handled by MSC -> carrier names replaced via carrier_mapping
latest_delay_sheet = xl.parse(latest_sheet_name, parse_dates=True)
msc_rows = latest_delay_sheet.query(f"`Fwd Agent` in {['MSC']}")
msc_delay_sheet = msc_rows.replace({'Fwd Agent': carrier_mapping})

In [None]:
# Display the MSC rows of the delay sheet for a visual check
msc_delay_sheet

### G2Extractor

In [2]:
class G2Extractor:
    """Look up updated ETD/ETA values for G2OCEAN rows in the G2 schedule workbook.

    The most recent dated sheet of the delay-tracking workbook is filtered to the
    rows whose ``Fwd Agent`` is ``G2OCEAN``. For each of those rows the vessel is
    searched for in the schedule's column headers and the port in its row index.

    Attributes:
        schedule: DataFrame read from "G2 Schedule New.xlsx" (first 9 rows
            skipped, indexed by the 'Unnamed: 0' column). Its index is compared
            against the port names in ``g2_port_map``.
        delay_sheet: The G2OCEAN rows of the latest delay sheet, with
            ``Fwd Agent`` replaced through ``carrier_mapping``. ``extract()``
            adds the ``updated_etd`` / ``updated_eta`` columns to it.
        g2_port_map: UN/LOCODE -> port name used to look up rows of ``schedule``.
    """

    def __init__(self, xl, carrier_mapping):
        """Read the G2 schedule and select the G2OCEAN rows of the latest sheet.

        Args:
            xl: Opened delay-tracking workbook; must provide ``sheet_names`` and
                ``parse`` (a ``pd.ExcelFile``).
            carrier_mapping: Mapping applied to the ``Fwd Agent`` column.

        Raises:
            ValueError: If no sheet name is a date in dd.mm.YYYY form.
        """
        self.schedule = pd.read_excel("G2 Schedule New.xlsx", skiprows=9, index_col='Unnamed: 0')
        self.delay_sheet = (
            xl.parse(self._latest_sheet_name(xl), parse_dates=True)
            .loc[lambda sheet: sheet['Fwd Agent'].isin(['G2OCEAN'])]
            .replace({'Fwd Agent': carrier_mapping})
        )
        self.g2_port_map = {
            'AUPTJ': 'Portland',
            'AUNTL': 'Newcastle',
            'AUGLT': 'Gladstone',
            'NZTWI': 'Bluff',
            'TWKHH': 'Kaohsiung',
            'KRINC': 'Inchon',
            'KRPUS': 'Busan',
            'JPYOK': 'Yokohama',
            'JPNGO': 'Nagoya',
            'JPOSA': 'Osaka',
            'JPTOY': 'Toyama',
            'JPIHA': 'Niihama',
            'HKHKG': 'Hong Kong',
            'CNSHA': 'Shanghai'
        }

    @staticmethod
    def _latest_sheet_name(xl):
        """Return the name of the sheet whose dd.mm.YYYY name is the latest date."""
        sheet_names = list(xl.sheet_names)
        sheet_dates = pd.to_datetime(
            pd.Series(sheet_names, dtype=object), errors='coerce', format='%d.%m.%Y')
        if sheet_dates.isna().all():
            raise ValueError(
                "No sheet named after a date in dd.mm.YYYY format was found in the workbook.")
        # Return the sheet's own name rather than a re-formatted date, so a sheet
        # called e.g. "1.2.2021" is still found.
        return sheet_names[int(sheet_dates.idxmax())]

    def _lookup_schedule_value(self, row, port_column, column_offset):
        """Return the schedule cell for ``row``'s vessel and port, or NaN if not found.

        Args:
            row: Delay-sheet row providing 'Vessel' and ``port_column``.
            port_column: Name of the row field holding the port's UN/LOCODE.
            column_offset: 0 for the column whose header contains the vessel name
                (ETA), 1 for the column right after it (ETD).
        """
        vessel = row['Vessel']
        if not isinstance(vessel, str) or not vessel:
            # A blank vessel would otherwise match every column header
            return np.nan

        # Literal substring match: vessel names are not regular expressions
        header_matches = np.flatnonzero(
            self.schedule.columns.astype(str).str.contains(vessel, regex=False))
        if header_matches.size == 0:
            return np.nan

        column_position = int(header_matches[0]) + column_offset
        if column_position >= self.schedule.shape[1]:
            return np.nan

        port_name = self.g2_port_map.get(row[port_column])
        if port_name is None:
            return np.nan

        port_rows = self.schedule.loc[self.schedule.index == port_name]
        if port_rows.empty:
            return np.nan
        # First matching row, selected by position
        return port_rows.iloc[0, column_position]

    def get_updated_etd(self, row):
        """Return the schedule's ETD for the row's vessel at its port of loading (NaN if unknown)."""
        return self._lookup_schedule_value(row, 'Port of Loading', column_offset=1)

    def get_updated_eta(self, row):
        """Return the schedule's ETA for the row's vessel at its port of discharge (NaN if unknown)."""
        return self._lookup_schedule_value(row, 'Port of discharge', column_offset=0)

    def extract(self):
        """Add the ``updated_etd`` and ``updated_eta`` columns to ``delay_sheet``."""
        if self.delay_sheet.empty:
            # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
            # assigned to a single column - just create the two empty columns.
            self.delay_sheet['updated_etd'] = pd.NaT
            self.delay_sheet['updated_eta'] = pd.NaT
            return
        self.delay_sheet['updated_etd'] = self.delay_sheet.apply(self.get_updated_etd, axis=1)
        self.delay_sheet['updated_eta'] = self.delay_sheet.apply(self.get_updated_eta, axis=1)

### Final update

In [5]:
class DelayReport:
    """Run the enabled extractors and merge their results into one delay sheet.

    Attributes:
        config: Contents of data/config.json; the ``run_bs`` / ``run_msc`` /
            ``run_g2`` flags decide which extractors are used.
        carrier_mapping: Contents of data/carrier_mapping.json.
        bs_login: Contents of data/bigschedules_login.json.
        port_mapping: UN/LOCODE -> {country, port, name, subdiv, full_name}.
        xl: The opened 'Vessel Delay Tracking.xlsx' workbook.
        main_delay_sheet: Final frame, available after ``assemble()``.
    """

    # Columns holding dates; normalised to datetimes before the deltas are computed
    DATE_COLUMNS = ['ETD Date', 'Disport ETA', 'updated_etd', 'updated_eta']

    def __init__(self):
        """Load the configuration files, the port mapping and the tracking workbook."""
        # Read configuration file
        with open("data/config.json", "r") as f:
            self.config = json.load(f)

        # Used to map carrier names to the ones BigSchedules uses and supports
        with open("data/carrier_mapping.json", "r") as f:
            self.carrier_mapping = json.load(f)

        # BigSchedules login
        with open("data/bigschedules_login.json", "r") as f:
            self.bs_login = json.load(f)

        # Prepare base information
        # UNLOCODE to port name mapping
        self.port_mapping = self._load_port_mapping()

        # Read the vessel delay tracking file
        # (a stricter "a sheet named after today's date must exist" check was
        # sketched here earlier but is not enforced)
        self.xl = pd.ExcelFile('Vessel Delay Tracking.xlsx')

    @staticmethod
    def _load_port_mapping(data_dir='data'):
        """Build the UN/LOCODE -> port details mapping from the code-list CSV files.

        Args:
            data_dir: Directory containing the "*UNLOCODE CodeListPart*" files.

        Returns:
            dict keyed by uncode (country + port code); each value is a dict with
            'country', 'port', 'name', 'subdiv' and 'full_name'.
        """
        frames = [
            pd.read_csv(
                p,
                usecols=[1, 2, 4, 5],
                engine='python',
                names=['country', 'port', 'name', 'subdiv'],
                # Only empty cells are missing. With the pandas defaults the
                # strings "NA" (e.g. Namibia's country code), "nan", "null", ...
                # would become NaN and those ports would lose their code.
                keep_default_na=False,
                na_values=[''],
            )
            # Sorted so drop_duplicates keeps the same row on every run;
            # glob() itself does not promise any particular order.
            for p in sorted(Path(data_dir).glob("*UNLOCODE CodeListPart*"))
        ]
        return (
            pd.concat(frames)
            # NaN != NaN, so this drops the rows that have no port code
            .query('port == port')
            .assign(uncode=lambda x: x['country'].str.cat(x['port']),
                    full_name=lambda x: np.where(x['subdiv'].notnull(),
                                                 x['name'].str.cat(x['subdiv'], sep=", "),
                                                 x['name']))
            .drop_duplicates('uncode')
            .set_index('uncode')
            .to_dict('index'))

    def _latest_sheet_name(self):
        """Return the name of the most recent dd.mm.YYYY sheet, or 0 (first sheet) if none."""
        sheet_names = list(self.xl.sheet_names)
        sheet_dates = pd.to_datetime(
            pd.Series(sheet_names, dtype=object), errors='coerce', format='%d.%m.%Y')
        if sheet_dates.isna().all():
            # No dated sheet: fall back to the first sheet
            return 0
        return sheet_names[int(sheet_dates.idxmax())]

    def run_bs(self):
        """Run the BigSchedules extractor when ``run_bs`` is enabled in the config."""
        if self.config.get('run_bs'):
            # Kept on self: assemble() reads self.bs_extractor.delay_sheet.
            # NOTE(review): BSExtractor is not defined in this part of the notebook
            # and, unlike the other extractors, is built without arguments - confirm.
            self.bs_extractor = BSExtractor()
            self.bs_extractor.extract()

    def run_msc(self):
        """Run the MSC extractor when ``run_msc`` is enabled in the config."""
        if self.config.get('run_msc'):
            self.msc_extractor = MSCExtractor(self.xl, self.carrier_mapping)
            self.msc_extractor.extract()

    def run_g2(self):
        """Run the G2 extractor when ``run_g2`` is enabled in the config."""
        if self.config.get('run_g2'):
            self.g2_extractor = G2Extractor(self.xl, self.carrier_mapping)
            self.g2_extractor.extract()

    def assemble(self):
        """Merge the extractor results into the delay sheet and compute the delays.

        The result is stored in ``self.main_delay_sheet`` with the date columns
        formatted as dd/mm/YYYY strings. Every extractor enabled in the config
        must have been run beforehand.
        """
        # Assemble the final dataframe to update. Use the latest dated sheet - the
        # one G2Extractor parses - so that update() below, which aligns on the
        # row index, writes onto the same rows.
        main_delay_sheet = self.xl.parse(self._latest_sheet_name())

        # Add new columns to the right side of the dataframe
        main_delay_sheet['updated_etd'] = pd.NaT
        main_delay_sheet['updated_eta'] = pd.NaT
        main_delay_sheet['No. of days delayed ETD'] = np.nan
        main_delay_sheet['No. of days delayed ETA'] = np.nan
        # object dtype so that free-text reasons can be stored in this column
        main_delay_sheet['Reason of Delay'] = pd.Series(
            np.nan, index=main_delay_sheet.index, dtype=object)

        # update() overwrites every column the two frames share, aligned on the index
        enabled_extractors = (
            ('run_bs', 'bs_extractor'),
            ('run_msc', 'msc_extractor'),
            ('run_g2', 'g2_extractor'),
        )
        for config_flag, attribute in enabled_extractors:
            if self.config.get(config_flag):
                main_delay_sheet.update(getattr(self, attribute).delay_sheet)

        # Make sure all date columns really are datetimes: update() may leave an
        # object column behind, and .dt below only works on datetime data.
        for column in self.DATE_COLUMNS:
            main_delay_sheet[column] = pd.to_datetime(main_delay_sheet[column])

        # Calculate the deltas
        main_delay_sheet['No. of days delayed ETD'] = (
            main_delay_sheet['updated_etd'] - main_delay_sheet['ETD Date']).dt.days
        main_delay_sheet['No. of days delayed ETA'] = (
            main_delay_sheet['updated_eta'] - main_delay_sheet['Disport ETA']).dt.days

        # Format the dates correctly via strftime
        for column in self.DATE_COLUMNS:
            main_delay_sheet[column] = main_delay_sheet[column].dt.strftime('%d/%m/%Y')
        self.main_delay_sheet = main_delay_sheet.copy()

    def output(self):
        """Write the assembled sheet to 'main_delay_sheet.xlsx' (call ``assemble()`` first)."""
        # Output the excel file
        saved_file = 'main_delay_sheet.xlsx'
        self.main_delay_sheet.to_excel(saved_file)

In [6]:
# Delay report skeleton: run each extractor step in turn (each checks its own
# config flag), then merge the results into the main delay sheet.
delay_report = DelayReport()
for run_extractor in (delay_report.run_bs, delay_report.run_msc, delay_report.run_g2):
    run_extractor()
delay_report.assemble()