# ONE ETL

In [36]:
import datetime as dt
import getpass
import os
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Cargo-tracking page that the browser session opens and that extract() searches.
URL = "https://ecomm.one-line.com/ecom/CUP_HOM_3301.do?sessLocale=en"
# Alternative driver location (Edge on Windows), kept for reference.
#DRIVER_PATH = "C:/Users/CRE1725/AppData/Local/edge_driver_x64/msedgedriver.exe"
# geckodriver binary for Firefox.
# NOTE(review): absolute, machine-specific path; the driver-init cell repeats
# this same string as a literal instead of reading this constant.
DRIVER_PATH = "/home/evgeny/ETL_project/geckodriver"



In [20]:
# Cosmetic only: apply the 'gruvboxd' jupyterthemes style to plots ...
from jupyterthemes import jtplot
jtplot.style('gruvboxd')
# ... and to the notebook UI through the `jt` command-line tool (IPython shell
# escape, so this line is not plain Python).
# NOTE(review): -T / -N presumably keep the toolbar and notebook name visible;
# confirm with `jt -h`.
!jt -t gruvboxd -T -N
# onedork

## Init driver and open browser

In [24]:
# Start a browser session that matches the host OS, open the tracking page and
# block until the three search-form controls used by extract() are in the DOM.
EDGE_DRIVER_PATH = "C:/Users/CRE1725/AppData/Local/edge_driver_x64/msedgedriver.exe"
if os.name == 'nt':
    # Edge has its own Service class; pass it via `service=` instead of the
    # deprecated `executable_path=` keyword.
    service = EdgeService(EDGE_DRIVER_PATH)
    driver = webdriver.Edge(service=service)
elif os.name == 'posix':
    # NOTE(review): status_code is not read anywhere in the visible notebook.
    status_code = requests.get(URL).status_code
    options = webdriver.FirefoxOptions()
    #options.add_argument('--headless')
    service = Service(DRIVER_PATH)
    driver = webdriver.Firefox(service=service, options=options)
else:
    # Fail here with a clear message rather than with a NameError on `driver`.
    raise RuntimeError("Unsupported platform: os.name == {!r}".format(os.name))
driver.get(URL)
for element_id in ("searchType", "searchName"):
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, element_id)))
# Kept as the last expression so the cell still displays the located element.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "btnSearch")))

<selenium.webdriver.remote.webelement.WebElement (session="ee10289c-638c-4c38-994f-2fb0651da095", element="a1fc1795-7911-42ff-b1ec-58ff5ac5f597")>

## Extract

In [25]:
cont_number = "NYKU9733409"
def extract(cont_number):
    """Search the tracking page for a container and return its table rows.

    Uses the module-level ``driver`` (the page must already be open): selects
    "All" in the ``searchType`` field, types ``cont_number`` into
    ``searchName``, clicks ``btnSearch``, waits two seconds and parses the
    resulting page source.

    Args:
        cont_number: Container number to type into the search box.

    Returns:
        The ``<tr>`` elements found under the element with id ``detail``, or
        ``None`` when that element is missing or the rows' string form is
        shorter than 80 characters (treated as "no data").
    """
    min_table_chars = 80
    # Click first to ensure sending keys properly to select field
    driver.find_element(By.ID, "searchType").click()
    driver.find_element(By.ID, "searchType").send_keys("All")
    driver.find_element(By.ID, "searchName").clear()
    driver.find_element(By.ID, "searchName").send_keys(cont_number)
    driver.find_element(By.ID, "btnSearch").click()
    # Fixed pause while the page fills in the result table.
    time.sleep(2)
    #WebDriverWait(driver, 10).until(lambda d: d.find_element(By.ID, "main-control-btn4"))
    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    page_source = BeautifulSoup(driver.page_source)
    detail = page_source.find(id="detail")
    if detail is None:
        # The container for the result table is absent: report "no data"
        # instead of failing with AttributeError on None.
        return None
    table_data = detail.find_all("tr")
    if len(str(table_data)) < min_table_chars:
        # TODO: add counter for 2nd attempt and exit
        return None
    return table_data

In [26]:
table_data = extract(cont_number)

In [175]:
#table_data

In [27]:
len(str(table_data))

7977

## Transform

In [169]:
def _quoted_vessel(event_text):
    """Return the vessel label quoted in ``event_text``, or ``None`` if absent.

    The label is the text between the first ``" '"`` and the next ``"' "``
    with its final character removed, e.g.
    ``"Loaded on 'NYK VENUS 069W' at ..."`` gives ``"NYK VENUS 069"``.
    """
    opening = event_text.find(" '")
    if opening == -1:
        return None
    start_idx = opening + 2
    closing = event_text.find("' ", start_idx)
    if closing == -1:
        return None
    # NOTE(review): the "- 1" drops the last character of the quoted text
    # (the W/E suffix in the sample data); kept as in the original, confirm
    # that this is intended.
    return event_text[start_idx:closing - 1]


def transform(cont_number, table_data, mode="init"):
    """Turn scraped table rows into a tracking document.

    Args:
        cont_number: Container number stored under ``contNumber``.
        table_data: Sequence of row objects; the first one is skipped as the
            header. Each row must offer ``find_all("td")`` returning cells
            with a ``.text`` attribute, in the order
            ``[number, event, location, "<status> YYYY-MM-DD HH:MM"]``.
            Rows with fewer than four cells are ignored.
        mode: With ``"init"`` (default) the current time is stored as
            ``trackStart``/``lastUpdate`` and a copy of the schedule is saved
            under ``initData``.

    Returns:
        dict with the keys ``contNumber``, ``trackStart``, ``trackEnd``,
        ``lastUpdate``, ``nextUpdate``, ``nextEventIdx``,
        ``outboundTerminal``, ``inboundTerminal``, ``oceanVessel``,
        ``feederVessel`` and the parallel lists ``event``, ``location``,
        ``date``, ``status`` (plus ``initData`` in init mode).
        ``nextEventIdx`` is the position of the first "Estimate" row inside
        those lists.
    """
    timestamp_format = "%Y-%m-%d %H:%M"
    document = {
        "contNumber": cont_number,
        "trackStart": None, "trackEnd": None, "lastUpdate": None,
        "nextUpdate": None, "nextEventIdx": None, "outboundTerminal": None,
        "inboundTerminal": None, "oceanVessel": None, "feederVessel": None,
        "event": [], "location": [], "date": [], "status": [],
    }
    terminals = ["Outbound Terminal", "Inbound Terminal"]
    awaiting_estimate = True
    # Iterate over table rows and fetch data
    for row in table_data[1:]:
        # Fetch text data from table html elements
        row_data = [d.text for d in row.find_all("td")]
        if len(row_data) < 4:
            # Not an event row (e.g. spacer or header); nothing to read.
            continue
        event_text, location_text, date_text = row_data[1:4]
        # Find inbound and outbound terminals
        for t in terminals:
            if t in event_text:
                key = t.split()
                document[key[0].lower() + key[1]] = location_text
        # Vessel names are taken only from "Loaded on '<vessel>' at ..." rows.
        # (The previous `startswith(...) > -1` test was always true, so e.g.
        # "Unloaded from ... at Transhipment Port" rows were matched too.)
        if event_text.startswith("Loaded on"):
            vessel = _quoted_vessel(event_text)
            if vessel is not None:
                if "at Port of Loading" in event_text:
                    document["oceanVessel"] = vessel
                if "at Transhipment Port" in event_text:
                    document["feederVessel"] = vessel
        # Find and update next estimated event date and event index
        if awaiting_estimate and date_text.startswith("Estimate"):
            document["nextUpdate"] = date_text[-16:]
            # Index into the event lists (robust against skipped rows).
            document["nextEventIdx"] = len(document["event"])
            awaiting_estimate = False
        # Find and update event, location, date, status
        document["event"].append(event_text)
        document["location"].append(location_text)
        document["date"].append(date_text[-16:])
        document["status"].append(date_text[:-17])
    # No estimated event left means the container was delivered. An empty
    # table carries no such information, so it does not end the tracking.
    if awaiting_estimate and document["event"]:
        document["trackEnd"] = dt.datetime.now().strftime(timestamp_format)
        document["nextUpdate"] = None
        document["nextEventIdx"] = None
    # Add initial schedule and set tracking start date
    if mode == "init":
        document["trackStart"] = dt.datetime.now().strftime(timestamp_format)
        document["lastUpdate"] = document["trackStart"]
        # Copies, so later updates of the live lists leave the snapshot intact.
        document["initData"] = {
            "event": list(document["event"]),
            "location": list(document["location"]),
            "date": list(document["date"]),
            "status": list(document["status"]),
        }
    return document

In [170]:
init_document = transform(cont_number, table_data)

In [171]:
init_document

{'contNumber': 'NYKU9733409',
 'trackStart': '2021-10-31 10:32',
 'trackEnd': None,
 'lastUpdate': '2021-10-31 10:32',
 'nextUpdate': '2021-11-06 07:30',
 'nextEventIdx': 4,
 'outboundTerminal': 'NAGOYA, AICHI, JAPAN TCB (TOBISHIMA CONTAINER BERTH)',
 'inboundTerminal': 'ST PETERSBURG, RUSSIAN FEDERATION CONTAINER TERMINAL SAINT PETERSBURG JSC',
 'oceanVessel': 'NYK VENUS 069',
 'feederVessel': 'VALENCIA EXPRESS 2145',
 'event': ['Empty Container Release to Shipper',
  'Gate In to Outbound Terminal',
  "Loaded on 'NYK VENUS 069W' at Port of Loading NYK VENUS 069W",
  "'NYK VENUS 069W' Departure from Port of Loading NYK VENUS 069W",
  "'NYK VENUS 069W' Arrival at Transhipment Port NYK VENUS 069W",
  "'NYK VENUS 069W' T/S Berthing Destination NYK VENUS 069W",
  "Unloaded from 'NYK VENUS 069W' at Transhipment Port NYK VENUS 069W",
  "Loaded on 'VALENCIA EXPRESS 2145E' at Transhipment Port VALENCIA EXPRESS 2145E",
  'Departure from Transhipment Port VALENCIA EXPRESS 2145E',
  "'VALENCIA EX

## Load

In [12]:
def load(data):
    """Save ``data`` as ``<cont_number>-DD-MM-YY.csv`` in the working directory.

    The file name is built from the module-level ``cont_number`` and today's
    date; ``data`` only needs a ``to_csv(path)`` method.
    """
    date_suffix = dt.datetime.now().strftime("-%d-%m-%y")
    file_name = cont_number + date_suffix + ".csv"
    target_path = os.path.join(os.getcwd(), file_name)
    data.to_csv(target_path)

In [13]:
# NOTE(review): transformed_table_data is not assigned in any visible cell, so
# this call depends on state left over in the kernel.
load(transformed_table_data)

In [97]:
# Dump HTML text to disk for offline inspection.
# NOTE(review): html_data is not assigned in any visible cell.
# An explicit encoding avoids UnicodeEncodeError where the locale default
# (e.g. on Windows) cannot represent every character.
with open("test.html", "w", encoding="utf-8") as f:
    f.write(html_data)

In [150]:
transformed_table_data

Unnamed: 0,No.,Status,Location,Event Date
0,1,Empty Container Release to Shipper,"NAGOYA, AICHI, JAPAN NAGOYA - NISHI 4-KU RYUTS...",Actual 2021-09-28 15:31
1,2,Gate In to Outbound Terminal,"NAGOYA, AICHI, JAPAN TCB (TOBISHIMA CONTAINER ...",Actual 2021-09-29 14:48
2,3,Loaded on 'NYK VENUS 069W' at Port of Loading ...,"NAGOYA, AICHI, JAPAN TCB (TOBISHIMA CONTAINER ...",Actual 2021-10-03 13:39
3,4,'NYK VENUS 069W' Departure from Port of Loadin...,"NAGOYA, AICHI, JAPAN TCB (TOBISHIMA CONTAINER ...",Actual 2021-10-04 01:30
4,5,'NYK VENUS 069W' Arrival at Transhipment Port ...,"ROTTERDAM, NETHERLANDS ECT DELTA TERMINAL",Estimate 2021-11-06 07:30
5,6,'NYK VENUS 069W' T/S Berthing Destination NYK ...,"ROTTERDAM, NETHERLANDS ECT DELTA TERMINAL",Estimate 2021-11-06 09:30
6,7,Unloaded from 'NYK VENUS 069W' at Transhipment...,"ROTTERDAM, NETHERLANDS ECT DELTA TERMINAL",Estimate 2021-11-08 02:30
7,8,Loaded on 'VALENCIA EXPRESS 2145E' at Tranship...,"ROTTERDAM, NETHERLANDS ECT DELTA TERMINAL",Estimate 2021-11-12 20:00
8,9,Departure from Transhipment Port VALENCIA EXPR...,"ROTTERDAM, NETHERLANDS ECT DELTA TERMINAL",Estimate 2021-11-13 04:00
9,10,'VALENCIA EXPRESS 2145E' Arrival at Port of Di...,"ST PETERSBURG, RUSSIAN FEDERATION CONTAINER TE...",Estimate 2021-11-20 04:00


In [154]:
# Shallow object sizes in bytes as reported by sys.getsizeof.
# Only the value of the last expression is shown as the cell output; the
# first result is computed and discarded.
import sys
sys.getsizeof(table_data)
sys.getsizeof(transformed_table_data)

5293

In [185]:
# Dump text to disk for offline inspection.
# NOTE(review): s is not assigned in any visible cell.
# An explicit encoding avoids UnicodeEncodeError where the locale default
# (e.g. on Windows) cannot represent every character.
with open("text.html", "w", encoding="utf-8") as f:
    f.write(s)

In [186]:
# quit() closes every window and also ends the WebDriver session and its
# driver process; close() would only close the current window.
driver.quit()

In [104]:
# For debug: identify best element for WebDriverWait()
# Parses the browser's current page source; no parser is named, so bs4 picks
# one itself.
page_source = BeautifulSoup(driver.page_source)
# The bare expression renders the whole parsed document as the cell output.
page_source#.find_all(id="detail")

<html class="js draganddrop borderradius boxshadow textshadow cssanimations csstransitions placeholder" lang="en"><head>
<meta charset="utf-8"/>
<meta content="text/javascript" http-equiv="Content-Script-Type"/>
<meta content="text/css" http-equiv="Content-Style-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!-- Mobile meta tag(S) -->
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="telephone=no" name="format-detection"/>
<!-- Mobile meta tag(E) -->
<title>ONE : Cargo Tracking</title>
<link href="images/common/favicon_N.ico" rel="shortcut icon"/>
<!-- NEW SOURCE(S) : CSS -->
<link href="css/new_common.css?baseDate=202110290300" rel="stylesheet" type="text/css"/>
<!--[if IE 8]>
	<link rel="stylesheet" type="text/css" href="css/new_common_ie8.css?baseDate=202110290300" />
	<![endif]-->
<!--[if IE 9]>
    <link rel="stylesheet" type="text/css" href="css/new_common_ie9.css?baseDat

In [168]:
# Debug: look up the search-type control by its name attribute and list the
# attributes and methods its WebElement exposes.
b = driver.find_element(By.NAME, "searchType")
dir(b)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_execute',
 '_id',
 '_parent',
 '_upload',
 'accessible_name',
 'aria_role',
 'clear',
 'click',
 'find_element',
 'find_element_by_class_name',
 'find_element_by_css_selector',
 'find_element_by_id',
 'find_element_by_link_text',
 'find_element_by_name',
 'find_element_by_partial_link_text',
 'find_element_by_tag_name',
 'find_element_by_xpath',
 'find_elements',
 'find_elements_by_class_name',
 'find_elements_by_css_selector',
 'find_elements_by_id',
 'find_elements_by_link_text',
 'find_elements_by_name',
 'find_elements_by_partial_link_text',
 'find_elements_by_tag_name',
 'find_

In [174]:
#b.click()
# Not a call: without parentheses this only displays the bound method object.
b.get_dom_attribute

<bound method WebElement.get_dom_attribute of <selenium.webdriver.remote.webelement.WebElement (session="2912ef46-3ae6-4b1b-8906-372525f35198", element="4c5485ec-6e3d-4943-bc9d-48a76b81e2e1")>>

In [22]:
# Scratch: zip() over three equal-length lists yields one tuple per position.
list(zip([1,2,3,4], [1,2,3,4], [1,2,3,4]))

[(1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 4)]

In [34]:
# Scratch: prototype of the key-building expression used in transform()
# (first word lower-cased + second word as is). The sample text is misspelled
# ("Termianl"), which is why the displayed result is misspelled too.
key = "Outbound Termianl".split()
key[0].lower() + key[1]

'outboundTermianl'

In [45]:
# Scratch: splitting on a separator that does not occur returns the whole
# string as a single-element list.
'my my my'.split(sep="'")

['my my my']

In [8]:
from pymongo import MongoClient
from bson.json_util import dumps

In [1]:
# Build the MongoDB connection URI and create the client.
# The password is no longer stored in the notebook: it is read from the
# MONGO_PASSWORD environment variable or prompted for. User and host can be
# overridden through MONGO_USER / MONGO_HOST.
user = os.environ.get("MONGO_USER", "AdminEvgeny")
password = os.environ.get("MONGO_PASSWORD") or getpass.getpass("MongoDB password: ")
host = os.environ.get("MONGO_HOST", "194.58.102.147")
# Percent-escape the credentials so reserved characters cannot break the URI.
uri = "mongodb://{}:{}@{}:27017/?authSource=admin".format(
    quote_plus(user), quote_plus(password), host)
client = MongoClient(uri)

NameError: name 'MongoClient' is not defined

In [11]:
# Attribute access on the client / database: handle to the database named
# "test" and, inside it, the collection named "test".
test_db = client.test
test = test_db.test

In [129]:
#cursor = test.find({})
#print(dumps(cursor, indent=4))

In [45]:
# find by cont and cop_no
payload = {
    '_search': 'false', 'f_cmd': '125', 'cntr_no': 'NYKU9733409',
    'bkg_no': '', 'cop_no': 'COSA1827848695' }
# The timeout keeps the cell from hanging forever if the server never answers.
r = requests.get('https://ecomm.one-line.com/ecom/CUP_HOM_3301GS.do',
                 params=payload, timeout=30)

In [46]:
# Parse the body of the last response `r` as markup; no parser is named, so
# bs4 picks one itself.
# NOTE(review): the sibling f_cmd=121 reply is read with r.json(); confirm this
# endpoint really returns HTML rather than JSON.
bs = BeautifulSoup(r.text)

In [76]:
# Find copno
# NOTE(review): 'nd' is a fixed literal here; the timestamp cells below
# experiment with generating a 13-digit millisecond value for it.
payload = {
    '_search': 'false', 'nd': '1635790993', 'rows': '10000',
    'page': '1', 'sidx': '', 'sord': 'asc', 'f_cmd': '121', 'search_type': 'A',
    'search_name': 'NYKU9733409', 'cust_cd': '',
}
# The timeout keeps the cell from hanging forever if the server never answers.
r = requests.get('https://ecomm.one-line.com/ecom/CUP_HOM_3301GS.do',
                 params=payload, timeout=30)

In [77]:
# The dir(bs) result is computed and discarded; only the last expression is
# displayed.
dir(bs)#['list']
# Decode the JSON body of the last response `r` and show the copNo of the
# first entry in its 'list'.
dict_ = r.json()
dict_['list'][0]['copNo']

'COSA1827848695'

In [93]:
# Two ways to obtain a 13-digit millisecond timestamp, compared in length with
# a sample value.
sec = time.time()
ns = time.time_ns()

# Float seconds scaled to milliseconds and truncated by int().
print(int(sec * 1000), len(str(int(sec * 1000))))
# Integer nanoseconds with the last six digits cut off.
print(str(ns)[:-6], len(str(ns)[:-6]))
print(len('1635790992755'))

1635791625372 13
1635791625372 13
13


In [127]:
# generate nd payload
# Times one million conversions of time.time() to a millisecond string; the
# elapsed wall-clock seconds are printed.
start = time.time()
for i in range(1000000):
    str(int(time.time() * 1000))
print(time.time() - start)

1.075000286102295


In [128]:
# generate nd payload (faster version)
# Same measurement using time.time_ns() and string slicing; in the recorded
# run the difference was small (about 0.97 s against 1.08 s).
start = time.time()
for i in range(1000000):
    str(time.time_ns())[:-6]
print(time.time() - start)

0.9700717926025391
