# Ship scraper for marinetraffic.com

### 1. Imports

In [1]:
import requests
import json
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from bson.json_util import dumps
import access
from bs4 import BeautifulSoup

# Base URL of a ship's details page on marinetraffic.com; the ship id is
# appended directly after the trailing "shipid:".
URL = "https://www.marinetraffic.com/en/ais/details/ships/shipid:"

### Log function

### 2. Request web page

In [3]:
def request_web_page(ship_id, timeout=30):
    """Request the details web page for ship_id.

    Args:
        ship_id: Ship identifier appended to URL. It is formatted into the
            address, so both "123" and 123 are accepted.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The requests.Response object for the page.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    return requests.get(
        f"{URL}{ship_id}",
        # A browser-like User-Agent is sent with every request.
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )

In [37]:
# Fetch the details page for one sample ship id; the bare name on the last
# line makes the notebook display the resulting Response object.
response = request_web_page("5630138")
response

<Response [200]>

### 3. Parse web page title

In [5]:
def parse_title(response):
    """Parse title string from html response.

    Args:
        response: Object whose ``text`` attribute holds the HTML document
            (e.g. the value returned by request_web_page).

    Returns:
        The text of the document's <title> element, or an empty string when
        the document has no <title>.
    """
    soup = BeautifulSoup(response.text, "html.parser")
    # soup.title is None when the page has no <title>; return "" instead of
    # failing with AttributeError on ``None.text``.
    if soup.title is None:
        return ""
    return soup.title.text

In [38]:
# Extract the <title> text from the fetched response; the bare name on the
# last line makes the notebook display it.
title = parse_title(response)
title

'Ship C 36 (Patrol Vessel) Registered in  - Vessel details, Current position and Voyage information - IMO 9050888, MMSI -9050888, Call Sign '

### 4. Parse ship details

In [17]:
def _value_after(text, label):
    """Return the text between *label* and the next comma, stripped.

    Returns an empty string when *label* does not occur in *text*.
    """
    _, found, remainder = text.partition(label)
    if not found:
        return ""
    return remainder.partition(",")[0].strip()


def parse_ship_details(title):
    """Parse ship details from web page title.

    The title is expected to look like::

        Ship <name> (<type>) Registered in <flag> - Vessel details, ... -
        IMO <imo>, MMSI <mmsi>, Call Sign <call sign>

    Args:
        title: Title string of a ship details page.

    Returns:
        A dict with the keys "ship_id" (always "", to be filled in by the
        caller), "name", "type", "flag", "imo", "mmsi" and "callSign".
        The type, flag, imo, mmsi and call sign are "" when their marker is
        missing from the title. The name is taken from the text before
        "Registered in" (or from the whole title if that marker is absent).
    """
    head, _, tail = title.partition("Registered in")

    # Drop the leading "Ship" label, then split "<name> (<type>)".
    ship_pos = head.find("Ship")
    if ship_pos != -1:
        head = head[ship_pos + len("Ship"):]
    name_part, _, type_part = head.strip().partition("(")
    name = name_part.strip()
    # A title without "(<type>)" yields an empty type instead of IndexError.
    ship_type = type_part.split("(")[0].replace(")", "").strip()

    # The flag ends at the first " - " AFTER "Registered in", so a hyphen in
    # the ship name or in the flag itself cannot truncate it.
    flag_part, sep, ids_part = tail.partition(" - ")
    if not sep:
        flag_part, _, ids_part = tail.partition("-")
    flag = flag_part.strip()

    # Each identifier runs up to the next comma, whatever its length; a
    # fixed-width slice would keep the trailing comma of short values.
    imo = _value_after(ids_part, "IMO")
    mmsi = _value_after(ids_part, "MMSI")
    # The call sign is the last field and runs to the end of the title.
    _, _, call_sign = ids_part.partition("Call Sign")

    return {
        "ship_id": "",
        "name": name,
        "type": ship_type,
        "flag": flag,
        "imo": imo,
        "mmsi": mmsi,
        "callSign": call_sign.strip(),
    }

In [8]:
# Parse the title string into a ship-details dict; as the last expression of
# the cell, the notebook displays the result.
parse_ship_details(title)

{'ship_id': '',
 'name': 'EVER GIVEN',
 'type': 'Container Ship',
 'flag': 'Panama',
 'imo': '9811000',
 'mmsi': '353136000',
 'call_sign': 'H3RC'}

### 5. Web scraper pipeline function

In [35]:
def web_scrapper(ship_ids):
    """Scrape ship details from marinetraffic.com by ship id number.

    Each id is requested, its page title is parsed, and the resulting
    details dict is tagged with the id. A failure for one id is logged and
    skipped so that the details already collected for the batch are kept.

    Args:
        ship_ids: Iterable of ship ids to look up.

    Returns:
        A list of ship-details dicts (see parse_ship_details), one per id
        whose page was fetched with status 200 and parsed successfully.

    Note:
        Relies on a ``log`` helper that is not defined in this section.
    """
    result = []
    for ship_id in ship_ids:
        try:
            response = request_web_page(ship_id)
        except requests.RequestException as err:
            # A network error on one id must not abort the whole batch.
            log("[Ships_web_scrapper.py] [web_scrapper()] "
                f"[Request failed for ship id {ship_id}: {err}]")
            continue

        if response.status_code != 200:
            log("[Ships_web_scrapper.py] [web_scrapper()] "
                f"[No ship for ship id {ship_id}]")
            continue

        try:
            ship_details = parse_ship_details(parse_title(response))
        except (AttributeError, IndexError) as err:
            # Raised when the page has no title or the title does not have
            # the expected layout.
            log("[Ships_web_scrapper.py] [web_scrapper()] "
                f"[Unparsable page for ship id {ship_id}: {err}]")
            continue

        ship_details["ship_id"] = ship_id
        result.append(ship_details)
    return result

### 6. Insert ship data to db

In [36]:
def _ship_label(ship):
    """Describe *ship* for log messages; tolerate a missing ship or keys."""
    if ship is None:
        # The failure happened before the first ship was reached.
        return "imo n/a, mmsi n/a"
    return f"imo {ship.get('imo')}, mmsi {ship.get('mmsi')}"


def insert_ship_to_db(ships):
    """Insert ship records to db.

    Connects with ``MongoClient(access.update)``, pings the server, then
    inserts a copy of every ship dict into ``conn.one.ships`` with an added
    "lastUpdate" timestamp (current local time, microseconds dropped).
    Failures are logged rather than raised, and the connection is always
    closed.

    Args:
        ships: Iterable of ship dicts; "imo" and "mmsi" are used in log
            messages.

    Returns:
        None.

    Note:
        Relies on a ``log`` helper that is not defined in this section.
    """
    conn = MongoClient(access.update)
    # Last ship reached by the loop. It stays None when the failure happens
    # before the loop starts (e.g. the ping fails), so the handlers below
    # can still build a message instead of raising on an unbound name.
    ship = None
    try:
        conn.admin.command("ping")
        now = datetime.now().replace(microsecond=0)
        for ship in ships:
            # Work on a copy so the caller's dict does not gain "lastUpdate".
            ship_copy = ship.copy()
            ship_copy["lastUpdate"] = now
            cur = conn.one.ships.insert_one(ship_copy)
            if not cur.acknowledged:
                log("[Update ship location] [Insert ship to db] "
                    f"[Imo {ship.get('imo')} not inserted]")
    except ConnectionFailure:
        log("[Update ship location] [Insert ship to db] "
            f"[DB Connection failure for {_ship_label(ship)}]")
    except Exception as err:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit still propagate.
        log("[Update ship location] [Insert ship to db] "
            f"[{err} for {_ship_label(ship)}]")
    finally:
        conn.close()

### 7. Pipeline

In [34]:
# Build consecutive batches of ten ship ids ('0'-'9', '10'-'19', ...) and
# print each one; without the print every batch is overwritten unseen and
# only the last one survives the loop.
for idx, i in enumerate(range(10, 1000, 10)):
    ship_ids = [str(ii) for ii in range(idx * 10, i)]
    print(ship_ids)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
['10', '11', '12', '13', '14', '15', '16', '17', '18', '19']
['20', '21', '22', '23', '24', '25', '26', '27', '28', '29']
['30', '31', '32', '33', '34', '35', '36', '37', '38', '39']
['40', '41', '42', '43', '44', '45', '46', '47', '48', '49']
['50', '51', '52', '53', '54', '55', '56', '57', '58', '59']
['60', '61', '62', '63', '64', '65', '66', '67', '68', '69']
['70', '71', '72', '73', '74', '75', '76', '77', '78', '79']
['80', '81', '82', '83', '84', '85', '86', '87', '88', '89']
['90', '91', '92', '93', '94', '95', '96', '97', '98', '99']
['100', '101', '102', '103', '104', '105', '106', '107', '108', '109']
['110', '111', '112', '113', '114', '115', '116', '117', '118', '119']
['120', '121', '122', '123', '124', '125', '126', '127', '128', '129']
['130', '131', '132', '133', '134', '135', '136', '137', '138', '139']
['140', '141', '142', '143', '144', '145', '146', '147', '148', '149']
['150', '151', '152', '153', '154', '155', '1

In [12]:
# Ship ids "0" through "9" as strings; the bare name makes the notebook
# display the list.
ship_ids = list(map(str, range(10)))
ship_ids

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [18]:
# Scrape details for the prepared ids with web_scrapper, the scraping
# function defined in this notebook.
ships = web_scrapper(ship_ids)

In [19]:
# Store the scraped ship records in MongoDB.
insert_ship_to_db(ships)