# Ship scraper for marinetraffic.com

### 1. Imports

In [2]:
import requests
import json
from datetime import datetime
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from bson.json_util import dumps
import access
from bs4 import BeautifulSoup

# Base URL of a ship details page; the ship id is appended to it when requesting.
URL = "https://www.marinetraffic.com/en/ais/details/ships/shipid:"

### Log function

In [3]:
def log(message):
    """Append one line to ``etl.log``: a local timestamp followed by *message*."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open("etl.log", "a") as log_file:
        # One entry per line: "<YYYY-MM-DD HH:MM:SS> <message>".
        log_file.write(" ".join((stamp, message)) + "\n")

### 2. Request web page

In [48]:
def request_web_page(ship_id, timeout=30):
    """Request the ship details page for ship_id.

    Args:
        ship_id: Ship identifier; it is converted to ``str`` and appended
            to ``URL``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would block the whole run.

    Returns:
        The ``requests`` response object, whatever its status code.

    Raises:
        requests.exceptions.RequestException: On connection problems or
            when the timeout expires.
    """
    return requests.get(
        URL + str(ship_id),
        # A browser-like User-Agent is sent with every request.
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )

In [49]:
# Try the request on a single ship id; the bare name displays the response.
ship_id = 1
response = request_web_page(ship_id)
response

<Response [404]>

### 3. Parse web page title

In [50]:
def get_page_title(response, ship_id):
    """Parse the title string from an html response.

    Args:
        response: Response object exposing ``status_code`` and ``text``.
        ship_id: Ship identifier, used only in log messages.

    Returns:
        The text of the page's ``<title>`` element, or ``False`` when the
        status code is not 200 or the page has no ``<title>``. Both
        failure cases are written to the log.
    """
    if response.status_code != 200:
        log("[ships_web_scrapper.py] [get_page_title()] "
            + f"[{response.status_code} for ship id {ship_id}]")
        return False

    soup = BeautifulSoup(response.text, "html.parser")
    # ``soup.title`` is None when the document has no <title> tag;
    # dereferencing ``.text`` on it would raise AttributeError.
    if soup.title is None:
        log("[ships_web_scrapper.py] [get_page_title()] "
            + f"[No title for ship id {ship_id}]")
        return False
    return soup.title.text

In [52]:
# Parse the title of the response fetched above (False when the status is not 200).
title = get_page_title(response, ship_id)
title

False

### 4. Parse ship details

In [54]:
def scrap_ship_details(title, ship_id):
    """Scrap ship details from web page title.

    The parser looks for these markers in the title, in this order:
    ``Ship <name> (<type>) Registered in <flag> - ... IMO <imo>,
    MMSI <mmsi>, Call Sign <call sign>``.

    Args:
        title: Page title string, or a falsy value when no title is
            available.
        ship_id: Ship identifier, stored in the result and used in log
            messages.

    Returns:
        ``False`` when *title* is falsy or lacks the "Registered in"
        marker; otherwise a dict with the keys ``ship_id``, ``name``,
        ``type``, ``flag``, ``imo``, ``mmsi`` and ``callSign``. ``imo``
        and ``mmsi`` are ints, set to 0 (and logged) when the extracted
        value is not a 7-digit / 9-digit number. ``type`` and
        ``callSign`` are empty strings when not present in the title.
    """
    if not title:
        return False

    where = "[ships_web_scrapper.py] [scrap_ship_details()] "

    reg_marker = "Registered in"
    reg_pos = title.find(reg_marker)
    if reg_pos == -1:
        # Without the anchor every slice below would be garbage (find()
        # returns -1), e.g. for a localized title.
        log(where + f"[Unrecognized title format for ship id {ship_id}]")
        return False
    reg_end = reg_pos + len(reg_marker)

    # "<name> (<type>)" sits between the leading "Ship " and the marker.
    ship_pos = title.find("Ship", 0, reg_pos)
    name_start = ship_pos + 5 if ship_pos != -1 else 0
    name_str = title[name_start:reg_pos].strip()
    name, _, type_part = name_str.partition("(")
    name = name.strip()
    # A title without "(...)" yields an empty type instead of IndexError.
    ship_type = type_part.split("(")[0].replace(")", "").strip()

    # The flag ends at the first " - " separator AFTER the marker, so a
    # hyphen inside the name, type or flag itself does not cut it short.
    sep_pos = title.find(" - ", reg_end)
    if sep_pos == -1:
        sep_pos = title.find("-", reg_end)
    flag = (title[reg_end:sep_pos] if sep_pos != -1 else title[reg_end:]).strip()

    # All remaining markers are searched after "Registered in" so that
    # look-alike text in the ship name cannot match first.
    imo_pos = title.find("IMO ", reg_end)
    mmsi_pos = title.find("MMSI ", reg_end)

    if imo_pos != -1 and imo_pos < mmsi_pos:
        imo = title[imo_pos + len("IMO"):mmsi_pos].strip(" ,")
    else:
        imo = ""
    if len(imo) == 7 and imo.isdigit():
        imo = int(imo)
    else:
        log(where + f"[Incorrect imo {imo} for ship id {ship_id}]")
        imo = 0

    # The value is read as a fixed-width field: a space plus 9 digits.
    mmsi = title[mmsi_pos + 4:mmsi_pos + 14].strip() if mmsi_pos != -1 else ""
    if len(mmsi) == 9 and mmsi.isdigit():
        mmsi = int(mmsi)
    else:
        log(where + f"[Incorrect mmsi {mmsi} for ship id {ship_id}]")
        mmsi = 0

    call_marker = "Call Sign"
    call_pos = title.find(call_marker, reg_end)
    call_sign = title[call_pos + len(call_marker):].strip() if call_pos != -1 else ""

    return {
        "ship_id": ship_id, "name": name, "type": ship_type, "flag": flag,
        "imo": imo, "mmsi": mmsi, "callSign": call_sign,
    }

In [55]:
# A falsy title short-circuits to False.
scrap_ship_details(title, ship_id)

False

### 5. Insert ships data to db

In [43]:
def insert_ship_to_db(ship):
    """Insert ship record to db.

    Pings the server, stamps the record with the current time under the
    ``update`` key (microseconds dropped) and inserts it into the
    ``ships`` collection of database ``one``. Failures are written to the
    log rather than raised.

    Args:
        ship: Dict with ship details; a falsy value is ignored and no
            connection is opened.
    """
    if not ship:
        return

    where = "[ships_web_scrapper.py] [insert_ship_to_db()] "
    # .get() keeps the log calls themselves from raising on a missing key.
    ship_ref = f"ship id {ship.get('ship_id')} imo {ship.get('imo')}"

    # Connect to database and update data
    conn = MongoClient(access.update)
    try:
        conn.admin.command("ping")
        ship["update"] = datetime.now().replace(microsecond=0)
        result = conn.one.ships.insert_one(ship)
        if not result.acknowledged:
            log(where + f"[Ship id {ship.get('ship_id')} imo {ship.get('imo')} not inserted]")
    except ConnectionFailure:
        log(where + f"[DB Connection failure for {ship_ref}]")
    except Exception as err:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit still stop the run instead of being logged and swallowed.
        log(where + f"[{err} for {ship_ref}]")
    finally:
        # Single cleanup point: the connection is closed on every path.
        conn.close()

In [44]:
# A falsy ship is ignored: the function returns before opening a connection.
insert_ship_to_db(False)

### 6. Pipeline

In [47]:
# Run the full request -> parse -> store chain for ship ids 0 through 9.
for ship_id in range(10):
    response = request_web_page(ship_id)
    # A non-200 response yields a False title, which yields a False ship,
    # which the insert step ignores.
    title = get_page_title(response, ship_id)
    ship = scrap_ship_details(title, ship_id)
    insert_ship_to_db(ship)
    
    

0
1
2
3
4
5
6
7
8
9


In [58]:
# Fetch one specific ship page through the "/ru/" (Russian) URL.
# NOTE(review): the original note here was just "-8214061"; presumably it
# refers to the negative MMSI that this page's title reports - confirm.
url = "https://www.marinetraffic.com/ru/ais/details/ships/shipid:9237/mmsi:0/imo:8214061/vessel:FRIENDLY_ZHEJIANG"
response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"}
    )


In [59]:
# Display the raw HTML of the response to inspect the page title.
response.text

'\n<!DOCTYPE html>\n<html lang=ru>\n<head>\n    <title>Детали судна: FRIENDLY ZHEJIANG (Bulk Carrier) - IMO 8214061, MMSI -8214061, Позывной HPNT Зарегистрировано в Panama </title>\n    <meta name="google-site-verification" content="aA4cdkT5SSWS_uycfNXSZ7UHHvJvnSTsEqJza8Wtfsc" />\n    <meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1">\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n    <meta name="apple-itunes-app" content="app-id=563910324">\n    <meta name="google-play-app" content="app-id=com.marinetraffic.android">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <meta name="description" content="Подробности о судне: FRIENDLY ZHEJIANG. Узнайте основную информацию о судне, включая номера IMO, MMSI и позывной. Тип: Bulk Carrier судно зарегистрировано в Panama. Узнайте дедвейт, валовую вместимость и год постройки. Подробности о FRIENDLY ZHEJIANG включают текущее местоположение судна, информацию о рейсе и фотографии