# Scraping data from VesselFinder and MarineTraffic

## Get all XHR requests

In [None]:
import asyncio
from playwright.async_api import async_playwright
import re
import requests

In [None]:
# Open the MarineTraffic map in a Playwright-driven browser and record every
# request the page issues while it loads.
# NOTE: top-level `async with` / `await` needs an environment that already runs
# an event loop (such as a notebook cell); it is a SyntaxError in a plain script.
async with async_playwright() as p:
 # Launches the Chrome binary found at this hard-coded macOS path
 browser = await p.chromium.launch(executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')
 page    = await browser.new_page()

 # Route every URL through a handler that simply lets the request continue
 await page.route('**/*', lambda route, request: route.continue_())

 # Record each request the page issues.
 # NOTE: nothing here filters by resource type, so despite its name
 # `xhr_requests` holds every request, not only XHR ones.
 xhr_requests = []
 page.on('request', lambda request: xhr_requests.append(request))

 # Navigate to the web page (must happen after the listener is registered)
 await page.goto('https://www.marinetraffic.com/en/ais/home/centerx:-17.9/centery:22.8/zoom:2')

 # Fixed pause so that requests fired after the initial load are recorded too
 await page.wait_for_timeout(5000)  # Wait for 5 seconds (adjust as needed)

 # Print the URL of every recorded request
 for request in xhr_requests:
     print(request.url)

 await browser.close()

In [None]:
def get_AIS_XHR(XHRs: list, zoom: int = 2) -> list:
    """Return the URLs of the captured requests that hit the AIS data endpoint.

    A URL is kept when it contains
    ``/get_data_json_4/z:<zoom>/X:<digits>/Y:<digits>/station:0``.

    Args:
        XHRs: Objects exposing a ``url`` string attribute (for example the
            requests recorded by a Playwright ``request`` listener).
        zoom: Value expected in the ``z:`` path segment. Defaults to 2, the
            value that used to be hard-coded in the pattern.

    Returns:
        The matching URLs, in the order they appear in ``XHRs``.
    """
    # Compiled once per call; only the literal prefix needs escaping.
    url_pattern = re.compile(
        re.escape(f'/get_data_json_4/z:{int(zoom)}/X:') + r'\d+/Y:\d+/station:0'
    )

    return [request.url for request in XHRs if url_pattern.search(request.url)]

In [None]:
# Keep only the AIS data URLs among the requests recorded in `xhr_requests`.
ais = get_AIS_XHR(xhr_requests)

In [None]:
# Browser-like headers sent with the request below.
headers = {
    # Built with implicit string concatenation: a backslash continuation inside
    # the literal would embed the next line's indentation in the header value.
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 '
        'Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

# Fail with an explicit message instead of a bare IndexError on `ais[0]`.
if not ais:
    raise RuntimeError("No AIS data URL was captured; re-run the capture cell first.")

session = requests.Session()
# Replay the first captured AIS URL through `requests`; the timeout keeps a
# stalled connection from blocking the notebook indefinitely.
response = session.get(ais[0], params=None, headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Display the 'data' entry of the decoded JSON response.
response.json()['data']

## Static Content Utilities

In [None]:
import urllib.request
import os, bs4
import pandas as pd

from bs4 import BeautifulSoup
# locale.setlocale(locale.LC_ALL, 'es_ES')  # as we need to deal with names of months later on.

In [None]:
def get_static_content(base_url: str, headers: dict, timeout: float = 30):
    """Download a page and return its section titles and parameter tables.

    Args:
        base_url: URL of the page to fetch.
        headers: HTTP request headers. ``None`` is treated as "no extra
            headers".
        timeout: Seconds passed to ``urlopen`` as its timeout.

    Returns:
        A ``(titles, tables)`` tuple: the result of ``find_all`` for ``<h2>``
        elements with class ``bar`` and for ``<table>`` elements with class
        ``aparams`` or ``tparams``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # Request() iterates over the header mapping, so None must become {}.
    req = urllib.request.Request(base_url, None, headers or {})

    with urllib.request.urlopen(req, timeout=timeout) as response:
        page_content = response.read()

    parsed_html = BeautifulSoup(page_content, 'html.parser')

    return (
        parsed_html.find_all("h2", class_=['bar']),
        parsed_html.find_all("table", class_=['aparams', 'tparams']),
    )

def scrape_static_table(titles: "bs4.element.ResultSet", tables: "bs4.element.ResultSet", idx: "Sequence[int]" = (0, 3, 3)) -> dict:
    """Turn two-column HTML tables into nested ``{title: {key: value}}`` dicts.

    Args:
        titles: Title elements; each must expose a ``text`` attribute.
        tables: Table elements; each must expose ``find_all``.
        idx: For the n-th table, the position in ``titles`` of the title it
            belongs to. Only the first ``len(idx)`` tables are read (three
            with the default).

    Returns:
        A dict keyed by stripped title text. Each value maps the stripped
        text of a row's first ``<td>`` to the stripped text of its second.
        Tables that share a title are merged into one entry.

    Raises:
        IndexError: If ``idx`` points past the end of ``titles``.
    """
    D = {}

    for title_pos, table in zip(idx, tables):
        section = titles[title_pos].text.strip()
        # setdefault rather than assignment: with the default idx two tables
        # map to the same title, and re-assigning {} would drop the first one.
        section_rows = D.setdefault(section, {})

        for row in table.find_all('tr'):
            columns = row.find_all('td')
            # Rows without a key/value pair (e.g. header rows) are skipped
            # instead of aborting the whole page with an IndexError.
            if len(columns) < 2:
                continue
            section_rows[columns[0].text.strip()] = columns[1].text.strip()

    return D

def get_vesselfinder_static(IMOs: list, headers: dict = None, limit: int = 10) -> tuple:
    """Scrape the static details page of each vessel on vesselfinder.com.

    Args:
        IMOs: IMO numbers; each is interpolated into the details-page URL.
        headers: HTTP request headers. ``None`` means no extra headers.
        limit: Only ``IMOs[:limit]`` is processed. Pass ``None`` to process
            the whole list.

    Returns:
        A ``(vf_details, exceptions)`` tuple. ``vf_details`` maps each
        successfully scraped IMO to its parsed tables; ``exceptions`` maps
        each failed IMO to the exception that was raised for it.
    """
    # Normalise once so the downloader never receives None as a header mapping.
    request_headers = {} if headers is None else headers

    vf_details = {}
    exceptions = {}

    for j, IMO in enumerate(IMOs[:limit]):
        base_url = f"https://www.vesselfinder.com/en/vessels/details/{IMO}"

        # Deliberately broad: one failing vessel must not abort the batch.
        # The exception is reported and kept for later inspection.
        try:
            static_content = get_static_content(base_url, request_headers)
            vf_details[IMO] = scrape_static_table(*static_content)
        except Exception as e:
            print(f"An exception occurred for {IMO}: {e}")
            exceptions[IMO] = e

        # '\r' rewrites the same console line to act as a progress counter.
        print(f"{j+1} Vessels Completed", end='\r', flush=True)

    return vf_details, exceptions

## Initialization

In [None]:
# Browser-like headers used for the vesselfinder requests.
headers = {
    # Built with implicit string concatenation: a backslash continuation inside
    # the literal would embed the next line's indentation in the header value.
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 '
        'Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

data_path = os.path.join(os.getcwd(), "..", "data", "imo-vessel-codes.csv")

# Stop here when the file is missing: carrying on would either fail on `df`
# with a NameError or silently reuse a stale `df` from an earlier run.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"The CSV file '{data_path}' does not exist.")

# Read the CSV file into a DataFrame
df = pd.read_csv(data_path)

IMOs = list(df.imo.unique())  # There are multiple MMSI for an IMO

## Static Content Scraping

In [None]:
# get_vesselfinder_static returns a (details, exceptions) pair; unpack it so
# `vf_details` holds only the scraped tables and failures stay inspectable.
vf_details, vf_exceptions = get_vesselfinder_static(IMOs, headers)

## Dynamic Content Utilities

In [None]:
import requests
import json

In [None]:
def get_dynamic_table(api_url: str, headers: dict = None, params: dict = None, timeout: float = 30) -> dict:
    """GET ``api_url`` through the notebook-level ``session`` and decode the JSON body.

    Args:
        api_url: URL to request.
        headers: Optional HTTP request headers.
        params: Optional query-string parameters.
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled connection cannot block forever.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.HTTPError: If the response carries an HTTP error status.
    """
    response = session.get(api_url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

## Dynamic Content Scraping

In [None]:
# Vessel identifier interpolated into the API URL below.
MMSI = 228386800

# Per-vessel endpoint; the URL already ends with a bare `?d` query string.
api_url = f"https://www.vesselfinder.com/api/pub/pcext/v4/{MMSI}?d"

# No additional query parameters are sent.
params = {}

# Uses the `session` and `headers` objects defined in earlier cells.
response = session.get(api_url, params=params, headers=headers)
response.raise_for_status()

In [None]:
# Display the decoded JSON body of the response.
response.json()

In [None]:
# base_url = f"https://www.vesselfinder.com/en/vessels/"

# headers = {
#  'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) \
#                 Chrome/91.0.4472.124 \
#                 Safari/537.11',
#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#  'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
#  'Accept-Encoding': 'none',
#  'Accept-Language': 'en-US,en;q=0.8',
#  'Connection': 'keep-alive'
# }

# session = requests.Session()

# response = session.get(base_url, verify=True, headers=headers)
# response.raise_for_status()

# Example with Dynamic Content

In [None]:
# stackoverflow.com/a/61051360

import requests
import re, json

session = requests.Session()

# Query-string parameters for the catalog search request below.
params = {
    "Category": "1",
    "salesTypeFilter": "1",
    "sortType": "4",
}

url = "https://www.roblox.com/catalog/"

# Load the catalog page first: its HTML is searched for the CSRF token.
response = session.get(url, timeout=30)
response.raise_for_status()

# token_pattern = r'<meta name="csrf-token" data-token="(?P<csrf_token>[^"]+)" />'
token_pattern = r'data-token="(?P<csrf_token>[^"]+)"'

match = re.search(token_pattern, response.text)
# re.search returns None when the page carries no token; fail with a clear
# message instead of an AttributeError on `.group`.
if match is None:
    raise RuntimeError(f"No CSRF token found in the page at {url}")
csrf_token = match.group("csrf_token")

api_url = "https://catalog.roblox.com/v1/search/items"

response = session.get(api_url, params=params, headers={}, timeout=30)
response.raise_for_status()

# Copy each returned item and add a "key" of the form "<itemType>_<id>".
assets = {"items": [{**d, "key": f"{d['itemType']}_{d['id']}"} for d in response.json()["data"]]}

In [None]:
# URL that the `assets` payload is POSTed to.
url = "https://catalog.roblox.com/v1/catalog/items/details"

# NOTE: this rebinds the notebook-level `headers` name; the CSRF token read in
# the previous cell is sent along with a JSON content type.
headers = {
    "Content-Type": "application/json;charset=UTF-8",
    "X-CSRF-TOKEN": csrf_token
}

# The body is `assets` serialised to a JSON string, sent through the same session.
response = session.post(url, data=json.dumps(assets), headers=headers)
response.raise_for_status()

# Keep the "data" entry of the decoded JSON response.
items = response.json()["data"]

In [None]:
# Inspect the first returned item (raises IndexError if `items` is empty).
first_item = items[0]

# Print one "key: value" line per field of that item.
for key, value in first_item.items():
 print(f"{key}: {value}")

# Obsolete

In [None]:
# links = parsed_html.find_all("a")
# token_pattern = r'/vessels/details/(\d+)'
# match = re.findall(token_pattern, str(links))

In [None]:
import requests
import re, json

# Plain string: the URL has no placeholders, so no f-string is needed.
base_url = "https://www.vesselfinder.com/en/vessels/"

# Browser-like headers sent with the request below.
headers = {
    # Built with implicit string concatenation: a backslash continuation inside
    # the literal would embed the next line's indentation in the header value.
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 '
        'Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

session = requests.Session()

# The timeout keeps a stalled connection from blocking the notebook indefinitely.
response = session.get(base_url, verify=True, headers=headers, timeout=30)
response.raise_for_status()

In [None]:
# Same endpoint as the dynamic-scraping cell, with the vessel id written inline.
api_url = "https://www.vesselfinder.com/api/pub/pcext/v4/228386800?d"

# No additional query parameters are sent.
params = {}

# Uses the `session` and `headers` objects created in the previous cell.
response = session.get(api_url, params=params, headers=headers)
response.raise_for_status()

# Display the decoded JSON body of the response.
response.json()

In [None]:
# # Define functions to convert DMS to DD
# def dms2dd(degrees, direction):
#  dd = float(degrees)
#  if direction in ['S', 'W']:
#   dd *= -1
#  return dd

# def parse_dms(dms):
#  parts = re.split(r'[°\'"]+', dms)
#  degrees = float(parts[0])
#  minutes = float(parts[1])
#  seconds = 0.0 if len(parts) < 3 else float(parts[2])
#  direction = parts[-1]
#  dd = degrees + minutes / 60.0 + seconds / 3600.0
#  return dms2dd(dd, direction)

# for table in tables:
#  if table.find_parent("table") is None:
#   for row in table.find_all('tr'):
#    cells = row.find_all('td')
#    if len(cells) == 2:
#     key = cells[0].get_text(strip=True)
#     value = cells[1].get_text(strip=True)

#     # Define a mapping of table keys to vessel_info keys
#     key_mapping = {
#      "Vessel Name": "Name",
#      "Coordinates": "Coordinates",
#      "Position received": "Position Received"
#     }

#     # Check if the key is in the mapping
#     if key in key_mapping:
#      field_name = key_mapping[key]

#      # Special case for 'Position received' to parse the datetime
#      if field_name == "Position Received":
#       position_time_str = cells[1]['data-title']
#       position_time = datetime.strptime(position_time_str, '%b %d, %Y %H:%M %Z')
#       vessel_info[field_name] = position_time
#      else:
#       vessel_info[field_name] = value