In [1]:
import os
import json
import re
import pandas as pd
import numpy as np
from slugify import slugify
import frontmatter

# Absolute path of the current working directory (abspath('') resolves against the cwd),
# used as the anchor for every relative path below.
DIRNAME = os.path.abspath('')
# Input CSVs read by the loading cell: THETIS ship records and the two Wikidata exports.
THETIS_CSV_PATH = os.path.join(DIRNAME, "../files_computed/thetis_all_with_computed.csv")
WIKIDATA_SHIPS_CSV_PATH = os.path.join(DIRNAME, "../files_original/original.wikidata.ships.csv")
WIKIDATA_URLS_CSV_PATH = os.path.join(DIRNAME, "../files_original/original.wikidata.urls.csv")
# Directory holding the per-ship Markdown files; it is listed to find existing files
# and is also where newly generated ones are written.
WWW_SHIPS_DATA_PATH = os.path.join(DIRNAME, "../../www/views/ships")

In [2]:
# THETIS: read only the columns needed here, turn NaN into None, then collapse
# to one row per IMO with the 'first' aggregation.
thetis_columns = ["imo", 'name', "ship_type"]
df_thetis = (
    pd.read_csv(THETIS_CSV_PATH, usecols=thetis_columns, dtype={"imo": str})
    .replace({np.nan: None})
    .groupby('imo')
    .agg('first')
)

# Keep only the two ship types this notebook cares about.
passenger_ship_types = ["Ro-pax ship", "Passenger ship"]
is_passenger_ship = df_thetis["ship_type"].isin(passenger_ship_types)
df_thetis_filtered = df_thetis[is_passenger_ship]

# Wikidata: join the ship export with the URL export on IMO, then gather every
# column into a set of values per IMO.
df_wikidata_ships = pd.read_csv(WIKIDATA_SHIPS_CSV_PATH, dtype={"imo": str})
df_wikidata_urls = pd.read_csv(WIKIDATA_URLS_CSV_PATH, dtype={"imo": str})
df_wikidata = (
    df_wikidata_ships
    .merge(df_wikidata_urls, on="imo")
    .groupby('imo')
    .agg(set)
)

# Ships present in both the filtered THETIS data and Wikidata.
df_all = df_thetis_filtered.merge(df_wikidata, on="imo")
print(f"got {df_all.shape[0]} passenger ships with wikipedia page")

got 528 passenger ships with wikipedia page


In [9]:
# Ship Markdown files are named "<slug>-<digits>.md"; capture the trailing digits (the IMO).
md_filename_pattern = re.compile(r".*-(\d+)\.md")

filenames = os.listdir(WWW_SHIPS_DATA_PATH)
regex_matches = list(map(md_filename_pattern.match, filenames))
# Entries that do not follow the naming scheme yield None and are skipped.
existing_md_imos = [found.group(1) for found in regex_matches if found is not None]
print(f"found {len(existing_md_imos)} existing ships MD files")

found 512 existing ships MD files


In [10]:
# IMOs that have a Markdown file but no row in the (unfiltered) THETIS frame.
md_imos = set(existing_md_imos)
thetis_imos = set(df_thetis.index.to_list())
existing_md_files_without_thetis_match = md_imos - thetis_imos
print(f"found {len(existing_md_files_without_thetis_match)} ship MD files but could not find them back in THETIS: {existing_md_files_without_thetis_match}")

found 2 ship MD files but could not find them back in THETIS: {'9135963', '9832119'}


In [17]:
# Rows of df_all whose IMO (the index) has no Markdown file yet.
has_md_file = df_all.index.isin(existing_md_imos)
df_missing_md_files = df_all.loc[~has_md_file]
print(f"found {df_missing_md_files.shape[0]} missing MD files for ships both in thetis and wikipedia")
df_missing_md_files

found 31 missing MD files for ships both in thetis and wikipedia


Unnamed: 0_level_0,name,ship_type,wikidataUrl_x,mmsi,shipTypes,countryCode,imageUrl,beam,draft,maximumCapacity,...,operatorUrl,operatorCountryCode,operatorName,manufacturerUrl,manufacturerCountryCode,manufacturerName,wikidataUrl_y,item,wikipediaUrl,wikipediaLang
imo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8019356,NATIONAL GEOGRAPHIC EXPLORER,Passenger ship,{http://www.wikidata.org/entity/Q1408554},{309336000.0},"{cruise ship, motor ship}",{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{16.5},{4.74},{nan},...,{nan},{nan},{nan},{nan},{nan},{nan},"{9538, 9539, 9540, 9541, 9542, 9543}",{http://www.wikidata.org/entity/Q1408554},{https://de.wikipedia.org/wiki/National_Geogra...,"{en, nb, de}"
8420878,WIND STAR,Passenger ship,{http://www.wikidata.org/entity/Q3569268},{309163000.0},{ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,"{nan, nan, nan}","{nan, nan, nan}",{nan},...,{nan},{nan},{nan},{nan},{nan},{nan},"{7258, 7259, 7260}",{http://www.wikidata.org/entity/Q3569268},{https://en.wikipedia.org/wiki/Wind_Star_(ship...,"{en, fr, de}"
8501957,PRIDE OF YORK,Ro-pax ship,{http://www.wikidata.org/entity/Q11707704},{311063300.0},{ship},{nan},{nan},{nan},{nan},{nan},...,{nan},{nan},{nan},{nan},{nan},{nan},{5330},{http://www.wikidata.org/entity/Q11707704},{https://en.wikipedia.org/wiki/MS_Pride_of_York},{en}
8503797,PRIDE OF BRUGES,Ro-pax ship,{http://www.wikidata.org/entity/Q6718151},{244387000.0},{ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{nan},{nan},{nan},...,{nan},{nan},{nan},{https://www.jfe-holdings.co.jp/},{JP},{JFE Holdings},{17467},{http://www.wikidata.org/entity/Q6718151},{https://en.wikipedia.org/wiki/MS_Pride_of_Bru...,{en}
8700785,WIND SURF,Passenger ship,{http://www.wikidata.org/entity/Q3569271},{309242000.0},{cruise ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{20.0},{5.0},{308.0},...,{nan},{nan},{nan},{nan},{nan},{nan},"{7256, 7257, 7255}",{http://www.wikidata.org/entity/Q3569271},{https://en.wikipedia.org/wiki/Wind_Surf_(ship...,"{en, fr, de}"
8707343,STAR PRIDE,Passenger ship,{http://www.wikidata.org/entity/Q3476677},{311084000.0},{cruise ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,"{nan, nan}",{5.2},{208.0},...,{nan},{nan},{nan},{http://www.schichau-seebeck-shipyard.com},{DE},{Schichau Seebeckwerft},"{7492, 7493}",{http://www.wikidata.org/entity/Q3476677},"{https://fr.wikipedia.org/wiki/Seabourn_Pride,...","{fr, en}"
8807997,STAR BREEZE,Passenger ship,{http://www.wikidata.org/entity/Q166842},{311083000.0},{cruise ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{20.5},{5.5},{212.0},...,{http://www.seabourn.com/},{US},{Seabourn Cruise Line},{http://www.schichau-seebeck-shipyard.com},{DE},{Schichau Seebeckwerft},"{4043, 4044, 4045, 4046}",{http://www.wikidata.org/entity/Q166842},{https://lb.wikipedia.org/wiki/Seabourn_Spirit...,"{en, fr, lb, de}"
9000259,SUN PRINCESS,Passenger ship,{http://www.wikidata.org/entity/Q3006084},{310438000.0},{cruise ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,"{nan, nan, nan, nan, nan, nan, nan, nan, nan}","{nan, nan, nan, nan, nan, nan, nan, nan, nan}",{nan},...,{nan},{nan},{nan},{https://www.fincantieri.com},{IT},{Fincantieri},"{17903, 17904, 17905, 17906, 17907, 17908, 179...",{http://www.wikidata.org/entity/Q3006084},{https://ja.wikipedia.org/wiki/%E3%82%B5%E3%83...,"{de, pt, zh, en, id, ja, it, fa, fr}"
9007283,EUROPEAN SEAWAY,Ro-pax ship,"{http://www.wikidata.org/entity/Q83569203, htt...","{232001040.0, nan, nan, nan}",{ship},{nan},"{nan, http://commons.wikimedia.org/wiki/Specia...","{nan, nan, 28.28, nan}","{nan, nan, 6.25, nan}",{nan},...,{nan},{nan},{nan},"{nan, http://www.schichau-seebeck-shipyard.com}","{nan, DE}","{nan, Schichau Seebeckwerft}","{17404, 17405, 17406}",{http://www.wikidata.org/entity/Q6718060},{https://en.wikipedia.org/wiki/MS_European_Sea...,"{en, fr, de}"
9007295,PRIDE OF CANTERBURY,Ro-pax ship,{http://www.wikidata.org/entity/Q6718155},{232001060.0},{ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{28.3},{6.25},{nan},...,{nan},{nan},{nan},{http://www.schichau-seebeck-shipyard.com},{DE},{Schichau Seebeckwerft},"{17474, 17475, 17476}",{http://www.wikidata.org/entity/Q6718155},{https://fr.wikipedia.org/wiki/MS_Pride_of_Can...,"{en, fr, de}"


In [28]:
# Count ships per operator name.
# Each operatorName cell is a set that can mix real names with NaN placeholders,
# so taking an arbitrary element may hide a known operator. Pick the alphabetically
# first real (string) name instead; ships with no real name yield None, which
# value_counts() drops by default.
df_missing_md_files.operatorName.apply(
    lambda names: min((name for name in names if isinstance(name, str)), default=None)
).value_counts()

Princess Cruises        2
P&O Ferries             2
Seabourn Cruise Line    1
Netherlands             1
Holland America Line    1
Name: operatorName, dtype: int64

In [88]:
# Maps an operator name, as it appears in the Wikidata export, to the company
# identifier written into the ship's front matter. Two operator names may share
# one identifier (both Royal Caribbean entries do).
operator_mapping = {
    "Carnival Cruise Line": "carnival-cruise-line-us",
    "Royal Caribbean International": "royal-caribbean-international-us",
    "Celebrity Cruises": "celebrity-cruises-us",
    "Costa Cruises": "costa-cruises-it",
    "Holland America Line": "holland-america-line-us",
    "Viking Line": "viking-line-fi",
    "Color Line": "color-line-no",
    "Algérie Ferries": "algerie-ferries-dz",
    "Oceania Cruises": "oceania-cruises-us",
    "Peter Deilmann Cruises": "peter-deilmann-cruises-de",
    "Molslinjen": "molslinjen-dk",
    "Hurtigruten Group": "hurtigruten-group-no",
    "Hapag-Lloyd Kreuzfahrten GmbH": "hapag-lloyd-de",
    "MSC Cruises": "msc-cruises-it",
    "Royal Caribbean Group": "royal-caribbean-international-us",
    "Princess Cruises": "princess-cruises-us",
    "P&O Ferries": "p-o-ferries-gb",
    "Seabourn Cruise Line": "seabourn-cruise-line-us",
}

In [90]:
def get_wikipedia_url(item, preferred_langs=("fr", "en", "it", "es", "de")):
    """Pick one Wikipedia URL for a ship, favouring preferred languages.

    Args:
        item: mapping-like row (dict or pandas Series) whose 'wikipediaUrl'
            entry is an iterable of URLs. Non-string members (such as the NaN
            values that a set aggregation can carry) are ignored.
        preferred_langs: Wikipedia language codes, in order of preference.

    Returns:
        The URL hosted on the first preferred language edition that is
        available; otherwise the alphabetically first URL; None when the row
        holds no usable URL.
    """
    # Sort so the choice does not depend on set iteration order.
    urls = sorted(url for url in item['wikipediaUrl'] if isinstance(url, str))
    for lang in preferred_langs:
        # Anchor on the host so a language code appearing later in the URL
        # (for instance inside the article path) cannot produce a false match.
        host_pattern = re.compile(rf"https?://{re.escape(lang)}\.wikipedia\.org(?:/|$)")
        for url in urls:
            if host_pattern.match(url):
                return url
    return urls[0] if urls else None

# Write one front-matter-only Markdown file for every ship in df_missing_md_files.
for imo, item in df_missing_md_files.iterrows():
    slug = slugify(f"{item['name']}-{imo}", to_lower=True)
    # The operator set can mix real names with NaN placeholders and unmapped names.
    # Walk the real names in a stable order and take the first one that has a
    # known company identifier; company stays None when none of them is mapped.
    operator_names = sorted(name for name in item["operatorName"] if isinstance(name, str))
    company = next(
        (operator_mapping[name] for name in operator_names if name in operator_mapping),
        None,
    )
    md_data = {
        "layout": "ship",
        "tags": ["ship"],
        "imo": imo,
        "name": item["name"],
        "wikipediaUrl": get_wikipedia_url(item),
        "slug": slug,
        "company": company,
    }
    print(md_data)
    md_path = os.path.join(WWW_SHIPS_DATA_PATH, f"{slug}.md")
    # Explicit UTF-8 so the written bytes do not depend on the platform's locale encoding.
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(frontmatter.dumps(frontmatter.Post("", **md_data)))

}
{'layout': 'ship', 'tags': ['ship'], 'imo': '8916126', 'name': 'DIAGORAS', 'wikipediaUrl': 'https://ja.wikipedia.org/wiki/%E3%83%8B%E3%83%A5%E3%83%BC%E3%81%A8%E3%81%95', 'slug': 'diagoras-8916126', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '8919245', 'name': 'VASCO DA GAMA', 'wikipediaUrl': 'https://fr.wikipedia.org/wiki/MS_Statendam', 'slug': 'vasco-da-gama-8919245', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9000687', 'name': 'GEMINI', 'wikipediaUrl': 'https://en.wikipedia.org/wiki/MV_Gemini', 'slug': 'gemini-9000687', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9007491', 'name': 'CLUB MED 2', 'wikipediaUrl': 'https://fr.wikipedia.org/wiki/Club_Med_2', 'slug': 'club-med-2-9007491', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9010814', 'name': 'MAZOVIA', 'wikipediaUrl': 'https://de.wikipedia.org/wiki/Mazovia', 'slug': 'mazovia-9010814', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9050137', 'n