In [75]:
import os
import json
import re
import pandas as pd
import numpy as np
from slugify import slugify
import frontmatter

DIRNAME = os.path.abspath('')
THETIS_CSV_PATH = os.path.join(DIRNAME, "../files_computed/thetis_all_with_computed.csv")
WIKIDATA_SHIPS_CSV_PATH = os.path.join(DIRNAME, "../files_original/original.wikidata.ships.csv")
WIKIDATA_URLS_CSV_PATH = os.path.join(DIRNAME, "../files_original/original.wikidata.urls.csv")
WWW_SHIPS_DATA_PATH = os.path.join(DIRNAME, "../../www/views/ships")

In [27]:
df_thetis = pd.read_csv(THETIS_CSV_PATH, usecols=["imo", 'name', "ship_type"], dtype={"imo": str}).replace({np.nan: None}).groupby('imo').agg('first')
df_thetis_filtered = df_thetis[df_thetis.ship_type.isin(["Ro-pax ship", "Passenger ship"])]
df_wikidata_ships = pd.read_csv(WIKIDATA_SHIPS_CSV_PATH, dtype={"imo": str})
df_wikidata_urls = pd.read_csv(WIKIDATA_URLS_CSV_PATH, dtype={"imo": str})
df_wikidata = pd.merge(df_wikidata_ships, df_wikidata_urls, on="imo").groupby('imo').agg(set)
df_all = pd.merge(df_thetis_filtered, df_wikidata, on="imo")
print(f"got {df_all.shape[0]} passenger ships with wikipedia page")

got 497 passenger ships with wikipedia page


In [28]:
filenames = os.listdir(WWW_SHIPS_DATA_PATH)
regex_matches = [re.match(r".*-(\d+)\.md", filename) for filename in filenames]
existing_md_imos = [r.groups()[0] for r in regex_matches if r]
print(f"found {len(existing_md_imos)} existing ships MD files")

found 390 existing ships MD files


In [46]:
existing_md_files_without_thetis_match = set(existing_md_imos) - set(df_thetis.index.to_list())
print(f"found {len(existing_md_files_without_thetis_match)} ship MD files but could not find them back in THETIS: {existing_md_files_without_thetis_match}")

found 2 ship MD files but could not find them back in THETIS: {'9135963', '9832119'}


In [48]:
df_missing_md_files = df_all[~df_all.index.isin(existing_md_imos)]
print(f"found {df_missing_md_files.shape[0]} missing MD files for ships both in thetis and wikipedia")
df_missing_md_files.head()

found 122 missing MD files for ships both in thetis and wikipedia


Unnamed: 0_level_0,name,ship_type,wikidataUrl_x,mmsi,shipTypes,countryCode,imageUrl,beam,draft,maximumCapacity,...,operatorUrl,operatorCountryCode,operatorName,manufacturerUrl,manufacturerCountryCode,manufacturerName,wikidataUrl_y,item,wikipediaUrl,wikipediaLang
imo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6511128,RED STAR 1,Ro-pax ship,{http://www.wikidata.org/entity/Q19380740},{372589000.0},{ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{nan},{nan},{nan},...,{nan},{nan},{nan},{http://www.oundka.com/de/},{DE},{Orenstein & Koppel},{13406},{http://www.wikidata.org/entity/Q19380740},{https://no.wikipedia.org/wiki/MS_%C2%ABRed_St...,{nb}
7325629,OCEAN DIAMOND,Passenger ship,{http://www.wikidata.org/entity/Q3222300},{311063900.0},{cruise ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{16.03},{4.8},"{265.0, 250.0}",...,{nan},{nan},{nan},{nan},{nan},{nan},"{2668, 2669, 2670, 2671}",{http://www.wikidata.org/entity/Q3222300},{https://fa.wikipedia.org/wiki/%D8%A7%D9%88%D8...,"{de, fa, en, fr}"
7426045,KEFALONIA,Ro-pax ship,{http://www.wikidata.org/entity/Q47471106},{239386000.0},{ferry ship},{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{17.2},{5.2},{1100.0},...,{nan},{nan},{nan},{nan},{nan},{nan},{16861},{http://www.wikidata.org/entity/Q47471106},{https://de.wikipedia.org/wiki/Nissos_Kefalonia},{de}
7516773,GALAXY,Ro-pax ship,"{http://www.wikidata.org/entity/Q11764894, htt...","{210652000.0, nan, nan, nan}","{ship, ferry ship}",{nan},{nan},"{nan, nan, nan, nan, nan, nan}","{nan, nan, 5.42, nan}",{nan},...,{nan},{nan},{nan},{http://www.ssn.pl},{PL},{Szczecin Shipyard},"{14120, 18213, 18214}","{http://www.wikidata.org/entity/Q11764894, htt...","{https://de.wikipedia.org/wiki/Galaxy_(Schiff,...","{de, it, pl}"
7527887,KOPERNIK,Ro-pax ship,{http://www.wikidata.org/entity/Q11764890},{209896000.0},"{train ferry, ferry ship, motor ship}",{nan},{http://commons.wikimedia.org/wiki/Special:Fil...,{22.63},{5.8},{36.0},...,{nan},{nan},{nan},{nan},{nan},{nan},"{14571, 14572, 14573, 14574, 14575, 14576, 145...",{http://www.wikidata.org/entity/Q11764890},{https://de.wikipedia.org/wiki/Kopernik_(Schif...,"{de, fi, pl}"


In [88]:
operator_mapping = {
    "Carnival Cruise Line": "carnival-cruise-line-us",
    "Royal Caribbean International": "royal-caribbean-international-us",
    "Celebrity Cruises": "celebrity-cruises-us",
    "Costa Cruises": "costa-cruises-it",
    "Holland America Line": "holland-america-line-us",
    "Viking Line": "viking-line-fi",
    "Color Line": "color-line-no",
    "Algérie Ferries": "algerie-ferries-dz",
    "Oceania Cruises": "oceania-cruises-us",
    "Peter Deilmann Cruises": "peter-deilmann-cruises-de",
    "Molslinjen": "molslinjen-dk",
    "Hurtigruten Group": "hurtigruten-group-no",
    "Hapag-Lloyd Kreuzfahrten GmbH": "hapag-lloyd-de",
    "MSC Cruises": "msc-cruises-it",
    "Royal Caribbean Group": "royal-caribbean-international-us"
}

In [90]:
def get_wikipedia_url(item):
    for lang in ["fr", "en", "it", "es", "de"]: # preffered languages
        for url in item['wikipediaUrl']:
            if re.match(rf".*{lang}\.wikipedia\.org.*", url):
                return url
    if len(item['wikipediaUrl']) > 0:
        return list(item['wikipediaUrl'])[0]

for imo, item in df_missing_md_files.iterrows():
    slug = slugify(f"{item['name']}-{imo}", to_lower=True)
    company = ""
    if len(item["operatorName"]) > 0:
        company = operator_mapping.get(list(item["operatorName"])[0])
    md_data = {
        "layout": "ship",
        "tags": ["ship"],
        "imo": imo,
        "name": item["name"],
        "wikipediaUrl": get_wikipedia_url(item),
        "slug": slug,
        "company": company,
    }
    print(md_data)
    with open(os.path.join(WWW_SHIPS_DATA_PATH, f"{slug}.md"), "w") as f:
        f.write(frontmatter.dumps(frontmatter.Post("", **md_data)))

}
{'layout': 'ship', 'tags': ['ship'], 'imo': '8916126', 'name': 'DIAGORAS', 'wikipediaUrl': 'https://ja.wikipedia.org/wiki/%E3%83%8B%E3%83%A5%E3%83%BC%E3%81%A8%E3%81%95', 'slug': 'diagoras-8916126', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '8919245', 'name': 'VASCO DA GAMA', 'wikipediaUrl': 'https://fr.wikipedia.org/wiki/MS_Statendam', 'slug': 'vasco-da-gama-8919245', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9000687', 'name': 'GEMINI', 'wikipediaUrl': 'https://en.wikipedia.org/wiki/MV_Gemini', 'slug': 'gemini-9000687', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9007491', 'name': 'CLUB MED 2', 'wikipediaUrl': 'https://fr.wikipedia.org/wiki/Club_Med_2', 'slug': 'club-med-2-9007491', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9010814', 'name': 'MAZOVIA', 'wikipediaUrl': 'https://de.wikipedia.org/wiki/Mazovia', 'slug': 'mazovia-9010814', 'company': None}
{'layout': 'ship', 'tags': ['ship'], 'imo': '9050137', 'n