In [1]:
import requests
import pandas as pd
import json
from tqdm import tqdm
import re

# Metadata

Download date: 25-08-2022 (DD-MM-YYYY). Source: EMSA public occurrence portal (portal.emsa.europa.eu).

## Get data

In [None]:
# METHOD 1: read the EMSA accident listing that was saved to disk as JSON.
# The last expression displays how many top-level entries were loaded.

with open('/Users/dt/Documents/Projecten/northsea/data/accidents/emsa_accidents.json') as emsa_json:
    files = json.loads(emsa_json.read())
len(files)

In [None]:
# Collect the occurrence UUID of every preview found in every loaded entry,
# then display how many were collected.

ids = [
    preview['occurrenceUuid']
    for entry in files
    for preview in entry['occurrencePreviews']
]
len(ids)

In [None]:
# METHOD 2: import from the har file (preferred)
# The context manager closes the handle as soon as the text has been read
# (it was previously left open), and the encoding is pinned to UTF-8 so
# decoding does not depend on the platform's locale default.

with open('/Users/dt/Documents/Projecten/northsea/data/accidents/portal.emsa.europa.eu.har',
          encoding='utf-8') as accidents:
    file = accidents.read()

In [None]:
# Extract ids
# The pattern targets the escaped-JSON form found inside the HAR text
# (quotes appear as \"), capturing the UUID that sits between the
# occurrenceUuid and casualtyReportNr keys.

pattern = r'occurrenceUuid\\":\\"([0-9a-z-]+)\\",\\"casualtyReportNr\\'

res = re.findall(pattern, file)

# De-duplicate, then sort: iterating a set of strings yields a different
# order on each interpreter run, which would make the download order and the
# resulting row order non-reproducible.
ids = sorted(set(res))
len(ids)

In [None]:
# Get the data from EMSA
# One GET per occurrence id; the raw response body (text) is kept so it can
# be written to disk unchanged and parsed later.

results = []

url = 'https://portal.emsa.europa.eu/emcip-open-rest-api/public-occurrence/'

# A single Session reuses the underlying connection across requests.
with requests.Session() as session:
    for id_ in tqdm(ids):
        # Without a timeout a stalled connection would hang the loop forever.
        r = session.get(url + id_, timeout=30)
        # Fail loudly on HTTP errors instead of storing an error page as if
        # it were occurrence data (it would only blow up later when parsed).
        r.raise_for_status()
        result = r.text
        results.append(result)

len(results)

In [None]:
# Persist the downloaded response bodies as a JSON array on disk.
# Each element of `results` is itself the text of one response, so the file
# is a list of strings.

with open('/Users/dt/Documents/Projecten/northsea/data/accidents/results.json', 'w') as outfile:
    outfile.write(json.dumps(results))
    

## Parse data

In [None]:
# Load the previously saved response bodies back into `results`.

with open('/Users/dt/Documents/Projecten/northsea/data/accidents/results.json', 'r') as file:
    results = json.loads(file.read())

In [None]:
# Helper for pulling nested labels out of a parsed JSON document.
# It only returns scalar values: when the value stored under `key` is itself
# a dict or list, the walker descends into it instead of returning it. That is
# why it cannot be used for the 'values' entries (one-element lists); those
# are extracted with regular expressions further down.

def json_extract(obj, key):
    """Return every non-container value stored under `key` in a nested JSON tree.

    The tree is walked depth-first in document order. Dict values that are
    dicts or lists are descended into (and never returned themselves); any
    other value is returned when its dict key equals `key`. Input that is
    neither a dict nor a list yields an empty list.
    """

    def walk(node):
        """Yield matching values found beneath `node`."""
        if isinstance(node, dict):
            for name, child in node.items():
                if isinstance(child, (dict, list)):
                    yield from walk(child)
                elif name == key:
                    yield child
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(obj))

In [402]:
# Build one dict per occurrence: {escaped label -> extracted value or None}.
# Labels come from json_extract; values are pulled with a regex from the
# string form of the parsed record, because the 'values' entries are lists
# and json_extract does not return container values.
rows = []

for result in tqdm(results):
    result = json.loads(result)

    # The regexes below search the text representation of the record. It is
    # the same for every label, so build it once per record.
    result_text = str(result)

    row = {}

    for label in json_extract(result, 'label'):
        # Escape the label so it can be embedded literally in a regex. The
        # escaped form is also used as the row key, so column names carry
        # backslashes until they are stripped in the clean-up step.
        label = re.escape(label)
        if 'itude' in label:
            # Latitude / longitude values contain a single quote, so their
            # string form is wrapped in double quotes.
            value_pattern = label + r"\', \'values\': \[\"([A-Za-z0-9- °'.,\/?+=$_<>()#%]+)\"]\},"
        elif 'Time (LT)' in label:
            # NOTE(review): this test runs on the *escaped* label, in which
            # spaces and parentheses are backslash-prefixed, so this branch
            # looks unreachable; time values are currently matched by the
            # generic pattern below (its character class includes ':').
            # Left as-is to keep the extracted values unchanged.
            value_pattern = label + r"\', \'values\': \[\'(\d{2}:\d{2})\'\]\},"
        else:
            value_pattern = label + r"\', \'values\': \[\'([A-Za-z0-9- °':.,\/?+=$_<>()#%]+)\'\]\},"

        # First match wins; a label without a matching value maps to None.
        # An explicit no-match check replaces the former bare `except:`,
        # which also swallowed unrelated errors and KeyboardInterrupt.
        match = re.search(value_pattern, result_text, flags=re.UNICODE)
        row[label] = match.group(1) if match else None

    rows.append(row)
        

100%|██████████████████████████████████████| 3658/3658 [00:12<00:00, 285.76it/s]


In [406]:
# Turn the list of per-occurrence dicts into a table (one row per dict).
df = pd.DataFrame(rows)
# Preview two randomly chosen rows; no random_state is passed, so the rows
# shown differ between runs.
df.sample(2)

Unnamed: 0,Occurrence,Casualty\ Report\ Nr\.,Directive\ 2009/18,State\ Reporting,Competent\ authority,Investigation\ Status,Occurrence\ severity,Date\ of\ occurrence,Time\ \(LT\)\ of\ occurrence,Latitude,...,Narratives,Description,Safety\ Recommendation\ \(SR\),Recommendation,SR\ Coding,Addressee\(s\)\ coding,Action\ Taken,AT\ Description,AT\ Coding,Taken\ by\ \(coding\)
2726,,2021/005848,Yes,Denmark,Denmark - DMAIB,Not Investigated,Serious,2021-10-30,11:00,57°42' N,...,,,,,,,,,,
1414,,38/2012,Yes,Denmark,Denmark - DMAIB,Finished,Serious,2011-12-18,13:15,54°24.08' N,...,,,,The Danish Maritime Accident Investigation Boa...,Ship related procedures - Inspection,,,,,


In [408]:
# Clean it up a bit

# Rename columns: lower-case, spaces -> underscores, and strip the
# backslashes that the regex-escaped labels left in the column names.
# The space replacement is a literal one, so regex=False is stated explicitly.

df.columns = (
    df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace(r'\\', '', regex=True)
)

# Drop first column
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.

df = df.drop(columns=['occurrence'], errors='ignore')

# Convert date to datetime

df['date_of_occurrence'] = pd.to_datetime(df['date_of_occurrence'])

# Result

df.sample(2)

Unnamed: 0,casualty_report_nr.,directive_2009/18,state_reporting,competent_authority,investigation_status,occurrence_severity,date_of_occurrence,time_(lt)_of_occurrence,latitude,longitude,...,narratives,description,safety_recommendation_(sr),recommendation,sr_coding,addressee(s)_coding,action_taken,at_description,at_coding,taken_by_(coding)
2620,2022/006961,Yes,Greece,Greece - HBMCI,Not Investigated,Marine incident,2022-01-22,20:00,40°38.063' N,22°55.384' E,...,,,,,,,,,,
3263,2020/006665,Yes,Malta,Malta - MSIU,Finished,Less Serious,2020-10-03,11:02,37°34.58' N,23°44.55' E,...,,Frequency of man overboard drills will be carr...,,,,,,Frequency of man overboard drills will be carr...,"Human Factors - Training, skills, experience",


In [409]:
# Save the DataFrame as CSV; index=False leaves the row index out of the file.
df.to_csv('/Users/dt/Documents/Projecten/northsea/data/accidents/accidents.csv', index=False)

## Analyze data

In [444]:
# Top 10 reporting states by number of occurrences whose severity is 'Serious'.
(
    df.loc[df['occurrence_severity'] == 'Serious']
    .groupby(['state_reporting'])['occurrence_severity']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(10)
)

Unnamed: 0,state_reporting,count
17,Malta,231
7,France,169
23,Spain,162
4,Denmark,82
25,United Kingdom,72
8,Germany,68
10,Greece,66
18,Netherlands,63
21,Portugal,31
11,Iceland,28


In [434]:
# Show the column names of the DataFrame.
df.columns

Index(['casualty_report_nr.', 'directive_2009/18', 'state_reporting',
       'competent_authority', 'investigation_status', 'occurrence_severity',
       'date_of_occurrence', 'time_(lt)_of_occurrence', 'latitude',
       'longitude', 'nr._ships_involved', 'sea_area_of_occurrence',
       'port_of_accident', 'national_location', 'lives_lost_occurrence-total',
       'people_injured_occurrence-total', 'third_party/other_damage',
       'sar_intervention', 'weather_and_environment_-_details', 'wind_force',
       'sea_state', 'natural_light', 'visibility', 'weather_conditions',
       'vessel', 'ship_/_craft_type', 'voyage_type_(certified)',
       'port_of_departure', 'port_of_destination', 'ship_operation',
       'voyage_segment', 'ship’s_routeing', 'did_the_ship_sink?',
       'loss_/_damage_to_ship_or_equipment', 'cargo_damage',
       'pollution_(bunkers)', 'poll._quantity/bunker', 'pollution_(cargo)',
       'poll._quantity/cargo', 'place_on_board', 'casualty_event',
       'occur

In [442]:
# Number of occurrences per investigation status.
df['investigation_status'].value_counts()

Not Investigated    2218
Finished            1365
Ongoing               57
To be decided          5
Name: investigation_status, dtype: int64