# Notebook for doing pirate stuff with Pandas Dataframes

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# Install spacy and language model (uncomment if needed)
# !conda install -c conda-forge spacy -y
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !conda install torchvision -y
# !conda install conda-forge::cupy -y

In [32]:
# Import statements
# import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
# from spacy.matcher import Matcher
from spacy import displacy
from sklearn.metrics import confusion_matrix, classification_report
from IPython.display import HTML as html_print
from spacy_model import generate_matcher, html_generator, custom_matcher, model_interpreter
import importlib

### Original Data that Mike found:

In [9]:
# Read clean csv
piracy_df_original = pd.read_csv('Data_Files/[Clean] IMO Piracy - 2000 to 2022 (PDV 01-2023).csv')

# Convert dates to datetime object
piracy_df_original['Incident Date'] = pd.to_datetime(piracy_df_original['Incident Date'], format='%m/%d/%Y')

# pd.concat([piracy_df_original.head(), piracy_df_original.tail()])
piracy_df_original

Unnamed: 0,Incident Date,Ship Name,Ship Flag,Ship Type,Area,Latitude,Longitude,Consequences to Crew,Part of Ship Raided,Ship Status,Weapons Used,Flag - Crew Injuries,Flag - Crew Held Hostage,Flag - Crew Missing,Flag - Crew Deaths,Flag - Crew Assaulted
0,2010-03-18,AL-ASA'A,Yemen,Dhow,In international waters,,,Ship Hijacked,Not Stated,Not Stated,None or Not Reported,False,True,False,False,False
1,2010-05-25,AL JAWAT,Yemen,Dhow,In international waters,,,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,False,False,False,False
2,2011-02-13,AL FARDOUS,Yemen,Fishing vessel,In territorial waters,,,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,False,False,False,False
3,2011-04-16,ABDI KHAN,Yemen,Fishing vessel,In international waters,11.900000,54.083333,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,True,False,False,False
4,2012-01-14,AL WASIL,Yemen,Dhow,In international waters,,,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4809,2009-12-30,GULF ELAN,Bahamas,Chemical tanker,In port area,22.690000,113.696667,No Consequences or Not Stated,Engine Room,At Anchor,None or Not Reported,False,False,False,False,False
4810,2008-11-07,CEC FUTURE,Bahamas,General cargo ship,In international waters,12.766667,45.933333,Ship Hijacked,Engine Room,Steaming,None or Not Reported,False,True,False,False,False
4811,2006-02-13,ASPEN ARROW,Bahamas,General cargo ship,In port area,,,No Consequences or Not Stated,Engine Room,At Anchor,None or Not Reported,False,False,False,False,False
4812,2009-10-24,ELLEN S,Antigua and Barbuda,Container ship,In territorial waters,20.641667,106.880000,Not Reported,Engine Room,At Anchor,None or Not Reported,False,False,False,False,False


In [10]:
# Go forth and do great things

### New data straight from the International Maritime Organization:
Note: This data is uncleaned and is dirtier than the decks of the Flying Dutchman.

In [64]:
# Read dirty csv
piracy_df_imo = pd.read_csv('Data_Files/[Dirty]_ListOfIncidents_IMO.csv')

# Drop columns we don't need
piracy_df_imo.drop(columns=['Boarded?', 'MSC/Circ', 'Coastal State Action Taken'], inplace=True)

# Ensure no NaNs in Lat/Lon and Incident details columns
piracy_df_imo = piracy_df_imo[piracy_df_imo.loc[:,'Incident details'].notna()]

# Convert Date column to DateTime Objects
piracy_df_imo['Date'] = pd.to_datetime(piracy_df_imo['Date'], format='%m/%d/%y')

# Show whatever you want
# pd.concat([piracy_df_imo.head(), piracy_df_imo.tail()])
piracy_df_imo

Unnamed: 0,Date,Ship Name,Ship Type,IMO No.,Area,Latitude,Longitude,Incident details,Consequences for crew etc,Action taken by master/crew,Reported?,Reported to...,Reporting State
0,1994-07-22,PAVELS STERNBERGS,Reefer,7362366,In territorial waters,,,Ship boarded by seven men armed with big cable...,Deck watchman was slightly wounded and some sh...,Chief officer and other crew members came to t...,True,Incident reported to Port Authorities,Latvia
1,1994-09-09,BONSELLA,,,In territorial waters,,,Twenty-six bandits posing as Coast Guard hijac...,Ship's cargo and money stolen,-,True,Yes,United States
2,1994-10-23,SIBOELF,Ore/Bulk/oil carrier,9011935,In port area,,,"6-7 pirates wearing masks, armed with pistols ...",Personal belongings and cash stolen from crew,The watchman saw the pirates and informed term...,True,Terminal informed,Norway
3,1994-10-26,TROPICAL SUN,,,In port area,,,Ship attacked with mortar shells which fell ab...,,-,True,Yes,United States
4,1994-11-17,ANOMIS,,7233711,In territorial waters,,,Boat opened fire on ship after trying unsucces...,-,-,True,Yes,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8551,2024-01-09,CMB Chikako,Bulk carrier,9701190,In territorial waters,1° 03.00' N,103° 39.59' E,"Five robbers armed with a knife, boarded a shi...",The robbers took hostage and tied up one of th...,Alarm raised and crew mustered,True,VTIS Singapore,
8552,2024-01-12,Solar Roma,Product tanker,9887372,In port area,1° 43.29' N,101° 25.72' E,Duty security patrol onboard an anchored tanke...,Nil,Alarm raised,True,Dumai port control,
8553,2024-01-14,Name Withheld,Oil tanker,,In port area,21° 50.84' N,91° 41.84' E,D/O onboard an anchored tanker noticed a small...,Nil,"Alarm raised, and crew mustered",True,Port control and Coast Guard,
8554,2024-01-14,Name Withheld,Supply ship,,In port area,6° 05.00' S,12° 15.00' E,"Unnoticed, thieves boarded an anchored offshor...",Ship’s properties stolen,Nil,True,,


In [12]:
# Go forth and make Steve Urkel proud
piracy_df_imo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8556 entries, 0 to 8555
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         8556 non-null   datetime64[ns]
 1   Ship Name                    8409 non-null   object        
 2   Ship Type                    8206 non-null   object        
 3   IMO No.                      7213 non-null   object        
 4   Area                         8550 non-null   object        
 5   Latitude                     4537 non-null   object        
 6   Longitude                    4575 non-null   object        
 7   Incident details             8551 non-null   object        
 8   Consequences for crew etc    8068 non-null   object        
 9   Action taken by master/crew  8171 non-null   object        
 10  Reported?                    8556 non-null   bool          
 11  Reported to...               6705 non-null 

In [None]:
# Plot histogram of incidents over the years
fig, ax = plt.subplots()

piracy_df_imo['Date'].hist(bins=30, ax=ax)
plt.title('Histogram of Recorded Piracy Incidents (1994-2024)')
plt.xlabel('Year')
plt.ylabel('Frequency')

# Find keywords in incident details using Spacy
Reference: https://towardsdatascience.com/structured-natural-language-processing-with-pandas-and-spacy-7089e66d2b10

Ideas:
- Build categories (cats) and assign them to each noun chunk
    - Categories: hijacked, boarded, hostages, theft, ship missing, crew member abducted/kidnapped, ship fired upon, casualties
- Set extension as interesting with keywords in lemma column
- Add IMO No. to doc metadata (context): Can't find a way to efficiently do this
- Set date as index

## Let's train a Matcher object for rule-based matching

In [None]:
# Generate 500 random rows of training data for our categorization
# DON'T OVERWRITE OUR TRAINING DATA
# piracy_df_imo.sample(n=500).to_csv('./Data_Files/training_data.csv')

In [13]:
# Load language model, disable 'named entity recognition'
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [45]:
# Import training data
training_data = pd.read_csv('./Data_Files/training_data.csv')

# Set up flag columns for when the NLP happens
training_data['BOARDED'] = 0
training_data['HIJACKED'] = 0

# Display result
training_data.loc[:,['Boarded_label', 'Hijacked_label', 'Incident_details']]

Unnamed: 0,Boarded_label,Hijacked_label,Incident_details
0,0,0,A small wooden boat with four pirates approach...
1,0,0,Two robbers in a wooden boat came alongside an...
2,1,0,Three robbers armed with long knives boarded a...
3,0,0,A pirate boat attempted to board the ship whil...
4,0,0,Two boats were spotted near the anchor chain e...
...,...,...,...
495,1,0,A speedboat with five pirates wearing camoufla...
496,1,0,Two robbers boarded an anchored ship using a h...
497,1,1,Pirates attacked and hijacked the ship underwa...
498,1,0,Five robbers armed with guns boarded an anchor...


### Creating products for presentation

In [31]:
from pathlib import Path

In [61]:
# Display sentence dependencies
doc = nlp("On seeing crew alertness, the robbers jumped overboard and escaped empty handed in a waiting boat with one accomplice.")
svg = spacy.displacy.render(doc, style='dep', jupyter=False, 
                      options={'bg': 'white', 'compact':False, 'distance': 130, 
                               'fine_grained': True, 'font_size': 30})

# Output to file
output_path = Path("./Data_Files/Results/dependency_plot3.svg")
output_path.open("w", encoding="utf-8").write(svg)

15544

In [6]:
# Create matcher and docs. Place in training_df with bool values
matcher = generate_matcher(nlp)
docs = list(nlp.pipe(training_data.loc[:,'Incident_details']))
training_data = custom_matcher(training_data, docs, matcher)
# training_data.head(50).loc[:,['Boarded', 'BOARDED', 'Incident_details']]

### More products for presentation

In [7]:
# Test for presentation
test_sents = ["Three robbers in a boat boarded the ship, while at anchor, broke into ship's stores and when alert crew mustered, they escaped with their loot.", 
              "Incident reported to port authority through the local agents. Upon berthing, the police boarded the ship for investigation.", 
              "A pirate boat attempted to board the ship while underway.", "While underway, the master of the bulk carrier reported to Singapore VTIS that four perpetrators were sighted in the steering gear room. The master raised the alarm and mustered crew. A security search on board was conducted and at about 0345 hrs, the security search was completed and no perpetrators found.", "The duty watchman onboard the ship at anchor, noticed robbers trying to open the watertight door to the forecastle store. He immediately alerted the OOW who, in turn, raised the alarm and crew mustered. On seeing crew alertness, the robbers jumped overboard and escaped empty handed in a waiting boat with one accomplice."]
# test_docs = list(nlp.pipe(test_sents))
html_print(html_generator((nlp(i) for i in test_sents), matcher=matcher, n=len(test_sents)))

In [8]:
# Find all false negatives
false_negative = (training_data.loc[lambda d: d['Boarded_label'] == 1]
                               .loc[lambda d:       d['BOARDED'] == 0]['Incident_details'])

# Print all false negatives
print(f'{len(false_negative)} total false negatives.')
html_print(html_generator((nlp(i) for i in false_negative), matcher=matcher, n=len(false_negative)))

0 total false negatives.


In [9]:
# Find all false positives
false_positive = (training_data.loc[lambda d: d['Boarded_label'] == 0]
                               .loc[lambda d: d['BOARDED'] == 1]['Incident_details'])

# Print false positives using html_print and html_generator functions
print(f'{len(false_positive)} total false positives.')
html_print(html_generator((nlp(i) for i in false_positive), matcher=matcher, n=len(false_positive)))

4 total false positives.


In [10]:
# Make confusion matrix
confusion_matrix(training_data['Boarded_label'], training_data['BOARDED'])

array([[171,   4],
       [  0, 325]])

In [11]:
# Boarded performance report
print(classification_report(training_data['Boarded_label'], training_data['BOARDED']))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       175
           1       0.99      1.00      0.99       325

    accuracy                           0.99       500
   macro avg       0.99      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500


In [12]:
# Hijacked confusion matrix
confusion_matrix(training_data['Hijacked_label'], training_data['HIJACKED'])

array([[474,   3],
       [  0,  23]])

In [13]:
# Hijacked performance report
print(classification_report(training_data['Hijacked_label'], training_data['HIJACKED']))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       477
           1       0.88      1.00      0.94        23

    accuracy                           0.99       500
   macro avg       0.94      1.00      0.97       500
weighted avg       0.99      0.99      0.99       500


## Now let's use our trained matcher on the full piracy_df_imo database.

In [103]:
# Add necessary columns, initialize to zeros
piracy_df_imo['BOARDED'] = 0
piracy_df_imo['HIJACKED'] = 0

# Mask out null incident details and IMO No.s
# Null IMO Numbers are just observations and not actual incidents
msk = piracy_df_imo['Incident details'].notna() & piracy_df_imo['IMO No.'].notna()
piracy_df_imo_masked = piracy_df_imo[msk].copy()

# Create docs out of all the 
df_docs = list(nlp.pipe(piracy_df_imo_masked.loc[:,'Incident details']))

# Make the matcher
matcher = generate_matcher(nlp)

# Apply matcher to the database. Typically, takes ~33 seconds to execute
piracy_df_imo_masked = custom_matcher(piracy_df_imo_masked, df_docs, matcher)

KeyboardInterrupt: 

In [14]:
# Show result
piracy_df_imo_masked.loc[:,['Incident details','BOARDED', 'HIJACKED']]

NameError: name 'piracy_df_imo_masked' is not defined

Doesn't do very well on the full dataset...need to make better with a trained statistical model

## Training our own statistical model (this one does not work, keep scrolling to the next section)

In [126]:
# Instead, train model with labels I manually added
# Need to split boarded and hijack data
# Reference: https://medium.com/@johnidouglasmarangon/building-a-text-classification-model-with-spacy-3-x-57e59fa50547
shuffled_df = list(training_data[['Incident_details','Boarded_label', 'Hijacked_label']].sample(frac=1).itertuples(index=False, name=None))

# Split out data into training (75%), dev (15%), test (10%)
train_data = shuffled_boarded_df[:375]
dev_data = shuffled_boarded_df[375:450]
test_data = shuffled_boarded_df[450:500]

# Print lengths
print(f"Boarded Total: {len(shuffled_df)} - Train:  {len(train_data)} - Dev: {len(dev_data)} - Test: {len(test_data)}")

Boarded Total: 500 - Train:  375 - Dev: 75 - Test: 50


list

In [140]:
train = dev = test = 0
for tup in train_data:
    if tup[2] == 1:
        train += 1
for tup in dev_data:
    if tup[2] == 1:
        print(tup)
        dev += 1
for tup in test_data:
    if tup[2] == 1:
        test += 1
print(f'{train = }, {dev = }, {test = }')

('Robbers boarded and hijacked tug, while underway and threw overboard ten crewmembers. A passing tug rescued one crew member and the fate of the remaining nine crewmembers is unknown', 1, 1)
('Pirates hijacked the ship underway \ntaking 25 crewmembers hostage. The ship was sailed to the Somali coast', 1, 1)
train = 18, dev = 2, test = 3


In [128]:
# Reset nlp if it was changed
nlp = spacy.load('en_core_web_md')

# Transform data to binary file (DocBin) in Spacy_Files folder
def convert(data_list, outfile):
    db = spacy.tokens.DocBin()

    # Assign boarded/hijacked categories (cats) to docs
    for tup in data_list:
        # Convert text to doc
        doc = nlp(tup[0])
        
        # Assign cats based on tup and add to DocBin
        doc.cats["BOARDED"] = tup[1] == 1
        doc.cats["HIJACKED"] = tup[2] == 1
        db.add(doc)
        
    # Save DocBin at outfile's path
    db.to_disk(outfile)
    
# Convert all 3 datasets to DocBins
convert(train_data, './Spacy_Files/train.spacy')
print('Train saved')
convert(dev_data, './Spacy_Files/dev.spacy')
print('Dev saved')
convert(test_data, './Spacy_Files/test.spacy')
print('Test saved')

Train saved
Dev saved
Test saved


In [117]:
# Initialize config file for textcat-multilabel training
# !python -m spacy init config --pipeline textcat_multilabel --optimize accuracy --force ./Spacy_Files/config.cfg

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat_multilabel
- Optimize for: accuracy
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
Spacy_Files/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [129]:
# Initiate training using command line and config file downloaded from spacy.io
!python -m spacy train ./Spacy_Files/config.cfg --paths.train ./Spacy_Files/train.spacy --paths.dev ./Spacy_Files/dev.spacy --output ./Spacy_Files/model --verbose

[2024-03-15 16:00:41,385] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: Spacy_Files/model[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2024-03-15 16:00:41,527] [INFO] Set up nlp object from config
[2024-03-15 16:00:41,536] [DEBUG] Loading corpus from path: Spacy_Files/dev.spacy
[2024-03-15 16:00:41,538] [DEBUG] Loading corpus from path: Spacy_Files/train.spacy
[2024-03-15 16:00:41,538] [INFO] Pipeline: ['tok2vec', 'textcat_multilabel']
[2024-03-15 16:00:41,539] [INFO] Created vocabulary
[2024-03-15 16:00:42,350] [INFO] Added vectors: en_core_web_md
[2024-03-15 16:00:42,350] [INFO] Finished initializing nlp object
[2024-03-15 16:00:43,257] [INFO] Initialized pipeline components: ['tok2vec', 'textcat_multilabel']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2024-03-15 16:00:43,268] [DEBUG] Loading corpus from path: Spacy_Files/dev.spacy
[2024-03-15 16:00:43,269] [DEBU

In [130]:
!python -m spacy evaluate ./Spacy_Files/model/model-best/ ./Spacy_Files/test.spacy

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK                   100.00
TEXTCAT (macro AUC)   50.00 
SPEED                 2128  

[1m

               P        R       F
BOARDED    68.00   100.00   80.95
HIJACKED    0.00     0.00    0.00

[1m

           ROC AUC
BOARDED       0.50
HIJACKED      0.50



In [131]:
# Load our new model
nlp_custom = spacy.load('./Spacy_Files/model/model-best')

In [132]:
training_docs = list(nlp_custom.pipe(training_data.loc[:,'Incident_details']))

for doc in training_docs[:10]:
    print(doc.cats, '-', doc)

{'BOARDED': 1.0, 'HIJACKED': 2.06115369216775e-09} - A small wooden boat with four pirates approached the ship on the port side while underway. Another boat with another four pirates approached from the starboard side. When spotted both boats retreated
{'BOARDED': 1.0, 'HIJACKED': 2.06115369216775e-09} - Two robbers in a wooden boat came alongside an anchored ship. Duty crew noticed the robbers attempting to board the ship using a ladder and immediately informed the duty officer who raised the alarm. Crew mustered. Seeing the alerted crew, the robbers fled.
{'BOARDED': 1.0, 'HIJACKED': 2.06115369216775e-09} - Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS.

{'BOARDED': 1.0, 'HIJACKED': 2.06115369216775e-09} -

In [133]:
hijack_example = "Twelve pirates armed with rifles and pistols attacked the fishing vessel, while underway.  Five pirates boarded, hijacked the vessel and demanded payment.  Crew managed to overpower the pirates and handed them over to the Philippines National Police"
boarded_example = "Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS."
doc = nlp(hijack_example)
# doc = nlp(boarded_example)
print(doc.cats, '-', doc)

{} - Twelve pirates armed with rifles and pistols attacked the fishing vessel, while underway.  Five pirates boarded, hijacked the vessel and demanded payment.  Crew managed to overpower the pirates and handed them over to the Philippines National Police


#### This method did not work well. Too many false positives. Going back to the way the YouTube video describes.
Reference: https://www.youtube.com/watch?v=IqOJU1-_Fi0&t=1141s

In [158]:
# Reset nlp if it was changed
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [209]:
# Define function to take rules we already defined and turn it into the 'training format'
# Training format example: ('The pirates boarded.': {'characteristic': [(12, 19, 'BOARDED')]})
def parse_train_data(doc):
    detections = [(doc[start:end].start_char, doc[start:end].end_char, 
                   nlp.vocab.strings[match_id])
                  for match_id, start, end in matcher(doc)]
    return doc.text, {'entities': detections}

In [210]:
parse_train_data(nlp("Pirates hijacked the vessel taking 14 crew members hostage. Owners hired a private aircraft to search for the ship and spotted the ship heading North towards the Somali coast"))

('Pirates hijacked the vessel taking 14 crew members hostage. Owners hired a private aircraft to search for the ship and spotted the ship heading North towards the Somali coast',
 {'entities': [(8, 16, 'BOARDED'),
   (8, 16, 'HIJACKED'),
   (51, 58, 'BOARDED'),
   (51, 58, 'HOSTAGES_TAKEN')]})

In [211]:
doc = nlp("Pirates hijacked the vessel taking 14 crew members hostage. Owners hired a private aircraft to search for the ship and spotted the ship heading North towards the Somali coast")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], start, end)

BOARDED 1 2
HIJACKED 1 2
BOARDED 8 9
HOSTAGES_TAKEN 8 9


In [221]:
# Create a list of training data in the specified format
# Only train on ones that have matches
# TRAIN_DATA = [parse_train_data(d) for d in nlp.pipe(training_data.loc[:,'Incident_details'])
#               if len(matcher(d)) == 1]
TRAIN_DATA = [parse_train_data(d) 
                       for d in nlp.pipe(training_data.loc[:,'Incident_details']) 
                       if len(matcher(d)) >= 1]

In [222]:
TRAIN_DATA

[("Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS.\n",
  {'entities': [(25, 44, 'BOARDED'), (241, 257, 'BOARDED')]}),
 ('While underway, the duty engineer spotted five pirates armed with long knives onboard the vessel. They entered the engine room and tied up the Third Engineer. The duty engineer found the Third Engineer and raised the alarm. The crew was mustered and conducted a search onboard the vessel but the pirates had escaped.',
  {'entities': [(34, 54, 'BOARDED'), (71, 85, 'BOARDED')]}),
 ('Three robbers boarded the anchored product tanker. The alerted crew spotted the robbers. The alarm was raised and crew mustered. Upon seeing the crew alertness, the robbers fled without stealing anything.',
  {'ent

In [223]:
# Create a custom nlp with ner and our custom tags
def create_blank_nlp(train_data):
    nlp_blank = spacy.blank('en')
    ner = nlp_blank.create_pipe('ner')
    nlp_blank.add_pipe('ner', last=True)
    ner.add_label('BOARDED')
    ner.add_label('HIJACKED')
    # for _, annotations in train_data:
    #     for end in annotations.get('entities'):
    #         ner.add_label(end[2])
    return nlp_blank

In [224]:
# nlp_test = create_blank_nlp(TRAIN_DATA)

In [225]:
# Create training loop to train a new model
from datetime import datetime as dt
from spacy.util import minibatch, compounding
from spacy.training import Example

# nlp_ner = create_blank_nlp(TRAIN_DATA)
# optimizer = nlp_ner.begin_training()
# for i in range(20):
#     batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
#     losses = {}
#     for batch in batches:
#         texts, annotations = zip(*batch)
#         example = []
#         
#         for i in range(len(texts)):
#             doc = nlp_ner.make_doc(texts[i])
#             example.append(Example.from_dict(doc, annotations[i]))
#         
#         nlp_ner.update(
#             example,
#             drop=0.1,       # Dropout: makes it harder to memorize data
#             losses=losses
#         )
#     # Print result of each iteration
#     print(f'Losses at iteration {i}: {dt.now()}, {losses}')

ValueError: [E103] Trying to set conflicting doc.ents: '(0, 21, 'BOARDED')' and '(6, 21, 'BOARDED')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap. To work with overlapping entities, consider using doc.spans instead.

In [217]:
nlp_ner.pipeline

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2b5f89bd0>)]

In [218]:
doc = nlp_ner(boarded_example)
displacy.render(doc, style='ent')

In [219]:
hijack_example = "Pirates hijacked the vessel taking 14 crew members hostage. Owners hired a private aircraft to search for the ship and spotted the ship heading North towards the Somali coast"
doc = nlp_ner(hijack_example)
displacy.render(doc, style='ent')



### Another pivot to SpanCat.
Ran into a problem where entities couldn't overlap. We need them to be able to overlap because 'hijacking' also means the ship was 'boarded'.
And here.we.go.

In [240]:
shuffled_df = list(training_data[['Incident_details','Boarded_label', 'Hijacked_label']]
                   .sample(frac=1).itertuples(index=False, name=None))

# Split out data into training (75%), dev (15%), test (10%)
train_data = shuffled_df[:375]
dev_data = shuffled_df[375:450]
test_data = shuffled_df[450:500]

# Print lengths
print(f"Boarded Total: {len(shuffled_df)} - Train:  {len(train_data)} - Dev: {len(dev_data)} - Test: {len(test_data)}")

Boarded Total: 500 - Train:  375 - Dev: 75 - Test: 50


In [241]:
# Make sure we have enough hijacked examples in each category
train = dev = test = 0
for tup in train_data:
    if tup[2] == 1:
        train += 1
for tup in dev_data:
    if tup[2] == 1:
        # print(tup)
        dev += 1
for tup in test_data:
    if tup[2] == 1:
        test += 1
print(f'{train = }, {dev = }, {test = }')

('Robbers boarded and hijacked tug, while underway and threw overboard ten crewmembers. A passing tug rescued one crew member and the fate of the remaining nine crewmembers is unknown', 1, 1)
('Pirates hijacked the ship underway \ntaking 25 crewmembers hostage. The ship was sailed to the Somali coast', 1, 1)
train = 18, dev = 2, test = 3


In [245]:
# Add spans with our custom tags to all our texts
# Texts must be doc objects then we can add spans
# Transform data to binary file (DocBin) in Spacy_Files folder
from spacy.tokens import Span

def convert(data_list, outfile):
    db = spacy.tokens.DocBin()

    # Assign boarded/hijacked categories (cats) to docs
    for tup in data_list:
        # Convert text to doc
        doc = nlp(tup[0])

        # Assign span cats based on matcher
        matches = matcher(doc)
        
        # Add spans with labels to doc
        doc.spans['sc'] = [Span(doc, start, end, label=nlp.vocab.strings[match_id]) 
                           for match_id, start, end in matches]
        
        # Print resulting spans to test
        # for span in doc.spans['sc']:
        #     print(span.text, span.start, span.label_)
        
        db.add(doc)

    # Save DocBin at outfile's path
    db.to_disk(outfile)
    # return db

# Test function
# convert(train_data, '')

# Convert all 3 datasets to DocBins
# convert(train_data, './Spacy_Files/train.spacy')
# print('Train saved')
# convert(dev_data, './Spacy_Files/dev.spacy')
# print('Dev saved')
# convert(test_data, './Spacy_Files/test.spacy')
# print('Test saved')

Train saved
Dev saved
Test saved


In [None]:
# Initialize config file for spancat training
# !python -m spacy init fill-config ./Spacy_Files/sc_base_config.cfg ./Spacy_Files/sc_config.cfg --diff

In [250]:
# Initiate training using command line and config file downloaded from spacy.io
# !python -m spacy train ./Spacy_Files/sc_config.cfg --paths.train ./Spacy_Files/train.spacy --paths.dev ./Spacy_Files/dev.spacy --output ./Spacy_Files/model --verbose

[2024-03-17 10:55:28,808] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: Spacy_Files/model[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[2024-03-17 10:55:28,948] [INFO] Set up nlp object from config
[2024-03-17 10:55:28,957] [DEBUG] Loading corpus from path: Spacy_Files/dev.spacy
[2024-03-17 10:55:28,959] [DEBUG] Loading corpus from path: Spacy_Files/train.spacy
[2024-03-17 10:55:28,959] [INFO] Pipeline: ['tok2vec', 'spancat']
[2024-03-17 10:55:28,961] [INFO] Created vocabulary
[2024-03-17 10:55:29,762] [INFO] Added vectors: en_core_web_md
[2024-03-17 10:55:29,762] [INFO] Finished initializing nlp object
[2024-03-17 10:55:31,293] [INFO] Initialized pipeline components: ['tok2vec', 'spancat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2024-03-17 10:55:31,305] [DEBUG] Loading corpus from path: Spacy_Files/dev.spacy
[2024-03-17 10:55:31,306] [DEBUG] Loading corpus from

In [251]:
# Evaluate the model based on the test data
# !python -m spacy evaluate ./Spacy_Files/model/model-best/ ./Spacy_Files/test.spacy

[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK      100.00
SPAN P   97.30 
SPAN R   92.31 
SPAN F   94.74 
SPEED    1778  

[1m

                      P        R        F
BOARDED           96.88    92.54    94.66
HOSTAGES_TAKEN   100.00    87.50    93.33
HIJACKED         100.00   100.00   100.00



In [15]:
# Load our new model
nlp_custom = spacy.load('./Spacy_Files/model/model-best')

In [16]:
displacy.render(nlp_custom(hijack_example), style='span')

NameError: name 'hijack_example' is not defined

In [254]:
displacy.render(nlp_custom(boarded_example), style='span')

In [26]:
none_example = "Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS."
displacy.render(nlp_custom(none_example), style='span')

In [46]:
# Apply new nlp to our data and put into dataframe using model_interpreter function
training_data = model_interpreter(training_data, 'Incident_details', nlp_custom)
training_data.head(10)

True


Unnamed: 0,Index,IMO No.,Boarded_label,Hijacked_label,Hostages_taken_label,Crew_assaulted_label,Interesting_label,Incident_details,BOARDED,HIJACKED,HOSTAGES_TAKEN,CREW_ASSAULTED
0,1867,9138264.0,0,0,0.0,0.0,0,A small wooden boat with four pirates approach...,0,0,0,0
1,7516,9439113.0,0,0,,,1,Two robbers in a wooden boat came alongside an...,0,0,0,0
2,8414,9498949.0,1,0,,,0,Three robbers armed with long knives boarded a...,1,0,0,0
3,1027,9019030.0,0,0,,,1,A pirate boat attempted to board the ship whil...,0,0,0,0
4,1794,9003392.0,0,0,,,0,Two boats were spotted near the anchor chain e...,0,0,0,0
5,6553,9295270.0,1,0,,,1,"While underway, the duty engineer spotted five...",0,0,0,0
6,7267,9428683.0,1,0,,,0,Three robbers boarded the anchored product tan...,1,0,0,0
7,1276,8818207.0,0,0,,,1,Pirates in two speed boats attempted to board ...,0,0,0,0
8,1448,8018015.0,1,0,,,0,Five robbers armed with long knives and guns b...,1,0,0,0
9,3547,,1,1,,,0,Twelve pirates armed with rifles and pistols a...,1,1,0,0


In [41]:
# Find all false negatives
false_negative = (training_data.loc[lambda d: d['Boarded_label'] == 1]
                               .loc[lambda d: d['BOARDED'] == 0]['Incident_details'])

# Print all false negatives
print(f'{len(false_negative)} total false negatives.')
html_print(html_generator((nlp(i) for i in false_negative), matcher=matcher, n=len(false_negative)))

1 total false negatives.


NameError: name 'matcher' is not defined

In [ ]:
# Find all false positives
false_positive = (training_data.loc[lambda d: d['Boarded_label'] == 0]
                               .loc[lambda d: d['BOARDED'] == 1]['Incident_details'])

# Print false positives using html_print and html_generator functions
print(f'{len(false_positive)} total false positives.')
html_print(html_generator((nlp(i) for i in false_positive), matcher=matcher, n=len(false_positive)))

In [47]:
# Make confusion matrix
confusion_matrix(training_data['Boarded_label'], training_data['BOARDED'])

array([[173,   2],
       [ 17, 308]])

In [48]:
# Boarded performance report
print(classification_report(training_data['Boarded_label'], training_data['BOARDED']))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       175
           1       0.99      0.95      0.97       325

    accuracy                           0.96       500
   macro avg       0.95      0.97      0.96       500
weighted avg       0.96      0.96      0.96       500


In [49]:
# Hijacked confusion matrix
confusion_matrix(training_data['Hijacked_label'], training_data['HIJACKED'])

array([[474,   3],
       [  1,  22]])

In [50]:
# Hijacked performance report
print(classification_report(training_data['Hijacked_label'], training_data['HIJACKED']))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       477
           1       0.88      0.96      0.92        23

    accuracy                           0.99       500
   macro avg       0.94      0.98      0.96       500
weighted avg       0.99      0.99      0.99       500


In [51]:
piracy_df_imo

Unnamed: 0,Date,Ship Name,Ship Type,IMO No.,Area,Latitude,Longitude,Incident details,Consequences for crew etc,Action taken by master/crew,Reported?,Reported to...,Reporting State
0,1994-07-22,PAVELS STERNBERGS,Reefer,7362366,In territorial waters,,,Ship boarded by seven men armed with big cable...,Deck watchman was slightly wounded and some sh...,Chief officer and other crew members came to t...,True,Incident reported to Port Authorities,Latvia
1,1994-09-09,BONSELLA,,,In territorial waters,,,Twenty-six bandits posing as Coast Guard hijac...,Ship's cargo and money stolen,-,True,Yes,United States
2,1994-10-23,SIBOELF,Ore/Bulk/oil carrier,9011935,In port area,,,"6-7 pirates wearing masks, armed with pistols ...",Personal belongings and cash stolen from crew,The watchman saw the pirates and informed term...,True,Terminal informed,Norway
3,1994-10-26,TROPICAL SUN,,,In port area,,,Ship attacked with mortar shells which fell ab...,,-,True,Yes,United States
4,1994-11-17,ANOMIS,,7233711,In territorial waters,,,Boat opened fire on ship after trying unsucces...,-,-,True,Yes,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8551,2024-01-09,CMB Chikako,Bulk carrier,9701190,In territorial waters,1° 03.00' N,103° 39.59' E,"Five robbers armed with a knife, boarded a shi...",The robbers took hostage and tied up one of th...,Alarm raised and crew mustered,True,VTIS Singapore,
8552,2024-01-12,Solar Roma,Product tanker,9887372,In port area,1° 43.29' N,101° 25.72' E,Duty security patrol onboard an anchored tanke...,Nil,Alarm raised,True,Dumai port control,
8553,2024-01-14,Name Withheld,Oil tanker,,In port area,21° 50.84' N,91° 41.84' E,D/O onboard an anchored tanker noticed a small...,Nil,"Alarm raised, and crew mustered",True,Port control and Coast Guard,
8554,2024-01-14,Name Withheld,Supply ship,,In port area,6° 05.00' S,12° 15.00' E,"Unnoticed, thieves boarded an anchored offshor...",Ship’s properties stolen,Nil,True,,


In [65]:
# Apply our nlp to the entire dataset
piracy_df_imo = model_interpreter(piracy_df_imo, 'Incident details', nlp_custom)
piracy_df_imo

True


Unnamed: 0,Date,Ship Name,Ship Type,IMO No.,Area,Latitude,Longitude,Incident details,Consequences for crew etc,Action taken by master/crew,Reported?,Reported to...,Reporting State,BOARDED,HIJACKED,HOSTAGES_TAKEN,CREW_ASSAULTED
0,1994-07-22,PAVELS STERNBERGS,Reefer,7362366,In territorial waters,,,Ship boarded by seven men armed with big cable...,Deck watchman was slightly wounded and some sh...,Chief officer and other crew members came to t...,True,Incident reported to Port Authorities,Latvia,0,0,0,0
1,1994-09-09,BONSELLA,,,In territorial waters,,,Twenty-six bandits posing as Coast Guard hijac...,Ship's cargo and money stolen,-,True,Yes,United States,1,1,0,0
2,1994-10-23,SIBOELF,Ore/Bulk/oil carrier,9011935,In port area,,,"6-7 pirates wearing masks, armed with pistols ...",Personal belongings and cash stolen from crew,The watchman saw the pirates and informed term...,True,Terminal informed,Norway,0,0,0,0
3,1994-10-26,TROPICAL SUN,,,In port area,,,Ship attacked with mortar shells which fell ab...,,-,True,Yes,United States,0,0,0,0
4,1994-11-17,ANOMIS,,7233711,In territorial waters,,,Boat opened fire on ship after trying unsucces...,-,-,True,Yes,United States,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8551,2024-01-09,CMB Chikako,Bulk carrier,9701190,In territorial waters,1° 03.00' N,103° 39.59' E,"Five robbers armed with a knife, boarded a shi...",The robbers took hostage and tied up one of th...,Alarm raised and crew mustered,True,VTIS Singapore,,1,0,1,0
8552,2024-01-12,Solar Roma,Product tanker,9887372,In port area,1° 43.29' N,101° 25.72' E,Duty security patrol onboard an anchored tanke...,Nil,Alarm raised,True,Dumai port control,,0,0,0,0
8553,2024-01-14,Name Withheld,Oil tanker,,In port area,21° 50.84' N,91° 41.84' E,D/O onboard an anchored tanker noticed a small...,Nil,"Alarm raised, and crew mustered",True,Port control and Coast Guard,,0,0,0,0
8554,2024-01-14,Name Withheld,Supply ship,,In port area,6° 05.00' S,12° 15.00' E,"Unnoticed, thieves boarded an anchored offshor...",Ship’s properties stolen,Nil,True,,,1,0,0,0


In [66]:
piracy_df_imo.loc[:, ['Incident details', 'BOARDED', 'HIJACKED', 'HOSTAGES_TAKEN', 'CREW_ASSAULTED']]

Unnamed: 0,Incident details,BOARDED,HIJACKED,HOSTAGES_TAKEN,CREW_ASSAULTED
0,Ship boarded by seven men armed with big cable...,0,0,0,0
1,Twenty-six bandits posing as Coast Guard hijac...,1,1,0,0
2,"6-7 pirates wearing masks, armed with pistols ...",0,0,0,0
3,Ship attacked with mortar shells which fell ab...,0,0,0,0
4,Boat opened fire on ship after trying unsucces...,0,0,0,0
...,...,...,...,...,...
8551,"Five robbers armed with a knife, boarded a shi...",1,0,1,0
8552,Duty security patrol onboard an anchored tanke...,0,0,0,0
8553,D/O onboard an anchored tanker noticed a small...,0,0,0,0
8554,"Unnoticed, thieves boarded an anchored offshor...",1,0,0,0


In [68]:
hijack_example = "Twelve pirates armed with rifles and pistols attacked the fishing vessel, while underway.  Five pirates boarded, hijacked the vessel and demanded payment.  Crew managed to overpower the pirates and handed them over to the Philippines National Police"
boarded_example = "Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS."
hijack_doc = nlp_custom(hijack_example)
boarded_doc = nlp_custom(boarded_example)

In [69]:
displacy.render([hijack_doc, boarded_doc], style='span')