# Notebook for doing pirate stuff with Pandas Dataframes

In [None]:
# Install spacy and language model (uncomment if needed)
# !conda install -c conda-forge spacy -y
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !conda install torchvision -y
# !conda install conda-forge::cupy -y

In [167]:
# Import statements
# import os
import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
import spacy
# from spacy.matcher import Matcher
from spacy import displacy
from sklearn.metrics import confusion_matrix, classification_report
from IPython.display import HTML as html_print
from spacy_model import generate_matcher, html_generator, custom_matcher

### Original Data that Mike found:

In [82]:
# Read clean csv
piracy_df_original = pd.read_csv('Data_Files/[Clean] IMO Piracy - 2000 to 2022 (PDV 01-2023).csv')

# Convert dates to datetime object
piracy_df_original['Incident Date'] = pd.to_datetime(piracy_df_original['Incident Date'], format='%m/%d/%Y')

# pd.concat([piracy_df_original.head(), piracy_df_original.tail()])
piracy_df_original

Unnamed: 0,Incident Date,Ship Name,Ship Flag,Ship Type,Area,Latitude,Longitude,Consequences to Crew,Part of Ship Raided,Ship Status,Weapons Used,Flag - Crew Injuries,Flag - Crew Held Hostage,Flag - Crew Missing,Flag - Crew Deaths,Flag - Crew Assaulted
0,2010-03-18,AL-ASA'A,Yemen,Dhow,In international waters,,,Ship Hijacked,Not Stated,Not Stated,None or Not Reported,False,True,False,False,False
1,2010-05-25,AL JAWAT,Yemen,Dhow,In international waters,,,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,False,False,False,False
2,2011-02-13,AL FARDOUS,Yemen,Fishing vessel,In territorial waters,,,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,False,False,False,False
3,2011-04-16,ABDI KHAN,Yemen,Fishing vessel,In international waters,11.900000,54.083333,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,True,False,False,False
4,2012-01-14,AL WASIL,Yemen,Dhow,In international waters,,,Ship Hijacked,Not Stated,Steaming,None or Not Reported,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4809,2009-12-30,GULF ELAN,Bahamas,Chemical tanker,In port area,22.690000,113.696667,No Consequences or Not Stated,Engine Room,At Anchor,None or Not Reported,False,False,False,False,False
4810,2008-11-07,CEC FUTURE,Bahamas,General cargo ship,In international waters,12.766667,45.933333,Ship Hijacked,Engine Room,Steaming,None or Not Reported,False,True,False,False,False
4811,2006-02-13,ASPEN ARROW,Bahamas,General cargo ship,In port area,,,No Consequences or Not Stated,Engine Room,At Anchor,None or Not Reported,False,False,False,False,False
4812,2009-10-24,ELLEN S,Antigua and Barbuda,Container ship,In territorial waters,20.641667,106.880000,Not Reported,Engine Room,At Anchor,None or Not Reported,False,False,False,False,False


In [31]:
# Go forth and do great things

### New data straight from the International Maritime Organization:
Note: This data is uncleaned and is dirtier than the decks of the Flying Dutchman.

In [182]:
# Read dirty csv
piracy_df_imo = pd.read_csv('Data_Files/[Dirty]_ListOfIncidents_IMO.csv')

# Drop columns we don't need
piracy_df_imo.drop(columns=['Boarded?', 'MSC/Circ', 'Coastal State Action Taken'], inplace=True)

# Convert Date column to DateTime Objects
piracy_df_imo['Date'] = pd.to_datetime(piracy_df_imo['Date'], format='%m/%d/%y')

# Show whatever you want
# pd.concat([piracy_df_imo.head(), piracy_df_imo.tail()])
piracy_df_imo

Unnamed: 0,Date,Ship Name,Ship Type,IMO No.,Area,Latitude,Longitude,Incident details,Consequences for crew etc,Action taken by master/crew,Reported?,Reported to...,Reporting State
0,1994-07-22,PAVELS STERNBERGS,Reefer,7362366,In territorial waters,,,Ship boarded by seven men armed with big cable...,Deck watchman was slightly wounded and some sh...,Chief officer and other crew members came to t...,True,Incident reported to Port Authorities,Latvia
1,1994-09-09,BONSELLA,,,In territorial waters,,,Twenty-six bandits posing as Coast Guard hijac...,Ship's cargo and money stolen,-,True,Yes,United States
2,1994-10-23,SIBOELF,Ore/Bulk/oil carrier,9011935,In port area,,,"6-7 pirates wearing masks, armed with pistols ...",Personal belongings and cash stolen from crew,The watchman saw the pirates and informed term...,True,Terminal informed,Norway
3,1994-10-26,TROPICAL SUN,,,In port area,,,Ship attacked with mortar shells which fell ab...,,-,True,Yes,United States
4,1994-11-17,ANOMIS,,7233711,In territorial waters,,,Boat opened fire on ship after trying unsucces...,-,-,True,Yes,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8551,2024-01-09,CMB Chikako,Bulk carrier,9701190,In territorial waters,1° 03.00' N,103° 39.59' E,"Five robbers armed with a knife, boarded a shi...",The robbers took hostage and tied up one of th...,Alarm raised and crew mustered,True,VTIS Singapore,
8552,2024-01-12,Solar Roma,Product tanker,9887372,In port area,1° 43.29' N,101° 25.72' E,Duty security patrol onboard an anchored tanke...,Nil,Alarm raised,True,Dumai port control,
8553,2024-01-14,Name Withheld,Oil tanker,,In port area,21° 50.84' N,91° 41.84' E,D/O onboard an anchored tanker noticed a small...,Nil,"Alarm raised, and crew mustered",True,Port control and Coast Guard,
8554,2024-01-14,Name Withheld,Supply ship,,In port area,6° 05.00' S,12° 15.00' E,"Unnoticed, thieves boarded an anchored offshor...",Ship’s properties stolen,Nil,True,,


In [183]:
# Go forth and make Steve Urkel proud
piracy_df_imo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8556 entries, 0 to 8555
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Date                         8556 non-null   datetime64[ns]
 1   Ship Name                    8409 non-null   object        
 2   Ship Type                    8206 non-null   object        
 3   IMO No.                      7213 non-null   object        
 4   Area                         8550 non-null   object        
 5   Latitude                     4537 non-null   object        
 6   Longitude                    4575 non-null   object        
 7   Incident details             8551 non-null   object        
 8   Consequences for crew etc    8068 non-null   object        
 9   Action taken by master/crew  8171 non-null   object        
 10  Reported?                    8556 non-null   bool          
 11  Reported to...               6705 non-null 

In [None]:
# Plot histogram of incidents over the years
fig, ax = plt.subplots()

piracy_df_imo['Date'].hist(bins=30, ax=ax)
plt.title('Histogram of Recorded Piracy Incidents (1994-2024)')
plt.xlabel('Year')
plt.ylabel('Frequency')

# Find keywords in incident details using Spacy
Reference: https://towardsdatascience.com/structured-natural-language-processing-with-pandas-and-spacy-7089e66d2b10

Ideas:
- Build categories (cats) and assign them to each noun chunk
    - Categories: hijacked, boarded, hostages, theft, ship missing, crew member abducted/kidnapped, ship fired upon, casualties
- Set extension as interesting with keywords in lemma column
- Add IMO No. to doc metadata (context): Can't find a way to efficiently do this
- Set date as index

## Let's train a Matcher object for rule-based matching

In [ ]:
# Generate 500 random rows of training data for our categorization
# DON'T OVERWRITE OUR TRAINING DATA

# piracy_df_imo.sample(n=500).to_csv('./Data_Files/training_data.csv')

In [ ]:
# Load language model, disable 'named entity recognition'
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [168]:
# Import training data
training_data = pd.read_csv('./Data_Files/training_data.csv')

# Set up flag columns for when the NLP happens
training_data['BOARDED'] = 0
training_data['HIJACKED'] = 0

# Display result
training_data.loc[:,['Boarded_label', 'Hijacked_label', 'Incident_details']]

Unnamed: 0,Boarded_label,Hijacked_label,Incident_details
0,0,0,A small wooden boat with four pirates approach...
1,0,0,Two robbers in a wooden boat came alongside an...
2,1,0,Three robbers armed with long knives boarded a...
3,0,0,A pirate boat attempted to board the ship whil...
4,0,0,Two boats were spotted near the anchor chain e...
...,...,...,...
495,1,0,A speedboat with five pirates wearing camoufla...
496,1,0,Two robbers boarded an anchored ship using a h...
497,1,1,Pirates attacked and hijacked the ship underwa...
498,1,0,Five robbers armed with guns boarded an anchor...


In [169]:
# Create matcher and docs. Place in training_df with bool values
matcher = generate_matcher(nlp)
docs = list(nlp.pipe(training_data.loc[:,'Incident_details']))
training_data = custom_matcher(training_data, docs, matcher)
# training_data.head(50).loc[:,['Boarded', 'BOARDED', 'Incident_details']]

In [170]:
# Find all false negatives
false_negative = (training_data.loc[lambda d: d['Boarded_label'] == 1]
                               .loc[lambda d:       d['BOARDED'] == 0]['Incident_details'])

# Print all false negatives
print(f'{len(false_negative)} total false negatives.')
html_print(html_generator((nlp(i) for i in false_negative), matcher=matcher, n=len(false_negative)))

0 total false negatives.


In [171]:
# Find all false positives
false_positive = (training_data.loc[lambda d: d['Boarded_label'] == 0]
                               .loc[lambda d: d['BOARDED'] == 1]['Incident_details'])

# Print false positives using html_print and html_generator functions
print(f'{len(false_positive)} total false positives.')
html_print(html_generator((nlp(i) for i in false_positive), matcher=matcher, n=len(false_positive)))

4 total false positives.


In [172]:
# Make confusion matrix
confusion_matrix(training_data['Boarded_label'], training_data['BOARDED'])

array([[171,   4],
       [  0, 325]])

In [173]:
# Boarded performance report
print(classification_report(training_data['Boarded_label'], training_data['BOARDED']))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       175
           1       0.99      1.00      0.99       325

    accuracy                           0.99       500
   macro avg       0.99      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500


In [174]:
# Hijacked confusion matrix
confusion_matrix(training_data['Hijacked_label'], training_data['HIJACKED'])

array([[474,   3],
       [  0,  23]])

In [189]:
# Hijacked performance report
print(classification_report(training_data['Hijacked_label'], training_data['HIJACKED']))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       477
           1       0.88      1.00      0.94        23

    accuracy                           0.99       500
   macro avg       0.94      1.00      0.97       500
weighted avg       0.99      0.99      0.99       500


## Now let's use our trained matcher on the full piracy_df_imo database.

In [191]:
# Add necessary columns, initialize to zeros
piracy_df_imo['BOARDED'] = 0
piracy_df_imo['HIJACKED'] = 0

# Mask out null incident details and IMO No.s
# Null IMO Numbers are just observations and not actual incidents
msk = piracy_df_imo['Incident details'].notna() & piracy_df_imo['IMO No.'].notna()
piracy_df_imo_masked = piracy_df_imo[msk].copy()

# Create docs out of all the 
df_docs = list(nlp.pipe(piracy_df_imo_masked.loc[:,'Incident details']))

# Make the matcher
matcher = generate_matcher(nlp)

# Apply matcher to the database. Typically takes ~33 seconds to execute
piracy_df_imo_masked = custom_matcher(piracy_df_imo_masked, df_docs, matcher)

In [188]:
# Show result
piracy_df_imo_masked.loc[:,['Incident details','BOARDED', 'HIJACKED']]

Unnamed: 0,Incident details,BOARDED,HIJACKED
0,Ship boarded by seven men armed with big cable...,0.0,0.0
2,"6-7 pirates wearing masks, armed with pistols ...",0.0,0.0
4,Boat opened fire on ship after trying unsucces...,1.0,0.0
6,"Chased aggressively by a high speed unlit, uni...",1.0,0.0
7,"Whilst awaiting pilot, two men boarded the shi...",1.0,0.0
...,...,...,...
7193,,1.0,0.0
7194,,0.0,0.0
7198,,1.0,1.0
7199,,1.0,0.0


Doesn't do very well on the full dataset...need to make better

## Other things