# Notebook for building and testing a Token-Based Matcher and SpanCat spaCy Model

In [None]:
# Set up autoreload for our modules in the spacy_model.py file
%load_ext autoreload
%autoreload 2

In [None]:
# Install spacy and language model (uncomment if needed)
# !conda install -c conda-forge spacy -y
# !python -m spacy download en_core_web_md
# !conda install torchvision -y
# !conda install conda-forge::cupy -y

In [None]:
# Import statements
import pandas as pd
import spacy
from spacy import displacy
from sklearn.metrics import confusion_matrix, classification_report
from IPython.display import HTML as html_print
from spacy_model import generate_matcher, html_generator, custom_matcher, model_interpreter
from pathlib import Path
from spacy.tokens import Span

### Dirty data straight from the International Maritime Organization:
Note: This data is uncleaned and is dirtier than the decks of the Flying Dutchman.

In [None]:
# Read the raw (dirty) IMO incident list
piracy_dirty_path = Path('Data_Files/[Dirty]_ListOfIncidents_IMO.csv')
piracy_df_imo = pd.read_csv(piracy_dirty_path)

# Drop columns we don't need (returns a new frame rather than mutating in place)
piracy_df_imo = piracy_df_imo.drop(columns=['Boarded?', 'MSC/Circ', 'Coastal State Action Taken'])

# Keep only rows that actually have incident details.
# Take an explicit copy: the Date assignment below (and the flag columns added in later
# cells) would otherwise write to a slice of the raw frame, which is what triggers
# pandas' SettingWithCopyWarning.
piracy_df_imo = piracy_df_imo[piracy_df_imo['Incident details'].notna()].copy()

# Convert Date column to DateTime objects
piracy_df_imo['Date'] = pd.to_datetime(piracy_df_imo['Date'], format='%m/%d/%y')

# Show whatever you want
# pd.concat([piracy_df_imo.head(), piracy_df_imo.tail()])
piracy_df_imo.head(10)

## Find keywords in incident details using spaCy's token-based Matcher
Reference: https://towardsdatascience.com/structured-natural-language-processing-with-pandas-and-spacy-7089e66d2b10

In [None]:
# Location of the training-data csv that the cells below read.
# The commented-out line is what generates it: 500 random rows sampled from piracy_df_imo.
# DON'T OVERWRITE OUR TRAINING DATA -- keep that line commented out so the existing
# file is not replaced with a fresh random sample.
training_data_path = Path('./Data_Files/training_data.csv')
# piracy_df_imo.sample(n=500).to_csv(training_data_path)

In [None]:
# Load the en_core_web_md language model with its 'ner' (named entity recognition)
# component disabled. `nlp` is the pipeline used for all token-matcher work below.
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [None]:
# Read the training sample and add the two flag columns (zeroed) that the NLP step fills in
training_data = pd.read_csv(training_data_path).assign(BOARDED=0, HIJACKED=0)

# Preview the label columns next to the incident text
training_data[['Boarded_label', 'Hijacked_label', 'Incident_details']].head(10)

In [None]:
# Build the token matcher and parse every incident narrative into a spaCy Doc
matcher = generate_matcher(nlp)
docs = list(nlp.pipe(training_data['Incident_details']))

# Record the tags the matcher finds in training_data, then compare the first rows
# against the manual Boarded_label
training_data = custom_matcher(training_data, docs, matcher)
training_data[['Boarded_label', 'BOARDED', 'Incident_details']].head(10)

In [None]:
# False negatives: labelled as boarded (Boarded_label == 1) but BOARDED was left at 0
false_negative = training_data.loc[
    (training_data['Boarded_label'] == 1) & (training_data['BOARDED'] == 0),
    'Incident_details',
]

# Report the count and render every false negative with the matcher's highlights
print(f'{len(false_negative)} total false negatives.')
html_print(html_generator((nlp(text) for text in false_negative), matcher=matcher, n=len(false_negative)))

In [None]:
# False positives: labelled as not boarded (Boarded_label == 0) but BOARDED was set to 1
false_positive = training_data.loc[
    (training_data['Boarded_label'] == 0) & (training_data['BOARDED'] == 1),
    'Incident_details',
]

# Report the count and render every false positive with the matcher's highlights
print(f'{len(false_positive)} total false positives.')
html_print(html_generator((nlp(text) for text in false_positive), matcher=matcher, n=len(false_positive)))

In [None]:
# Boarded confusion matrix: manual label is the truth, matcher flag is the prediction
confusion_matrix(y_true=training_data['Boarded_label'], y_pred=training_data['BOARDED'])

In [None]:
# Boarded performance report: manual label is the truth, matcher flag is the prediction
print(classification_report(y_true=training_data['Boarded_label'], y_pred=training_data['BOARDED']))

In [None]:
# Hijacked confusion matrix: manual label is the truth, matcher flag is the prediction
confusion_matrix(y_true=training_data['Hijacked_label'], y_pred=training_data['HIJACKED'])

In [None]:
# Hijacked performance report: manual label is the truth, matcher flag is the prediction
print(classification_report(y_true=training_data['Hijacked_label'], y_pred=training_data['HIJACKED']))

## Now let's use our trained matcher on the full piracy_df_imo database.
Takes about 30 seconds to run.

In [None]:
# Start both flag columns at zero on the full frame
piracy_df_imo['BOARDED'] = 0
piracy_df_imo['HIJACKED'] = 0

# Keep only rows that have both incident details and an IMO number.
# Rows with a null IMO number are just observations, not actual incidents.
msk = piracy_df_imo['Incident details'].notna() & piracy_df_imo['IMO No.'].notna()
piracy_df_imo_masked = piracy_df_imo.loc[msk].copy()

# Parse every remaining incident narrative into a spaCy Doc
df_docs = list(nlp.pipe(piracy_df_imo_masked['Incident details']))

# Build the matcher
matcher = generate_matcher(nlp)

# Run the matcher over the masked frame. Typically takes ~33 seconds to execute
piracy_df_imo_masked = custom_matcher(piracy_df_imo_masked, df_docs, matcher)

In [None]:
# Show the incident text next to the two matcher flags
piracy_df_imo_masked[['Incident details', 'BOARDED', 'HIJACKED']]

The token-based matcher doesn't do very well on the full dataset, so we need to improve on it with a trained statistical model.

### Create and train a Span Categorization model that will detect and label spans with boarded, hijacked, assaulted, hostages taken

In [None]:
# Shuffle the data frame and output tuples with (incident text, boarded_label, hijacked_label)
# These labels are ones that we manually read and put into the training data.
# No random_state is passed to sample(), so rerunning this cell gives a new shuffle.
shuffled_df = list(training_data[['Incident_details','Boarded_label', 'Hijacked_label']]
                   .sample(frac=1).itertuples(index=False, name=None))

# Split out data into training (75%), dev (15%), test (10%).
# The cut points are derived from the number of rows with integer arithmetic, so the
# split no longer assumes exactly 500 rows (for 500 rows they are still 375 and 450).
n_rows = len(shuffled_df)
train_end = n_rows * 75 // 100
dev_end = n_rows * 90 // 100
train_data = shuffled_df[:train_end]
dev_data = shuffled_df[train_end:dev_end]
test_data = shuffled_df[dev_end:]

# Print lengths
print(f"Boarded Total: {len(shuffled_df)} - Train:  {len(train_data)} - Dev: {len(dev_data)} - Test: {len(test_data)}")

In [None]:
# Count the hijacked examples (third tuple element == 1) that landed in each split.
# If any split has too few, rerun the cell above to reshuffle.
train = sum(1 for tup in train_data if tup[2] == 1)
dev = sum(1 for tup in dev_data if tup[2] == 1)
test = sum(1 for tup in test_data if tup[2] == 1)
print(f'{train = }, {dev = }, {test = }')

In [None]:
def convert(data_list, outfile):
    """Turn a list of data tuples into a spaCy DocBin and write it to disk.

    For every tuple, the first element (the incident text) is parsed with the
    notebook-level `nlp` pipeline, the notebook-level `matcher` is run over the
    resulting doc, and each match becomes a Span labelled with the matched
    pattern's name. The spans are stored under the 'sc' key of `doc.spans`.
    Only the first element of each tuple is read here.

    Args:
        data_list: iterable of tuples whose first element is the text to parse.
        outfile: path (str or Path) the DocBin is saved to.
    """
    doc_bin = spacy.tokens.DocBin()

    for record in data_list:
        # Text must become a Doc before spans can be attached to it
        doc = nlp(record[0])

        # One labelled span per match; the label is the string behind the match id
        labelled_spans = []
        for match_id, start, end in matcher(doc):
            labelled_spans.append(Span(doc, start, end, label=nlp.vocab.strings[match_id]))
        doc.spans['sc'] = labelled_spans

        # Uncomment to inspect the spans that were attached
        # for span in doc.spans['sc']:
        #     print(span.text, span.start, span.label_)

        doc_bin.add(doc)

    # Write the collected docs to outfile's path
    doc_bin.to_disk(Path(outfile))
    # return doc_bin

# Test function
# convert(train_data, '')

# Convert all 3 datasets to DocBins. Output to disk
# Comment out to prevent overwriting the data we used
# convert(train_data, './Spacy_Files/train.spacy')
# print('Train saved')
# convert(dev_data, './Spacy_Files/dev.spacy')
# print('Dev saved')
# convert(test_data, './Spacy_Files/test.spacy')
# print('Test saved')

In [None]:
# Fill the config file for SpanCat training
# Config file downloaded from https://spacy.io/usage/training#quickstart
# !python -m spacy init fill-config ./Spacy_Files/sc_base_config.cfg ./Spacy_Files/sc_config.cfg --diff

In [None]:
# Initiate training using command line and config file downloaded from spacy.io
# !python -m spacy train ./Spacy_Files/sc_config.cfg --paths.train ./Spacy_Files/train.spacy --paths.dev ./Spacy_Files/dev.spacy --output ./Spacy_Files/model --verbose

In [None]:
# Evaluate the model based on the test data created above
# !python -m spacy evaluate ./Spacy_Files/model/model-best/ ./Spacy_Files/test.spacy

In [None]:
# Load our new model from disk.
# This is the model-best directory under the --output path of the `spacy train`
# command above; that command is commented out, so the directory must already exist.
nlp_custom = spacy.load(Path('./Spacy_Files/model/model-best'))

In [None]:
# Run the custom model on a hijacking narrative and render the spans it assigns
hijack_example = "Pirates hijacked the vessel taking 14 crew members hostage. Owners hired a private aircraft to search for the ship and spotted the ship heading North towards the Somali coast"
displacy.render(nlp_custom(hijack_example), style='span')

In [None]:
# Run the custom model on a boarding narrative and render the spans it assigns.
# NOTE(review): the text contains a mis-encoded apostrophe ("ship‚Äôs"), presumably
# carried over from the dirty source data. The string is passed to the model as-is --
# confirm whether the garbled characters are intentional before cleaning them up.
boarded_example = "Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship‚Äôs engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS."
displacy.render(nlp_custom(boarded_example), style='span')

In [None]:
# Render the custom model's spans for the same boarding narrative, here written with a
# plain ASCII apostrophe in "ship's".
# NOTE(review): the name suggests an example where no spans are expected, yet the text
# describes a boarding -- confirm what this cell is meant to demonstrate.
none_example = "Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS."
displacy.render(nlp_custom(none_example), style='span')

In [None]:
# Apply new nlp to our data and put into dataframe using model_interpreter function
# Takes about 15 seconds to run
# training_data is reassigned to the helper's return value.
# NOTE(review): presumably this replaces the matcher-based BOARDED/HIJACKED flags with
# the model's predictions -- confirm against model_interpreter in spacy_model.py.
training_data = model_interpreter(training_data, 'Incident_details', nlp_custom)
training_data.head(10)

In [None]:
# False negatives in the training data: Boarded_label == 1 but BOARDED == 0
false_negative = training_data.loc[
    (training_data['Boarded_label'] == 1) & (training_data['BOARDED'] == 0),
    'Incident_details',
]

# Report the count and render every false negative.
# NOTE(review): rendering still goes through `nlp` and `matcher`, not `nlp_custom` --
# confirm the highlights shown are the ones intended for the trained model.
print(f'{len(false_negative)} total false negatives.')
html_print(html_generator((nlp(text) for text in false_negative), matcher=matcher, n=len(false_negative)))

In [None]:
# False positives in the training data: Boarded_label == 0 but BOARDED == 1
false_positive = training_data.loc[
    (training_data['Boarded_label'] == 0) & (training_data['BOARDED'] == 1),
    'Incident_details',
]

# Report the count and render every false positive.
# NOTE(review): rendering still goes through `nlp` and `matcher`, not `nlp_custom` --
# confirm the highlights shown are the ones intended for the trained model.
print(f'{len(false_positive)} total false positives.')
html_print(html_generator((nlp(text) for text in false_positive), matcher=matcher, n=len(false_positive)))

In [None]:
# Boarded confusion matrix: manual label is the truth, BOARDED flag is the prediction
confusion_matrix(y_true=training_data['Boarded_label'], y_pred=training_data['BOARDED'])

In [None]:
# Boarded performance report: manual label is the truth, BOARDED flag is the prediction
print(classification_report(y_true=training_data['Boarded_label'], y_pred=training_data['BOARDED']))

In [None]:
# Hijacked confusion matrix: manual label is the truth, HIJACKED flag is the prediction
confusion_matrix(y_true=training_data['Hijacked_label'], y_pred=training_data['HIJACKED'])

In [None]:
# Hijacked performance report: manual label is the truth, HIJACKED flag is the prediction
print(classification_report(y_true=training_data['Hijacked_label'], y_pred=training_data['HIJACKED']))

In [None]:
# Apply our nlp to the entire dataset
# Takes about 4.5 minutes to run
# This runs on piracy_df_imo itself, not on the piracy_df_imo_masked frame used for the
# matcher, and reassigns piracy_df_imo to the helper's return value.
piracy_df_imo = model_interpreter(piracy_df_imo, 'Incident details', nlp_custom)
piracy_df_imo.head(10)

In [None]:
# Narrow the view to the incident text and the four flag columns
piracy_df_imo[['Incident details', 'BOARDED', 'HIJACKED', 'HOSTAGES_TAKEN', 'CREW_ASSAULTED']].head(10)

In [None]:
# Two sample narratives (one hijacking, one boarding) to show the custom model at work
hijack_example = "Twelve pirates armed with rifles and pistols attacked the fishing vessel, while underway.  Five pirates boarded, hijacked the vessel and demanded payment.  Crew managed to overpower the pirates and handed them over to the Philippines National Police"
boarded_example = "Three robbers armed with long knives boarded an anchored tanker during heavy rain. They entered the engine room, threatened the duty engineer and stole ship's engine spares. Incident reported to the OOW who raised the alarm resulting in the robbers escaping in a waiting boat along with two accomplices. Incident reported to VTS."

# Parse both with the custom model, in the same order as before
hijack_doc, boarded_doc = (nlp_custom(example) for example in (hijack_example, boarded_example))

In [None]:
# Render examples of the spans that were detected and labeled.
# Both docs come from nlp_custom and are passed to displacy together as a list.
displacy.render([hijack_doc, boarded_doc], style='span')

### Creating products for presentation

In [None]:
# Build the dependency-parse figure for the presentation.
# jupyter=False makes displacy return the SVG markup as a string instead of displaying it.
doc = nlp("On seeing crew alertness, the robbers jumped overboard and escaped empty handed in a waiting boat with one accomplice.")
svg = displacy.render(doc, style='dep', jupyter=False,
                      options={'bg': 'white', 'compact': False, 'distance': 130,
                               'fine_grained': True, 'font_size': 30})

# Output to file.
# write_text opens, writes and closes the file in one call; the previous
# open(...).write(...) left the file handle unclosed. The results directory is created
# first so the write does not fail when it is missing.
output_path = Path("./Data_Files/Results/dependency_plot3.svg")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(svg, encoding="utf-8")

In [None]:
# Sample sentences for the presentation, one per line; html_generator renders each
# with the matcher's highlights
test_sents = [
    "Three robbers in a boat boarded the ship, while at anchor, broke into ship's stores and when alert crew mustered, they escaped with their loot.",
    "Incident reported to port authority through the local agents. Upon berthing, the police boarded the ship for investigation.",
    "A pirate boat attempted to board the ship while underway.",
    "While underway, the master of the bulk carrier reported to Singapore VTIS that four perpetrators were sighted in the steering gear room. The master raised the alarm and mustered crew. A security search on board was conducted and at about 0345 hrs, the security search was completed and no perpetrators found.",
    "The duty watchman onboard the ship at anchor, noticed robbers trying to open the watertight door to the forecastle store. He immediately alerted the OOW who, in turn, raised the alarm and crew mustered. On seeing crew alertness, the robbers jumped overboard and escaped empty handed in a waiting boat with one accomplice.",
]
# test_docs = list(nlp.pipe(test_sents))
html_print(html_generator((nlp(sent) for sent in test_sents), matcher=matcher, n=len(test_sents)))