# Compare Models


In [1]:
from __future__ import print_function, division
%matplotlib inline
import sys
import numpy as np
sys.path.append('..')
import warnings; warnings.filterwarnings('ignore')
from IPython.core.display import display, HTML, Markdown
import datetime
import pytz
from sklearn import metrics
import csv
import vessel_scoring.models
import vessel_scoring.evaluate_model
from vessel_scoring import utils
import vessel_scoring.data
from glob import glob
import os
from vessel_scoring import data, utils
from numpy.lib.recfunctions import append_fields


sys.path.append("../../training-data-source")
import tools as trtools
sys.path.append("../../vessel-classification-pipeline/classification")
from  classification.utility import is_test

In [2]:
# Load the data. 
# ==============

# Load training and test data

def load_data(*names, **kw):
    res = []
    for name in names:
        d = np.load('../../datasets/data/labeled/%s.measures.labels.npz' % name)['x']
        d = append_fields(d, 'classification', d['is_fishing'])
        d = d[~(np.isinf(d['classification']) | np.isnan(d['classification']) | 
                np.isnan(d['timestamp']) | np.isnan(d['speed']) & np.isnan(d['course']))]
        if len(d):
            res.append(d)
    return np.concatenate(res)

all_lline = load_data("pybossa_project_3_Drifting_longlines")
all_trawl = load_data("pybossa_project_3_Trawlers")
all_pseine = load_data("pybossa_project_3_Purse_seines")

# Slow transits (used to train models to avoid classifying slow transits as fishing)
all_tran = load_data(
    "false_positives_Drifting_longlines",
    "false_positives_Fixed_gear",
    "false_positives_Purse_seines",
    "false_positives_Trawlers",
    "false_positives_Unknown")

In [3]:
rngdata = {
    'longliner' : np.concatenate([all_lline, all_tran]),
    'trawler' : np.concatenate([all_trawl, all_tran]),
    'ps' : np.concatenate([all_pseine, all_tran])
}

tran_mmsi = set(all_tran['mmsi'])

In [4]:
# Then load mappings MMSI to true / inferred gear type

def gear_name_of(x):
    return x        
        
# Create datasets for training and testing

def is_test_id(x):
    # TODO: use list of MMSI from somewhere to 
    # 
    return is_test(int(x))


all_data = np.concatenate([x for x in rngdata.values()])

all_data['is_fishing'] = (all_data['is_fishing'] > 0.5)


In [5]:
test_data = {}
train_data = {}
np.random.seed(4321)
for gear_type in sorted(rngdata):    
    available_mmsi = sorted(set(rngdata[gear_type]['mmsi']) - tran_mmsi)
    np.random.shuffle(available_mmsi)
    n = len(available_mmsi) // 2
    # use the true gear types for the training data
    train_mmsi = set(available_mmsi[:n]) | tran_mmsi
    true_mask = [(x in train_mmsi) for x in rngdata[gear_type]['mmsi']]
    true_mask = np.array(true_mask, dtype=bool)
    if true_mask.sum():
        train_data[gear_type] = rngdata[gear_type][true_mask]    

    test_mmsi = set(available_mmsi[n:])
    test_mask = [(x in test_mmsi) for x in rngdata[gear_type]['mmsi']]
    test_mask = np.array(test_mask, dtype=bool)
    if test_mask.sum():
        test_data[gear_type] = rngdata[gear_type][test_mask]


In [6]:
for gear_type in sorted(rngdata):
    total_train = len(set(train_data[gear_type]['mmsi']))
    print(gear_type, total_train, len(set(train_data[gear_type]['mmsi'])) - len(tran_mmsi), len(set(test_data[gear_type]['mmsi'])))
    assert not set(train_data[gear_type]['mmsi']) & set(test_data[gear_type]['mmsi'])

longliner 98 9 10
ps 96 7 8
trawler 104 15 15


In [7]:
def compare_fishing_localization(true_ranges, inferred_ranges):

    all_mmsi = sorted(set(x.mmsi for x in true_ranges))

    true_by_mmsi = {}
    pred_by_mmsi = {}

    for mmsi in all_mmsi:
        true = np.array([x for x in true_ranges if x.mmsi == mmsi])
        inferred = np.array([x for x in inferred_ranges if x.mmsi == mmsi])

        # Determine minutes from start to finish of this mmsi, create an array to
        # hold results and fill with -1 (unknown)
        _, start, end, _ = true[0]
        for (_, s, e, _) in true[1:]:
            start = min(start, s)
            end = max(end, e)
        start_min = datetime_to_minute(start)
        end_min = datetime_to_minute(end)
        
        minutes = np.empty([end_min - start_min + 1, 2], dtype=int)
        minutes.fill(-1)

        # Fill in minutes[:, 0] with known true / false values
        for (_, s, e, is_fishing) in true:
            s_min = datetime_to_minute(s)
            e_min = datetime_to_minute(e)
            for m in range(s_min - start_min, e_min - start_min + 1):
                minutes[m, 0] = is_fishing

        # fill in minutes[:, 1] with in inferred values
        for (_, s, e, is_fishing) in inferred:
            s_min = datetime_to_minute(s)
            e_min = datetime_to_minute(e)
            for m in range(s_min - start_min, e_min - start_min + 1):
                if 0 <= m < len(minutes):
                    minutes[m, 1] = is_fishing       
 
        mask = ((minutes[:, 0] != -1) & (minutes[:, 1] != -1))

        if mask.sum():
            true_by_mmsi[mmsi] = minutes[mask, 0]
            pred_by_mmsi[mmsi] = minutes[mask, 1]
            
    return true_by_mmsi, pred_by_mmsi

def ranges_from_ais(ais, is_fishing):
    points = [trtools.Point(x['mmsi'], 
                            datetime.datetime.utcfromtimestamp(x['timestamp']).replace(tzinfo=pytz.utc), is_fishing[i])
              for i, x in enumerate(ais)]
    return trtools.ranges_from_points(points)

def datetime_to_minute(dt):
    timestamp = (dt - datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)).total_seconds()
    return int(timestamp // 60)

def model_true_inferred(mdl, test_data):
    check =  mdl.predict_proba(test_data)[:,1]
    
    inferred_ranges = list(ranges_from_ais(test_data, mdl.predict_proba(test_data)[:,1] > 0.5))
    true_ranges = list(ranges_from_ais(test_data, test_data['is_fishing'] > 0.5))
    
    true, inferred = compare_fishing_localization(true_ranges, inferred_ranges)
    
    y_true = np.concatenate(true.values())
    y_pred = np.concatenate(inferred.values())
    
    return y_true, y_pred

def model_metrics(y_true, y_pred):
    
    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    f1 = metrics.f1_score(y_true, y_pred)
    
    return precision, recall, accuracy, f1, y_true, y_pred
    

In [8]:
models = [
        ('Random Forest with Distances', vessel_scoring.random_forest_model.RandomForestModel(
                colspec=dict(windows=vessel_scoring.colspec.Colspec.windows,
                                measures=['speed', 'distance_from_shore', 'distance_from_port']))),
        ('Logistic', vessel_scoring.logistic_model.LogisticModel(order=6,
                colspec=dict(windows=vessel_scoring.colspec.Colspec.windows,
                                measures=['measure_speed']))),
] 

for name, mdl in models:

    lines = ["|Model|Train MMSI|Train Pts|Test MMSI|Test Pts|Precision|Recall|Accuracy|F1-Score|True-Ratio|",
             "|-----|----------|---------|---------|--------|---------|------|--------|--------|----------|"]

    all_true = []
    all_pred = []
    all_mmsi = []

    for gear in sorted(rngdata):
                
        title = gear

        gear_name = gear #os.path.splitext(os.path.splitext(gear)[0])[0][9:]

        print(".", end="")
        
        if gear not in test_data:
            continue
        
        trained = vessel_scoring.models.train_model_on_data(mdl, train_data[gear])
        
        y_true, y_pred =  model_true_inferred(trained, test_data[gear])
        

        mmsi = test_data[gear]['mmsi'].astype(int)
        all_mmsi.append(mmsi)
                
        all_true.append(y_true)
        all_pred.append(y_pred)

        train_mmsi = set(train_data[gear]['mmsi']) - tran_mmsi
        train_pts = sum([(train_data[gear]['mmsi'] == m).sum() for m in train_mmsi])
        #
        mmsi_count = len(set(mmsi))
        n_points = len(y_true)
        accuracy = metrics.accuracy_score(y_true, y_pred)
        precision = metrics.precision_score(y_true, y_pred)
        recall = metrics.recall_score(y_true, y_pred)
        f1 = metrics.f1_score(y_true, y_pred)
        ratio = y_true.sum() / len(y_true)
        # Add to Markdown table
        lines.append('|{}|{}|{}|{}|{}|'.format(gear, len(train_mmsi), train_pts, mmsi_count, n_points) + '|'.join(['{:.2f}'.format(x) for x in
                                    [precision, recall, accuracy, f1, ratio]]) + '|')
 
    display(HTML("<h2>{}</h2>".format(name)))

    display(Markdown('\n'.join(lines)))       

...

|Model|Train MMSI|Train Pts|Test MMSI|Test Pts|Precision|Recall|Accuracy|F1-Score|True-Ratio|
|-----|----------|---------|---------|--------|---------|------|--------|--------|----------|
|longliner|9|108662|10|270174|0.96|0.64|0.82|0.77|0.46|
|ps|7|20344|8|117626|0.82|0.54|0.84|0.65|0.28|
|trawler|15|145458|15|510027|0.98|0.97|0.99|0.98|0.32|

...

|Model|Train MMSI|Train Pts|Test MMSI|Test Pts|Precision|Recall|Accuracy|F1-Score|True-Ratio|
|-----|----------|---------|---------|--------|---------|------|--------|--------|----------|
|longliner|9|108662|10|266893|0.90|0.89|0.90|0.89|0.46|
|ps|7|20344|8|117203|0.90|0.23|0.78|0.37|0.28|
|trawler|15|145458|15|508505|0.96|0.96|0.97|0.96|0.32|

In [9]:
# Note that the training counts excludes the transits, which are included in the training set as well
print("Transit MMSI:", len(tran_mmsi))
print("Transit Points:", sum([(train_data[gear]['mmsi'] == m).sum() for m in tran_mmsi]))

Transit MMSI: 89
Transit Points: 19728
