In [1]:
import pandas as pd
import numpy as np

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

# Get raw data

In [90]:
def get_data(path, file_names):
    tables=[]
    for file_name in file_names:
        table = pd.read_csv(f'{path}/{file_name}.csv')
        table['type']=f'{file_name}'
        tables.append(table)
    return pd.concat(tables)

path='raw_data'
file_names = ['trawlers', 'drifting_longlines', 'fixed_gear', 'pole_and_line', 'purse_seines', 'trollers', 'unknown']
df_original = get_data(path, file_names)

# Preprocessing

## Remove unknown (-1 in is_fishing column)

In [155]:
df = df_original.copy()

In [156]:
df = df.loc[df['is_fishing'] > -1]
df['is_fishing'].value_counts()

0.000000    295979
1.000000    247498
0.666667      4806
0.333333      4096
0.750000       752
0.250000       670
0.800000        33
0.166667        12
0.400000         9
Name: is_fishing, dtype: int64

## Converting is_fishing to Binary (0 or 1)

In [157]:
df_fishing = df

# round the decimals so that number becomes 0 or 1
df_fishing.loc[:, ('is_fishing')] = round(df_fishing.loc[:, ('is_fishing')])

# check the unique values
df_fishing['is_fishing'].value_counts()

0.0    300766
1.0    253089
Name: is_fishing, dtype: int64

## remove type, as it is target of second model 

In [158]:
df_fishing.drop(columns = ["type"], inplace = True)

## remove boat history track that are too small ( <15 )

In [159]:
# Calculate the value counts of 'mmsi'
mmsi_counts = df_fishing['mmsi'].value_counts()

# Create a boolean mask for filtering mmsi values with counts less than or equal to 15
mask = mmsi_counts > 15

# Get the mmsi values that meet the condition
selected_mmsi = mmsi_counts[mask].index

# Use the isin() method to filter the DataFrame based on selected_mmsi
filtered_fishing_df = df_fishing[df_fishing['mmsi'].isin(selected_mmsi)]

In [160]:
# Dropping rows with NAN values
df_1 = filtered_fishing_df.dropna()

In [161]:
df_1.shape

(553534, 10)

In [162]:
df_fishing_clean= df_1[0:10000]

In [163]:
df_fishing_clean.shape

(10000, 10)

In [164]:
df_fishing_clean.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source
60646,1252340000000.0,1420089000.0,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw
60647,1252340000000.0,1420090000.0,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw
60648,1252340000000.0,1420090000.0,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw
60649,1252340000000.0,1420091000.0,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw
60650,1252340000000.0,1420092000.0,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw


In [165]:
df_fishing_clean.columns

Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon', 'is_fishing', 'source'],
      dtype='object')

# Split Data

In [166]:
# Defining X - the features and Y - the target
X = df_fishing_clean.drop(columns=['source','is_fishing'])
y = df_fishing_clean['is_fishing']
X.shape, y.shape

((10000, 8), (10000,))

In [167]:
X

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon
60646,1.252340e+12,1.420089e+09,0.0,0.000000,0.0,128.000000,52.458717,4.581316
60647,1.252340e+12,1.420090e+09,0.0,0.000000,0.0,128.000000,52.458733,4.581316
60648,1.252340e+12,1.420090e+09,0.0,0.000000,0.0,128.000000,52.458698,4.581267
60649,1.252340e+12,1.420091e+09,0.0,0.000000,0.0,128.000000,52.458698,4.581234
60650,1.252340e+12,1.420092e+09,0.0,0.000000,0.0,128.000000,52.458683,4.581183
...,...,...,...,...,...,...,...,...
325375,3.491170e+13,1.450915e+09,0.0,3162.200195,0.0,0.000000,56.516781,20.994823
325376,3.491170e+13,1.450916e+09,0.0,3162.200195,0.0,93.199997,56.516792,20.994837
325377,3.491170e+13,1.450917e+09,0.0,3162.200195,0.0,93.300003,56.516754,20.994816
325378,3.491170e+13,1.450917e+09,0.0,3162.200195,0.0,0.000000,56.516766,20.994833


In [168]:
X.columns

Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon'],
      dtype='object')

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=88)

# Make Pipeline

## OHE month and day of week and scaling for the rest

In [177]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_union

# Define a function to convert datetime to date
def datetime_to_date(df):
    df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    df['month'] = df['date'].dt.month
    df['day_of_week'] = df['date'].dt.day_of_week
    return df[['month', 'day_of_week']]

# Create column transformer for date time
date_trans = ColumnTransformer(
    transformers=[('date_transformer', FunctionTransformer(datetime_to_date), ['timestamp'])],
    remainder='passthrough'  # Keep the remaining columns
)

# Define the columns that need different preprocessing
numeric_cols = ['mmsi', 'distance_from_shore', 'distance_from_port', 'speed', 'course']
minmax_cols = ['lat', 'lon']

# Create transformers for each type of preprocessing
numeric_transformer = StandardScaler()
minmax_transformer = MinMaxScaler()

# Create a ColumnTransformer to apply different transformers to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('minmax', minmax_transformer, minmax_cols)],
    remainder='passthrough')  # Bypass these features)

preproc_full = make_union(date_trans, preprocessor)

# Create the Random Forest Classifier model
rf_model = RandomForestClassifier()

# Create the pipeline using make_pipeline
pipeline = make_pipeline(preproc_full, rf_model)

pipeline

# Train Model

In [178]:
X_train.columns

Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon'],
      dtype='object')

In [179]:
X_test.columns

Index(['mmsi', 'timestamp', 'distance_from_shore', 'distance_from_port',
       'speed', 'course', 'lat', 'lon'],
      dtype='object')

In [180]:
# Train Pipeline
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f} Precision:{precision:.2f}")

Accuracy: 0.99 Precision:0.96


# Export Model

In [181]:
from joblib import dump

# Define the path for saving the model
model_path = "rff_model2.joblib"

# Save the trained model to the specified path
dump(pipeline, model_path)

print(f"Model saved at: {model_path}")

Model saved at: rff_model2.joblib


In [182]:
import joblib
pipeline_test =joblib.load("rff_model2.joblib")

In [184]:
pipeline_test.predict(X_test)

array([1., 1., 0., ..., 0., 0., 1.])

In [185]:
pipeline_test.named_steps

{'featureunion': FeatureUnion(transformer_list=[('columntransformer-1',
                                 ColumnTransformer(remainder='passthrough',
                                                   transformers=[('date_transformer',
                                                                  FunctionTransformer(func=<function datetime_to_date at 0x1406011b0>),
                                                                  ['timestamp'])])),
                                ('columntransformer-2',
                                 ColumnTransformer(remainder='passthrough',
                                                   transformers=[('num',
                                                                  StandardScaler(),
                                                                  ['mmsi',
                                                                   'distance_from_shore',
                                                                   'distance_from_port',
  