In [1]:
import pandas as pd
import numpy as np

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

# Get raw data

In [2]:
def get_data(path, file_names):
    tables=[]
    for file_name in file_names:
        table = pd.read_csv(f'{path}/{file_name}.csv')
        table['type']=f'{file_name}'
        tables.append(table)
    return pd.concat(tables)

path='raw_data'
file_names = ['trawlers', 'drifting_longlines', 'fixed_gear', 'pole_and_line', 'purse_seines', 'trollers', 'unknown']
df = get_data(path, file_names)

# Preprocessing

## Remove unknown (-1 in is_fishing column)

In [3]:
df = df.loc[df['is_fishing'] > -1]
df['is_fishing'].value_counts()

is_fishing
0.000000    295979
1.000000    247498
0.666667      4806
0.333333      4096
0.750000       752
0.250000       670
0.800000        33
0.166667        12
0.400000         9
Name: count, dtype: int64

## Converting is_fishing to Binary (0 or 1)

In [4]:
df_fishing = df

# round the decimals so that number becomes 0 or 1
df_fishing.loc[:, ('is_fishing')] = round(df_fishing.loc[:, ('is_fishing')])

# check the unique values
df_fishing['is_fishing'].value_counts()

is_fishing
0.0    300766
1.0    253089
Name: count, dtype: int64

## remove type, as it is target of second model 

In [5]:
df_fishing.drop(columns = ["type"], inplace = True)

## Date time (hour -> Angular distance)

In [6]:
df_fishing['timestamp'] = pd.to_datetime(df_fishing['timestamp'], unit='s')
df_fishing.rename(columns={"timestamp": "date"}, inplace=True)
df_fishing['hour'] = df_fishing['date'].dt.hour
df_fishing['month'] = df_fishing['date'].dt.month
df_fishing['day_of_week'] = df_fishing['date'].dt.day_of_week
df_fishing.head(2)

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,hour,month,day_of_week
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,5,1,3
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,5,1,3


In [7]:
df_fishing['hour_sin'] = np.sin(df_fishing['hour'] * (2 * np.pi / 24))
df_fishing['hour_cos'] = np.cos(df_fishing['hour'] * (2 * np.pi / 24))

## remove boat history track that are too small ( <15 )

In [8]:
# Calculate the value counts of 'mmsi'
mmsi_counts = df_fishing['mmsi'].value_counts()

# Create a boolean mask for filtering mmsi values with counts less than or equal to 15
mask = mmsi_counts > 15

# Get the mmsi values that meet the condition
selected_mmsi = mmsi_counts[mask].index

# Use the isin() method to filter the DataFrame based on selected_mmsi
filtered_fishing_df = df_fishing[df_fishing['mmsi'].isin(selected_mmsi)]

In [20]:
# Dropping rows with NAN values
df_1 = filtered_fishing_df.dropna()

In [21]:
df_1.shape

(553534, 15)

In [22]:
df_fishing_clean= df_1[0:10000]

In [23]:
df_fishing_clean.shape

(10000, 15)

# Split Data

In [24]:
# Defining X - the features and Y - the target
X = df_fishing_clean.drop(columns=['source','date','hour','is_fishing', 'hour_sin', 'hour_cos'])
y = df_fishing_clean['is_fishing']
X.shape, y.shape

((10000, 9), (10000,))

In [25]:
X

Unnamed: 0,mmsi,distance_from_shore,distance_from_port,speed,course,lat,lon,month,day_of_week
60646,1.252340e+12,0.0,0.000000,0.0,128.000000,52.458717,4.581316,1,3
60647,1.252340e+12,0.0,0.000000,0.0,128.000000,52.458733,4.581316,1,3
60648,1.252340e+12,0.0,0.000000,0.0,128.000000,52.458698,4.581267,1,3
60649,1.252340e+12,0.0,0.000000,0.0,128.000000,52.458698,4.581234,1,3
60650,1.252340e+12,0.0,0.000000,0.0,128.000000,52.458683,4.581183,1,3
...,...,...,...,...,...,...,...,...,...
325375,3.491170e+13,0.0,3162.200195,0.0,0.000000,56.516781,20.994823,12,2
325376,3.491170e+13,0.0,3162.200195,0.0,93.199997,56.516792,20.994837,12,3
325377,3.491170e+13,0.0,3162.200195,0.0,93.300003,56.516754,20.994816,12,3
325378,3.491170e+13,0.0,3162.200195,0.0,0.000000,56.516766,20.994833,12,3


In [26]:
X.columns

Index(['mmsi', 'distance_from_shore', 'distance_from_port', 'speed', 'course',
       'lat', 'lon', 'month', 'day_of_week'],
      dtype='object')

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=88)

# Make Pipeline

## OHE month and day of week and scaling for the rest

In [28]:
# Define the columns that need different preprocessing
numeric_cols = ['mmsi', 'distance_from_shore', 'distance_from_port', 'speed', 'course']
minmax_cols = ['lat', 'lon']
ohe_cols = ['month', 'day_of_week']

# Create transformers for each type of preprocessing
numeric_transformer = StandardScaler()
minmax_transformer = MinMaxScaler()
ohe_transformer = OneHotEncoder(drop='first', sparse=False)

# Create a ColumnTransformer to apply different transformers to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('minmax', minmax_transformer, minmax_cols),
        ('ohe', ohe_transformer, ohe_cols)],
    remainder='passthrough')  # Bypass these features)

# Create the Random Forest Classifier model
rf_model = RandomForestClassifier()

# Create the pipeline using make_pipeline
pipeline = make_pipeline(preprocessor, rf_model)

pipeline

# Train Model

In [29]:
# Train Pipeline
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f} Precision:{precision:.2f}")

Accuracy: 0.99 Precision:0.93


# Export Model

In [30]:
from joblib import dump
import os

# Get the current directory
#current_directory = os.getcwd()

# Navigate to the desired directory
#target_directory = os.path.join(current_directory, '..', 'mlops/training_outputs/models')

# Create the target directory if it doesn't exist
#os.makedirs(target_directory, exist_ok=True)

# Define the path for saving the model
model_path = "rff_model.joblib"

# Save the trained model to the specified path
dump(pipeline, model_path)

print(f"Model saved at: {model_path}")

Model saved at: rff_model.joblib


In [31]:
import joblib
pipeline_test =joblib.load("rff_model.joblib")

In [32]:
pipeline_test.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('num', StandardScaler(),
                                  ['mmsi', 'distance_from_shore',
                                   'distance_from_port', 'speed', 'course']),
                                 ('minmax', MinMaxScaler(), ['lat', 'lon']),
                                 ('ohe',
                                  OneHotEncoder(drop='first', sparse=False),
                                  ['month', 'day_of_week'])]),
 'randomforestclassifier': RandomForestClassifier()}