In [None]:
# Step 1: Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tabular_augmentation import sdv_synthesis, sdv_synthesis_cvae
from tabular_augmentation import smote_augmentation
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

In [None]:
# Step 2: Load the training data

# The training_data_source can be:
#     fishing: data coming from the fishing reports
#     AIS: data derived from the trajectories
#     fusion: data coming from joining the two previous datasets
training_data_source = 'fishing'

# The file name is "<source>.csv"; the dot has to be added explicitly,
# otherwise pandas is asked to open e.g. "fishingcsv".
train = pd.read_csv(training_data_source + '.csv')
# Rows with any missing value are discarded before features/labels are split.
train = train.dropna()
X_train = train[['Latitude','Longitude','Elevation','TWeekAvg','CWeekAvg','WWeekAvg','SWeekAvg','11W','Week']]
y_train = train['Tag']

# Define the augmentation procedure:
#     None: do not use augmentation
#     SMOTE: use smote augmentation
#     CTGAN: use CTGAN
augmentation = None

if augmentation == 'SMOTE':
    oversample_rate = 1.5 # [1.25,1.5,1.75,2,2.25,2.5]
    method = 'SVMSMOTE'
    # oversample_num is computed from the size of the original training set.
    X_train, y_train = smote_augmentation(X_train, y_train, method, seed=30,
                                          oversample_num=int(oversample_rate*len(X_train)), positive_ratio=None,
                                          knn_neighbors=3)
elif augmentation == 'CTGAN':
    oversample_rate = 1.5 # [1.25,1.5,1.75,2,2.25,2.5]
    # `method` used to be assigned only in the SMOTE branch, so this branch
    # either raised NameError or silently reused a stale 'SVMSMOTE' value.
    # NOTE(review): assumes sdv_synthesis accepts 'CTGAN' as the synthesizer
    # name -- confirm against the tabular_augmentation package.
    method = 'CTGAN'
    X_train, y_train = sdv_synthesis(
            X_train, y_train, method, oversample_num=int(oversample_rate*len(X_train)),
            seed=30, init_synthesizer=True, positive_ratio=0.5,
        )
elif augmentation is not None:
    # A misspelled option would otherwise silently disable augmentation.
    raise ValueError("augmentation must be None, 'SMOTE' or 'CTGAN', got %r" % (augmentation,))

In [None]:
# Step 3: Load testing data

# Feature columns selected for the test matrix (same list as used for X_train).
test_feature_cols = ['Latitude','Longitude','Elevation','TWeekAvg','CWeekAvg','WWeekAvg','SWeekAvg','11W','Week']

# Read the file, drop incomplete rows, parse 'Day' as a datetime and derive the
# ISO week number from it, then order the rows and remove exact duplicates.
test = (
    pd.read_csv('test.csv')
    .dropna()
    .assign(Day=lambda frame: pd.to_datetime(frame['Day']))
    .assign(Week=lambda frame: frame['Day'].dt.isocalendar().week)
    .sort_values(by=['Latitude','Longitude','Day'])
    .drop_duplicates()
)

X_test = test[test_feature_cols]
# The test labels are read from the 'KN' column.
y_test = test['KN']

In [None]:
# Step 4: Train SVM with cross_validation

# Hyperparameter candidates; gamma, random_state and class_weight are held
# fixed, only the kernel and C are actually searched.
svm_param_grid = {
    'kernel': [ 'linear','poly', 'rbf'],
    'C': [0.01,0.1,1,2,10,50,100],
    'gamma':['auto'],
    'random_state':[8284],
    'class_weight':['balanced']
}
svm_scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']

svmc = svm.SVC()

# refit=False: the search only scores candidates, no final estimator is refit.
# error_score='raise': a failing candidate aborts the search instead of being
# recorded with a placeholder score.
svm_cv = GridSearchCV(
    estimator=svmc,
    param_grid=svm_param_grid,
    scoring=svm_scoring,
    refit=False,
    error_score='raise',
)
svm_cv.fit(X_train, y_train)

results_cv = pd.DataFrame(svm_cv.cv_results_)

# Columns kept for the summary table.
cols = [
    'mean_fit_time',
    'param_kernel', 'param_C', 'param_gamma',
    'mean_test_f1_macro', 'mean_test_precision_macro',
    'mean_test_recall_macro', 'mean_test_accuracy',
]

# Candidates ranked by mean macro precision, best first.
sorted_results = results_cv.loc[:, cols].sort_values(by='mean_test_precision_macro', ascending=False)

# The top row is the candidate with the highest mean macro precision.
best_result = sorted_results.iloc[0]

# Hyperparameters of that candidate, reused when the final SVM is trained.
best_C, best_kernel = best_result['param_C'], best_result['param_kernel']


In [None]:
# Step 5: Train RF with cross validation

forest = RandomForestClassifier()

# Hyperparameter candidates for the forest. class_weight='balanced' is part of
# the grid so that the configuration being tuned matches the forest that is
# trained with the selected values in Step 6 (which uses class_weight='balanced').
param_grid = {
    'n_estimators': [100],
    'max_depth' : [4,5,6,7],
    'max_features' :[2,3,4,5],
    'criterion' :['gini', 'entropy'],
    'min_samples_split' :[2,5,10],
    'random_state':[8284],
    'class_weight':['balanced'],
}

# refit=False: candidates are only scored; the final model is fit separately.
clf = GridSearchCV(estimator=forest, param_grid=param_grid, cv= 5,verbose=1,
                   scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro'],refit=False)
clf.fit(X_train,y_train)

results_cv = pd.DataFrame(clf.cv_results_)

# Columns kept for the summary table.
cols = ['mean_fit_time',
        'param_max_depth','param_max_features','param_criterion',
        'param_min_samples_split','mean_test_f1_macro', 'mean_test_precision_macro']

# Rank the forest candidates by mean macro precision, best first. The sorted
# frame must be kept: previously it was discarded and the SVM table from
# Step 4 was read instead, which has no 'param_max_depth' column (KeyError).
rf_sorted_results = results_cv[cols].sort_values(by='mean_test_precision_macro',ascending=False)

# Select the row with the highest mean macro precision
best_result = rf_sorted_results.iloc[0]

# Extract the best hyperparameters; integer settings are cast to plain ints so
# they do not depend on how pandas boxed the values in the row.
best_depth = int(best_result['param_max_depth'])
best_features = int(best_result['param_max_features'])
best_criterion = best_result['param_criterion']
best_samples_split = int(best_result['param_min_samples_split'])

In [None]:
# Step 6: Train models with best parameters and full training data

# SVM built from the kernel and C selected in Step 4; the remaining settings
# repeat the fixed values of the SVM grid, plus probability=True.
model_svm = svm.SVC(
    kernel=best_kernel,
    C=best_C,
    gamma='auto',
    class_weight='balanced',
    probability=True,
    random_state=8284,
)
model_svm.fit(X_train, y_train)

# Random forest built from the hyperparameters selected in Step 5.
rf_settings = dict(
    n_estimators=100,
    criterion=best_criterion,
    max_depth=best_depth,
    max_features=best_features,
    min_samples_split=best_samples_split,
    class_weight='balanced',
    random_state=8284,
)
model_rf = RandomForestClassifier(**rf_settings)
model_rf.fit(X_train,y_train)

In [None]:
# Step 7: Train a soft voting model

# voting='soft' must be requested explicitly: VotingClassifier defaults to
# 'hard' (majority vote on predicted labels). Soft voting combines the
# predicted class probabilities of the members, which is why the SVM in
# Step 6 is created with probability=True.
voting_best = VotingClassifier([('rf',model_rf),('svm',model_svm)], voting='soft')
voting_best.fit(X_train,y_train)

In [None]:
# Step 8: Get the best model and obtain final results

def metrics(X_test,y_test,clf):
    """Print accuracy, precision and recall of ``clf`` on the given data.

    Predictions are obtained with ``clf.predict(X_test)`` and compared with
    ``y_test``. Each score is printed as a percentage rounded to two
    decimals. The scorers are called with their default arguments.

    Returns None.
    """
    y_pred = clf.predict(X_test)

    # (label, scorer) pairs, evaluated and printed in this order.
    report = (
        ('Accuracy: ', accuracy_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
    )
    for label, scorer in report:
        percentage = round(scorer(y_test,y_pred)*100,2)
        print(label,str(percentage)+'%')


# Scores on the training data for each fitted model: SVM, random forest, then
# the voting ensemble.
for fitted_model in (model_svm, model_rf, voting_best):
    metrics(X_train,y_train,fitted_model)

# Scores of the voting ensemble on the test data.
metrics(X_test,y_test,voting_best)