In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
#
url = "https://raw.githubusercontent.com/ddw26/Team-Vand/circle_role_crkaide/output_data/df_keys.csv"
catch_df = pd.read_csv(url)
catch_df.head(5)

Unnamed: 0,Date file created,Month of intercept,Time of intercept,Data sampling period,Sub region of trip,Kind of day,Fish target species,Type of gear,Mode of fishing,Mode of fishing collapsed,Collapsed area of fishing,Distance from shore,Hours on boat,Hours fished,Catch fish,Number of fish avail for inspection
0,2020-03-16,2,1512,1,6,wd,SPOTTED SEATROUT,1,7,5,5,8,7.0,6.0,1,3
1,2020-03-16,1,1250,1,6,we,RED DRUM,1,7,5,5,8,5.0,4.5,2,0
2,2020-03-16,1,1250,1,6,we,RED DRUM,1,7,5,5,8,5.0,4.5,2,0
3,2020-03-16,2,1303,1,6,we,RED DRUM,1,7,5,5,8,6.0,4.0,2,0
4,2020-03-16,2,1303,1,6,we,RED DRUM,1,7,5,5,8,6.0,4.0,2,0


# Split the Data into Training and Testing

In [19]:
# Create our features
X = catch_df.drop('Date file created', axis=1)
X = catch_df.drop('Number of fish avail for inspection', axis=1)
X = pd.get_dummies(X)

# Create our target
y = catch_df.drop('Fish target species', axis=1)
y = catch_df['Number of fish avail for inspection']

In [20]:
X.describe()

Unnamed: 0,Month of intercept,Time of intercept,Data sampling period,Sub region of trip,Type of gear,Mode of fishing,Mode of fishing collapsed,Collapsed area of fishing,Distance from shore,Hours on boat,...,Fish target species_VERMILION SNAPPER,Fish target species_WAHOO,Fish target species_WEAKFISH,Fish target species_WHITE CRAPPIE,Fish target species_WHITE HAKE,Fish target species_WHITE MARLIN,Fish target species_WHITE PERCH,Fish target species_WINTER FLOUNDER,Fish target species_YELLOW PERCH,Fish target species_YELLOWFIN TUNA
count,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,...,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0,20238.0
mean,7.48572,1322.732681,4.000494,4.786343,1.014527,7.181243,5.658613,3.087509,4.43023,6.668421,...,0.000395,0.004447,0.001927,4.9e-05,4.9e-05,0.000198,0.007609,0.004595,9.9e-05,0.010327
std,2.098378,307.94064,1.071769,0.736666,0.312141,0.860629,1.331135,1.760925,3.234559,2.553845,...,0.019879,0.06654,0.043857,0.007029,0.007029,0.014058,0.086902,0.067634,0.009941,0.101099
min,1.0,2.0,1.0,4.0,1.0,6.0,4.0,1.0,1.0,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,1202.0,3.0,4.0,1.0,6.0,4.0,1.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7.0,1359.0,4.0,5.0,1.0,7.0,5.0,2.0,2.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,9.0,1532.0,5.0,5.0,1.0,8.0,7.0,5.0,8.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,12.0,2359.0,6.0,6.0,9.0,8.0,7.0,5.0,8.0,30.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
# Check the balance of our target values
y.value_counts()

0      8529
1      3372
2      1392
3       937
4       677
       ... 
86        1
122       1
138       1
154       1
127       1
Name: Number of fish avail for inspection, Length: 113, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [23]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier (n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [24]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.03684779252230299

In [25]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[1381,    0,    0, ...,    9,   19,   37],
       [   3,   18,   29, ...,   19,    7,   29],
       [   0,   10,   19, ...,    8,    4,   12],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0]])

In [26]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.64      1.00      0.78      0.80      0.62      2159
          1       0.41      0.02      0.99      0.04      0.15      0.02       812
          2       0.22      0.06      0.99      0.09      0.23      0.05       345
          3       0.09      0.01      1.00      0.02      0.10      0.01       215
          4       0.13      0.09      0.98      0.11      0.30      0.08       172
          5       0.00      0.00      1.00      0.00      0.00      0.00       155
          6       0.09      0.03      0.99      0.04      0.17      0.03       136
          7       0.09      0.01      1.00      0.02      0.10      0.01       106
          8       0.00      0.00      1.00      0.00      0.00      0.00        88
          9       0.00      0.00      1.00      0.00      0.00      0.00        68
         10       0.00      0.00      1.00      0.00      0.00      0.00        71
   

In [30]:
# List the features sorted in descending order by feature importance
feature_names = X.columns
sorted(zip(random_forest.feature_importances_, feature_names), reverse=True)

[(0.15504781314235758, 'Time of intercept'),
 (0.08543268914699498, 'Hours fished'),
 (0.08340676289643642, 'Hours on boat'),
 (0.06630277456159102, 'Month of intercept'),
 (0.038061778774948415, 'Sub region of trip'),
 (0.03460186424002212, 'Distance from shore'),
 (0.03448731467004479, 'Data sampling period'),
 (0.03235085602899737, 'Collapsed area of fishing'),
 (0.03231272749373478, 'Mode of fishing collapsed'),
 (0.030593306439159574, 'Kind of day_we'),
 (0.02870075380849276, 'Mode of fishing'),
 (0.02768614295244964, 'Kind of day_wd'),
 (0.025747107473912848, 'Fish target species_BLACK SEA BASS'),
 (0.021051408100046198, 'Fish target species_SCUP'),
 (0.020423934573802434, 'Date file created_2019-07-26'),
 (0.020292688729799786, 'Date file created_2018-07-20'),
 (0.017689446156923378, 'Date file created_2020-09-22'),
 (0.01502184229319972, 'Date file created_2019-09-18'),
 (0.01364782041700147, 'Fish target species_SPANISH MACKEREL'),
 (0.01283651779352666, 'Fish target species_A

### Easy Ensemble AdaBoost Classifier

In [31]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy = easy.fit(X_train, y_train)

In [32]:
# Calculated the balanced accuracy score
y_pred = easy.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.0001624128534735297

In [33]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 27,   0,   0, ...,  37, 184,  65],
       [  0,   0,   0, ...,  28,  77,  18],
       [  0,   0,   0, ...,  20,  29,  12],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [34]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.01      1.00      0.02      0.11      0.01      2159
          1       0.00      0.00      1.00      0.00      0.00      0.00       812
          2       0.00      0.00      1.00      0.00      0.00      0.00       345
          3       0.00      0.00      1.00      0.00      0.00      0.00       215
          4       0.00      0.00      1.00      0.00      0.00      0.00       172
          5       0.00      0.00      1.00      0.00      0.00      0.00       155
          6       0.00      0.00      1.00      0.00      0.00      0.00       136
          7       0.00      0.00      1.00      0.00      0.00      0.00       106
          8       0.00      0.00      1.00      0.00      0.00      0.00        88
          9       0.00      0.00      1.00      0.00      0.00      0.00        68
         10       0.00      0.00      1.00      0.00      0.00      0.00        71
   