<a href="https://colab.research.google.com/github/dakilaledesma/669_Final_Project/blob/main/669_Fishbase_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#@title Copying datasets
! mkdir data
! cp "drive/MyDrive/UNC/Classes/BIOL 669/fishbase_iucn.csv" data/fishbase_iucn.csv

mkdir: cannot create directory ‘data’: File exists


In [8]:
#@title Ignoring all warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
#@title Training a model to see which columns have useful information
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np

iucn_data = pd.read_csv("data/fishbase_iucn.csv")
labels = iucn_data["redlistCategory"].str.lower()
label_mapping = {}
for y_cat in labels:
    if y_cat not in label_mapping.keys():
      label_mapping[y_cat] = len(label_mapping.values())

print(label_mapping.items())

train_columns = [c for c in iucn_data.columns if c not in ["specCode", "scientificName", "redlistCategory"]]
for t in train_columns:
  xy = iucn_data[[t, "redlistCategory"]]
  # xy = xy[xy["redlistCategory"].str.lower() != "least concern"]
  xy = xy.dropna()
  X = xy[t]
  y = []
  for label in list(xy['redlistCategory'].str.lower()):
    y_arr = np.zeros((len(label_mapping.values())))
    y_arr[label_mapping.get(label)] = 1
    y.append(y_arr)
  y = np.array(y)

  try:
    num_classes = y.shape[1]
  except IndexError:
    continue

  model = Sequential()
  model.add(Dense(1, input_shape=(1,), activation='relu'))
  model.add(Dense(50, activation='relu'))
  model.add(Dense(num_classes, activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  try:
    history = model.fit(X, y, epochs=20, batch_size=8, verbose=0, validation_split=0.2)
  except:
    continue
  print(f"{t}: {max(history.history['val_accuracy'])}")

dict_items([('least concern', 0), ('data deficient', 1), ('vulnerable', 2), ('not applicable', 3), ('endangered', 4), ('near threatened', 5), ('critically endangered', 6), ('regionally extinct', 7), ('lower risk/least concern', 8), ('extinct', 9), ('extinct in the wild', 10), ('lower risk/near threatened', 11)])
SpecCode: 0.7243391871452332
SpeciesRefNo: 0.7243391871452332


In [None]:
#@title From the above analysis we can ascertain that the following are the useful columns
useful_cols = ["DepthRangeShallow", "DepthRangeDeep", "DepthRangeComShallow", "DepthRangeComDeep", "DepthComRef", "LongevityWild", "LongevityCaptive", "Length", "LengthFemale", "CommonLength", "CommonLengthF", "Weight", "WeightFemale"]

In [None]:
#@title Defining the multi-layer perceptron model and turning it into a function to save space
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import pandas as pd
import numpy as np

def teh_nn(coi, incl_lc=True, num_layers=10, e=100, bs=8, vs=0.2):
  iucn_data = pd.read_csv("data/fishbase_iucn.csv")
  labels = iucn_data["redlistCategory"].str.lower()
  label_mapping = {}
  for y_cat in labels:
      if y_cat not in label_mapping.keys():
        label_mapping[y_cat] = len(label_mapping.values())

  xy = iucn_data[coi + ["redlistCategory"]]
  if not incl_lc:
    xy = xy[xy["redlistCategory"].str.lower() != "least concern"]
  xy = xy.dropna()
  X = xy[coi]
  y = []
  for label in list(xy['redlistCategory'].str.lower()):
    y_arr = np.zeros((len(label_mapping.values())))
    y_arr[label_mapping.get(label)] = 1
    y.append(y_arr)
  y = np.array(y)

  num_classes = y.shape[1]
  num_inputs = X.shape[1]

  model = Sequential()
  model.add(Dense(num_inputs, input_shape=(num_inputs,), activation='relu'))
  for n in range(num_layers):
    model.add(Dense(50, activation='relu'))
  model.add(Dense(num_classes, activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  history = model.fit(X, y, epochs=e, batch_size=bs, verbose=0, validation_split=vs)
  # print(f"Samples: {y.shape[0]} Acc: {max(history.history['val_accuracy'])}")
  return max(history.history['val_accuracy'])

In [None]:
#@title Making all column combinations for brute force checking and some helper functions
from itertools import combinations
from collections import OrderedDict
from tqdm import tqdm

col_combinations = sum([list(map(list, combinations(useful_cols, i))) for i in range(len(useful_cols) + 1)], [])
col_combinations.remove([])

def print_dict(dct):
    for k, v in list(dct.items())[:500]:  
        print(f"{k}: {v}")

In [None]:
#@title Brute force column combination accuracy checking (without least concern species)
acc_dict_lcf = {}
for coi in tqdm([[]]):
  try:
    coi_acc = teh_nn(coi, e=10, incl_lc=False)
  except:
    continue
  str_coi = ', '.join(coi)
  acc_dict_lcf[str_coi] = coi_acc

o_acc_dict_lcf = OrderedDict(sorted(acc_dict_lcf.items(), key=lambda t: t[1], reverse=True))
print_dict(o_acc_dict_lcf)

100%|██████████| 8191/8191 [1:50:51<00:00,  1.23it/s]

DepthRangeComDeep, CommonLengthF: 1.0
DepthComRef, CommonLengthF: 1.0
LongevityCaptive, WeightFemale: 1.0
DepthRangeShallow, DepthRangeComShallow, WeightFemale: 1.0
DepthRangeDeep, DepthRangeComDeep, CommonLengthF: 1.0
DepthRangeDeep, DepthComRef, CommonLengthF: 1.0
DepthRangeDeep, LongevityCaptive, CommonLengthF: 1.0
DepthRangeDeep, CommonLengthF, WeightFemale: 1.0
DepthRangeComShallow, DepthRangeComDeep, WeightFemale: 1.0
DepthRangeComShallow, Length, WeightFemale: 1.0
DepthRangeComShallow, LengthFemale, WeightFemale: 1.0
DepthRangeComShallow, CommonLength, WeightFemale: 1.0
DepthRangeComDeep, DepthComRef, CommonLengthF: 1.0
DepthRangeComDeep, LongevityWild, WeightFemale: 1.0
DepthRangeComDeep, Length, CommonLengthF: 1.0
DepthRangeComDeep, LengthFemale, CommonLengthF: 1.0
DepthRangeComDeep, CommonLength, CommonLengthF: 1.0
DepthRangeComDeep, Weight, WeightFemale: 1.0
DepthComRef, LongevityWild, WeightFemale: 1.0
DepthComRef, Length, CommonLengthF: 1.0
DepthComRef, LengthFemale, Weigh




In [None]:
#@title Brute force column combination accuracy checking (with least concern species)
acc_dict_lct = {}
for coi in tqdm(col_combinations):
  try:
    coi_acc = teh_nn(coi, e=10, incl_lc=True)
  except:
    continue
  str_coi = ', '.join(coi)
  acc_dict_lct[str_coi] = coi_acc

o_acc_dict_lct = OrderedDict(sorted(acc_dict_lct.items(), key=lambda t: t[1], reverse=True))
print_dict(o_acc_dict_lct)

100%|██████████| 8191/8191 [2:28:13<00:00,  1.09s/it]

DepthRangeShallow, LongevityCaptive, CommonLengthF: 1.0
DepthRangeShallow, CommonLengthF, WeightFemale: 1.0
DepthRangeDeep, LongevityCaptive, CommonLengthF: 1.0
DepthRangeComShallow, CommonLengthF, WeightFemale: 1.0
DepthRangeComShallow, Weight, WeightFemale: 1.0
DepthRangeComDeep, LongevityWild, CommonLengthF: 1.0
DepthRangeComDeep, LongevityCaptive, LengthFemale: 1.0
DepthRangeComDeep, CommonLengthF, WeightFemale: 1.0
DepthComRef, LongevityWild, CommonLengthF: 1.0
DepthComRef, LongevityCaptive, LengthFemale: 1.0
DepthComRef, CommonLengthF, WeightFemale: 1.0
LongevityWild, LongevityCaptive, WeightFemale: 1.0
LongevityCaptive, Length, CommonLengthF: 1.0
LongevityCaptive, CommonLength, CommonLengthF: 1.0
LongevityCaptive, CommonLength, WeightFemale: 1.0
LongevityCaptive, CommonLengthF, Weight: 1.0
DepthRangeShallow, DepthRangeComShallow, LongevityWild, CommonLengthF: 1.0
DepthRangeShallow, DepthRangeComShallow, LengthFemale, WeightFemale: 1.0
DepthRangeShallow, DepthRangeComShallow, Com




In [None]:
#@title Depth ranges with least concern categories
#@markdown Acc: 81.63%
cols_of_interest = ["DepthRangeShallow", "DepthRangeDeep", "DepthRangeComShallow", "DepthRangeComDeep", "DepthComRef"]
teh_nn(cols_of_interest, incl_lc=True)

  


Samples: 1770 Acc: 0.8163841962814331


In [None]:
#@title Depth ranges without least concern categories
#@markdown Acc: 33.76%
cols_of_interest = ["DepthRangeShallow", "DepthRangeDeep", "DepthRangeComShallow", "DepthRangeComDeep", "DepthComRef"]
teh_nn(cols_of_interest, incl_lc=False)

  


Samples: 383 Acc: 0.33766233921051025


In [None]:
#@title Longevities with least concern categories
#@markdown Acc: 79.17%
cols_of_interest = ["LongevityWild", "LongevityCaptive"]
teh_nn(cols_of_interest, incl_lc=True)

  


Samples: 120 Acc: 0.7916666865348816


In [None]:
#@title Longevities without least concern categories, highly variable acc depending on validation split seed
#@markdown Acc: 62.5%
cols_of_interest = ["LongevityWild", "LongevityCaptive"]
teh_nn(cols_of_interest, incl_lc=False)

  This is separate from the ipykernel package so we can avoid doing imports until


Samples: 39 Acc: 0.625


In [None]:
#@title Length with least concern categories
#@markdown Acc: 80.00%
cols_of_interest = ["Length", "LengthFemale", "CommonLength", "CommonLengthF"]
teh_nn(cols_of_interest, incl_lc=True)

  after removing the cwd from sys.path.


Samples: 24 Acc: 0.800000011920929


In [None]:
#@title Length without least concern categories
#@markdown Acc: Untested%
cols_of_interest = ["Length", "LengthFemale", "CommonLength", "CommonLengthF"]
teh_nn(cols_of_interest, incl_lc=False)

In [None]:
#@title Longevities without least concern categories, highly variable acc depending on validation split seed
#@markdown Acc: 75%
cols_of_interest = ["LongevityWild", "LongevityCaptive"]
teh_nn(cols_of_interest, incl_lc=False)

  after removing the cwd from sys.path.


0.75