## Model Training - Species Classification
Reads the image metadata dataframe and fine-tunes ResNet-50 to perform species classification on the Dangermond Preserve (JLDP) data.

### 1. Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np

In [2]:
# get and preprocess image for given image name
def process_image(img_name, img_path, new_shape):
    """Load one JPEG from disk and resize it.

    Args:
        img_name: File name without the ``.jpg`` extension.
        img_path: Directory that contains the image.
        new_shape: Target size, passed unchanged to ``cv2.resize``.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        FileNotFoundError: If ``{img_path}/{img_name}.jpg`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    path = f"{img_path}/{img_name}.jpg"

    # cv2.imread does not raise on a bad path; it returns None, which would
    # only surface later as an opaque assertion error inside cv2.resize.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Image not found: {path}")

    image = cv2.imread(path)
    if image is None:
        raise ValueError(f"Could not decode image: {path}")

    return cv2.resize(image, new_shape)

def process_df(df_path, img_path, new_shape):
    """Read the metadata CSV and attach a preprocessed image to every row.

    Args:
        df_path: Path of the CSV file; it must contain an ``img_name`` column.
        img_path: Directory that holds the ``.jpg`` files named by ``img_name``.
        new_shape: Target size forwarded to ``process_image``.

    Returns:
        The dataframe with an extra ``image`` column holding the value
        returned by ``process_image`` for each row.
    """
    # read df from df path
    df = pd.read_csv(df_path, index_col=False)

    # "Unnamed: 0" only exists when the CSV was written together with its
    # index; tolerate files saved without it instead of raising KeyError.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # get processed image
    df["image"] = df["img_name"].apply(lambda x: process_image(x, img_path, new_shape))
    return df

In [3]:
# remove classes with less than or equal to n observations
def remove_classes(df, n):
    """Keep only the rows whose species has more than ``n`` images.

    Args:
        df: Dataframe with ``img_name`` and ``common_name`` columns.
        n: Threshold; species with ``n`` or fewer images are dropped.

    Returns:
        The rows of ``df`` that belong to the remaining species.
    """
    # number of (non-null) image names recorded for each species
    per_species = df.groupby("common_name")["img_name"].count()
    frequent = per_species[per_species > n].index

    keep_mask = df["common_name"].isin(frequent)
    return df[keep_mask]

In [4]:
# organize data in necessary format for ResNet
def format_data(df, path):
    """Write every image to ``{path}/{common_name}/{img_name}.jpg``.

    This produces the one-folder-per-label layout that
    ``flow_from_directory`` reads.

    Args:
        df: Dataframe with ``image``, ``img_name`` and ``common_name`` columns.
        path: Root directory of the dataset; created if it does not exist.

    Files that already exist are left untouched. A failed write emits a
    warning instead of passing silently.
    """
    import warnings  # local import: not part of the notebook's import cell

    for img, img_name, cname in zip(df["image"].values, df["img_name"].values, df["common_name"].values):
        # one folder per label; makedirs also creates missing parents of `path`
        class_dir = os.path.join(path, str(cname))
        os.makedirs(class_dir, exist_ok=True)

        full_name = os.path.join(class_dir, f"{img_name}.jpg")

        # write file if it doesn't exist already
        if os.path.isfile(full_name):
            continue

        # cv2.imwrite reports failure through its return value, not an exception
        if not cv2.imwrite(full_name, img):
            warnings.warn(f"Could not write image: {full_name}")

In [5]:
# Inputs of the preprocessing step: metadata CSV, folder with the image files,
# and the target size handed to cv2.resize for every image.
df_path = "df.csv"
img_path = "images-bboxes/jldp"
img_shape = (224, 224)
# Load the metadata and attach the resized image array to each row.
df = process_df(df_path, img_path, img_shape)
# Drop species that have 5 or fewer images.
df = remove_classes(df, 5)

In [None]:
# Write the preprocessed images to disk, one sub-folder per species under `path`.
path = "tf-dataset/jldp"
format_data(df, path)

In [7]:
# df

Unnamed: 0,img_id,common_name,img_name,image
0,2ce50250-84ca-4a83-b263-2f24bf522d0d,Coyote,2ce50250-84ca-4a83-b263-2f24bf522d0d_932,"[[[17, 17, 17], [15, 15, 15], [13, 13, 13], [1..."
1,3b45f93b-1539-4a7a-8577-8de6a12015e1,Coyote,3b45f93b-1539-4a7a-8577-8de6a12015e1_647,"[[[11, 11, 11], [11, 11, 11], [11, 11, 11], [1..."
2,85f21059-cb89-48bf-98c6-30440c02f0f5,Coyote,85f21059-cb89-48bf-98c6-30440c02f0f5_947,"[[[18, 18, 18], [19, 19, 19], [19, 19, 19], [1..."
3,ed0e7715-c052-4599-a366-bd50134aca77,Coyote,ed0e7715-c052-4599-a366-bd50134aca77_939,"[[[29, 29, 29], [28, 28, 28], [27, 27, 27], [3..."
4,1f29dc37-29b1-4c7b-937d-8bc4e31372ec,Coyote,1f29dc37-29b1-4c7b-937d-8bc4e31372ec_937,"[[[30, 30, 30], [32, 32, 32], [34, 34, 34], [3..."
...,...,...,...,...
11921,jldp:02ea11cea3794ee22cffdd66235a8e76,Mule Deer,jldp:02ea11cea3794ee22cffdd66235a8e76_6150,"[[[19, 19, 19], [19, 19, 19], [19, 19, 19], [1..."
11922,jldp:7b1e0549b33d2ca922afe2423cf3367c,Coyote,jldp:7b1e0549b33d2ca922afe2423cf3367c_6151,"[[[58, 58, 58], [58, 58, 58], [57, 57, 57], [5..."
11923,jldp:c2d28b49b86c2f8c1baec791134eab53,Coyote,jldp:c2d28b49b86c2f8c1baec791134eab53_6152,"[[[54, 54, 54], [54, 54, 54], [53, 53, 53], [5..."
11924,jldp:269e05af4f8b1fcc803537a147b4ec95,Coyote,jldp:269e05af4f8b1fcc803537a147b4ec95_6153,"[[[40, 40, 40], [40, 40, 40], [40, 40, 40], [4..."


### 2. Exploratory Data Analysis

In [8]:
# 33 classes in the raw data
# 29 classes after removing those with <= 5 observations
# The per-species image counts below show a marked class imbalance.
(
    df[["common_name", "img_name"]]
    .groupby("common_name", as_index=False)
    .count()
    .sort_values(by="img_name", ascending=False)
)

Unnamed: 0,common_name,img_name
8,Coyote,5409
18,Mule Deer,2265
28,Wild Boar,2178
9,Domestic Cattle,595
27,Western fence lizard,475
26,Western Gull,158
5,California Gull,151
13,Great Blue Heron,125
23,Turkey Vulture,113
17,Mallard,101


In [9]:
# Number of distinct species labels left after filtering; passed to
# resnet_model below as the size of the softmax output layer.
num_classes = len(df["common_name"].unique())
print(f"There are {num_classes} classes")

There are 29 classes


### 3. Modeling

#### Resnet 50

In [17]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [18]:
def resnet_model(input_shape, batch_size, n_trainable, num_classes, optimizer="adam"):
    """Build and compile a ResNet50-based classifier.

    Args:
        input_shape: Shape of one input image, forwarded to ``ResNet50``.
        batch_size: Unused; kept so existing callers keep working.
        n_trainable: Number of trailing ResNet50 layers left trainable. All
            earlier layers are frozen. ``0`` freezes the whole base model;
            a value larger than the layer count leaves everything trainable.
        num_classes: Number of units of the final softmax layer.
        optimizer: Optimizer passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Load the pre-trained model
    # 177 layers
    # NOTE(review): include_top=True keeps the ImageNet classification head, so
    # the Dense layer below is fed ResNet50's final output rather than pooled
    # convolutional features. Switching to include_top=False would change the
    # architecture and invalidate saved weights - confirm before changing.
    base_model = ResNet50(weights="imagenet", include_top=True, input_shape=input_shape)

    # freeze the first n - n_trainable layers
    # (an explicit count is used because layers[:-0] is an empty slice, which
    # would freeze nothing when n_trainable == 0)
    base_layers = base_model.layers
    n_frozen = max(len(base_layers) - max(n_trainable, 0), 0)
    for layer in base_layers[:n_frozen]:
        layer.trainable = False

    model = Sequential([
        base_model,
        Flatten(),
        Dense(num_classes, activation="softmax")
    ])

    # honour the caller's optimizer instead of always using "adam"
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

    return model

In [19]:
def train_val_data(path, val_split, target_size, batch_size):
    """Create training and validation generators from a class-per-folder dataset.

    Args:
        path: Root directory containing one sub-folder per class.
        val_split: Fraction of the images reserved for validation.
        target_size: Size every image is resized to when it is loaded.
        batch_size: Number of images per batch.

    Returns:
        ``(train_generator, validation_generator)``.
    """
    datagen = ImageDataGenerator(
        preprocessing_function = preprocess_input,
        # use the caller's split instead of a hard-coded 0.2
        validation_split = val_split
    )

    # options both subsets must agree on
    shared = dict(
        target_size = target_size,
        batch_size = batch_size,
        class_mode = "categorical"
    )

    train_generator = datagen.flow_from_directory(
        path,
        subset = "training",
        **shared
    )

    validation_generator = datagen.flow_from_directory(
        path,
        subset = "validation",
        # keep file order fixed so model.predict() output lines up with
        # validation_generator.classes / .filenames
        shuffle = False,
        **shared
    )

    return train_generator, validation_generator

In [20]:
# Model hyperparameters.
dim = 224  # side length of the square input images
input_shape = (dim, dim, 3)
batch_size = 32
trainable_layers = 30  # trailing ResNet50 layers that are left unfrozen

# compiled model
model = resnet_model(input_shape, batch_size, trainable_layers, num_classes)

In [21]:
# Root folder of the exported dataset (one sub-folder per species).
path = "tf-dataset/jldp/"
# Requested validation fraction.
val_split = 0.2
target_size = (dim, dim)

# training and validation generators
train_generator, validation_generator = train_val_data(path, val_split, target_size, batch_size)

Found 8933 images belonging to 29 classes.
Found 2223 images belonging to 29 classes.


In [22]:
# Fit the compiled model on the training generator for 5 epochs, evaluating on
# the validation generator as well.
model.fit(
    train_generator,
    validation_data = validation_generator,
    epochs = 5,
    verbose = True
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2bfac06dcd0>

In [19]:
# result summary

# only human data - removing duplicates
# :-30, 3 epochs, accuracy 0.86, val accuracy 0.88

# only human data - removing duplicates + animl data
# :-30, 3 epochs, accuracy 0.72, val accuracy 0.73

In [21]:
# save model
structure_path = "tf-dataset/models/model01.json"
weights_path = "tf-dataset/models/model01.h5"

# make sure the output folders exist; open() raises FileNotFoundError otherwise
os.makedirs(os.path.dirname(structure_path), exist_ok=True)
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# save structure (architecture only, as JSON)
with open(structure_path, "w") as json_file:
    json_file.write(model.to_json())

# save weights
model.save_weights(weights_path)

In [20]:
# Per-class softmax scores for the images served by the validation generator.
# NOTE(review): flow_from_directory shuffles by default; unless the validation
# generator was created with shuffle=False, the row order here will not line up
# with validation_generator.classes - confirm before comparing to true labels.
val_predictions = model.predict(validation_generator)



In [44]:
# class names in the order of the generator's class_indices mapping
labels = list(validation_generator.class_indices)

# most likely class per image, first as an index and then as a species name
pred_indices = val_predictions.argmax(axis=1)
pred_labels = [labels[idx] for idx in pred_indices]
# actual_labels = [labels[i] for i in validation_generator.classes]

In [None]:
# pred_labels

# test_df = pd.DataFrame()
# test_df['filename'] = test_images
# test_df['actual'] = label_encoder.inverse_transform(test_labels.argmax(axis=1))
# test_df['predicted'] = predicted_unnest
# test_df.loc[test_df['actual']==test_df['predicted'],'Same'] = True
# test_df.loc[test_df['actual']!=test_df['predicted'],'Same'] = False
# test_df.head(10)
# test_df.to_csv('../data/test_results.csv')