## Model Training - Species Classification
Reads the dataframe and uses ResNet50 to perform species classification on the Dangermond (JLDP) data.

### 1. Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np

In [2]:
# get and preprocess image for given image name
def process_image(img_name, img_path, new_shape):
    """Load ``{img_path}/{img_name}.jpg`` with OpenCV and resize it.

    Args:
        img_name: File name of the image without the ``.jpg`` extension.
        img_path: Directory that contains the image.
        new_shape: Target size handed to ``cv2.resize`` as its ``dsize``
            argument, i.e. ``(width, height)``.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        FileNotFoundError: If OpenCV could not read the file (missing or
            not decodable).
    """
    path = f"{img_path}/{img_name}.jpg"
    image = cv2.imread(path)

    # cv2.imread signals failure by returning None instead of raising; without
    # this check cv2.resize fails later with an error that hides the file name
    if image is None:
        raise FileNotFoundError(f"could not read image: {path}")

    return cv2.resize(image, new_shape)

def process_df(df_path, img_path, new_shape):
    """Read the annotation CSV and attach the resized image for every row.

    Args:
        df_path: Path of the CSV file; it must contain an ``img_name`` column.
        img_path: Directory holding the ``.jpg`` files named in ``img_name``.
        new_shape: Target size forwarded to ``process_image``.

    Returns:
        DataFrame with ``img_name`` renamed to ``image_name`` and an extra
        ``image`` column holding the result of ``process_image`` for each row.
    """
    # read df from df path
    df = pd.read_csv(df_path, index_col=False)

    # "Unnamed: 0" only exists when the CSV was written together with its
    # index; errors="ignore" keeps CSVs without that column loadable
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # get processed image
    df["image"] = df["img_name"].apply(lambda name: process_image(name, img_path, new_shape))

    # return a renamed copy instead of mutating in place
    return df.rename(columns={"img_name": "image_name"})

In [3]:
# keep only the classes that have more than n observations
def remove_classes(df, n):
    """Drop every row whose ``common_name`` occurs at most ``n`` times.

    Args:
        df: DataFrame with ``image_name`` and ``common_name`` columns.
        n: Threshold; a class is kept only if it has more than ``n``
            non-null ``image_name`` entries.

    Returns:
        The rows of ``df`` that belong to the kept classes.
    """
    # non-null image names per species
    per_species = df.groupby("common_name")["image_name"].count()

    # species that clear the threshold
    frequent = per_species.index[per_species > n]

    return df[df["common_name"].isin(frequent)]

In [4]:
# organize data in necessary format for ResNet
def format_data(df, path):
    """Write every image to ``{path}/{common_name}/{image_name}.jpg``.

    Images are grouped into one sub-folder per label. Files that already
    exist are left untouched, so the function can be re-run safely.

    Args:
        df: DataFrame with ``image``, ``image_name`` and ``common_name`` columns.
        path: Root directory of the dataset; created if it does not exist.

    Raises:
        OSError: If OpenCV reports that an image could not be written.
    """
    for img, img_name, cname in zip(df["image"].values, df["image_name"].values, df["common_name"].values):
        # one folder per label; makedirs also creates missing parents (such as
        # `path` itself) and does nothing when the folder already exists
        label_dir = os.path.join(path, f"{cname}")
        os.makedirs(label_dir, exist_ok=True)

        full_name = os.path.join(label_dir, f"{img_name}.jpg")

        # write file if it doesn't exist already
        if not os.path.isfile(full_name):
            # cv2.imwrite returns False on failure instead of raising
            if not cv2.imwrite(full_name, img):
                raise OSError(f"could not write image: {full_name}")

In [5]:
# annotation CSV, image folder and the size every image is resized to
df_path = "df.csv"
img_path = "images-bboxes/jldp"
img_shape = (224, 224)

# load the annotations with their images, then drop classes with <= 5 observations
df = remove_classes(process_df(df_path, img_path, img_shape), 5)

In [14]:
# root folder under which format_data writes one sub-folder per label
path = "tf-dataset/jldp"
format_data(df, path)

In [11]:
# df

### 2. Exploratory Data Analysis

In [12]:
# 31 classes before filtering
# 27 classes after removing those with <= 5 observations
# marked class imbalance!
(
    df[["common_name", "image_name"]]
    .groupby("common_name", as_index=False)
    .count()
    .sort_values(by="image_name", ascending=False)
)

Unnamed: 0,common_name,image_name
7,Coyote,4207
26,Wild Boar,2178
17,Mule Deer,852
8,Domestic Cattle,595
25,Western fence lizard,475
24,Western Gull,158
4,California Gull,151
12,Great Blue Heron,125
21,Turkey Vulture,113
16,Mallard,101


In [13]:
# number of distinct species left after filtering
num_classes = len(pd.unique(df["common_name"]))
print(f"There are {num_classes} classes")

There are 27 classes


### 3. Modeling

#### Resnet 50

In [15]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [25]:
def resnet_model(input_shape, batch_size, n_trainable, num_classes, optimizer="adam"):
    """Build and compile a ResNet50-based classifier with a new softmax head.

    Args:
        input_shape: Shape of one input image, passed to ``ResNet50``.
        batch_size: Unused; kept so existing calls keep working.
        n_trainable: Number of trailing ResNet50 layers left trainable; every
            layer before them is frozen. ``0`` freezes the whole base model.
        num_classes: Number of units of the final softmax layer.
        optimizer: Optimizer passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``n_trainable`` is negative.
    """
    if n_trainable < 0:
        raise ValueError(f"n_trainable must be >= 0, got {n_trainable}")

    # Load the pre-trained model
    # 177 layers
    # NOTE(review): include_top=True keeps ResNet50's own 1000-way ImageNet
    # classifier, so the Dense head below is fed its output rather than pooled
    # features. Consider include_top=False with pooling="avg"; left unchanged
    # here because weights saved from this architecture would no longer load.
    base_model = ResNet50(weights="imagenet", include_top=True, input_shape=input_shape)

    # freeze the first n - n_trainable layers; the cut-off is computed
    # explicitly because layers[:-0] is an empty slice and would freeze nothing
    n_frozen = max(len(base_model.layers) - n_trainable, 0)
    for layer in base_model.layers[:n_frozen]:
        layer.trainable = False

    model = Sequential([
        base_model,
        Flatten(),
        Dense(num_classes, activation="softmax")
    ])

    # honour the caller's optimizer instead of a hard-coded "adam"
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

    return model

In [29]:
def train_val_data(path, val_split, target_size, batch_size):
    """Create training and validation generators over a class-per-folder tree.

    Args:
        path: Directory with one sub-folder of images per class.
        val_split: Fraction of the images reserved for validation.
        target_size: ``(height, width)`` every image is resized to.
        batch_size: Number of images per batch, for both generators.

    Returns:
        Tuple ``(train_generator, validation_generator)``.
    """
    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_input,
        # use the caller's split instead of a hard-coded 0.2
        validation_split=val_split,
    )

    # options shared by both subsets so the two generators cannot drift apart
    # (the validation generator used to hard-code target_size=(224, 224))
    common = dict(
        target_size=target_size,
        batch_size=batch_size,
        class_mode="categorical",
    )

    train_generator = datagen.flow_from_directory(path, subset="training", **common)

    # shuffle=False keeps validation batches in file order, so the output of
    # model.predict() lines up with validation_generator.classes / .filenames
    validation_generator = datagen.flow_from_directory(
        path, subset="validation", shuffle=False, **common
    )

    return train_generator, validation_generator

In [28]:
# model settings
dim = 224  # side length used for both image dimensions
input_shape = (dim, dim, 3)
batch_size = 32
trainable_layers = 30  # trailing ResNet50 layers that resnet_model leaves unfrozen

# compiled model
# num_classes comes from the exploratory data analysis section above
model = resnet_model(input_shape, batch_size, trainable_layers, num_classes)

In [30]:
# folder with one sub-folder of images per class
path = "tf-dataset/jldp/"
val_split = 0.2
target_size = (dim, dim)

# train and validation data
train_generator, validation_generator = train_val_data(path, val_split, target_size, batch_size)

Found 7404 images belonging to 27 classes.
Found 1840 images belonging to 27 classes.


In [23]:
# fitting model
# trains for 3 epochs on the training generator and reports metrics on the
# validation generator
model.fit(
    train_generator,
    validation_data = validation_generator,
    epochs = 3,
    verbose = True
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2bb8966a0d0>

In [19]:
# result summary

# all data
# :-70, 2 epochs, accuracy 0.65, val accuracy 0.41
# :-30, 3 epochs, accuracy 0.74, val accuracy 0.77
# :-20, 5 epochs, accuracy 0.60, val accuracy 0.60

# removing labels with <= 5 observations
# :-30, 3 epochs, accuracy 0.87, val accuracy 0.88

# only human data
# :-30, 1 epoch, accuracy 0.4826, val accuracy 0.5306

# only human data - removing duplicates
# :-30, 3 epochs, accuracy 0.86, val accuracy 0.88


In [21]:
# save model
structure_path = "tf-dataset/models/model02.json"
weights_path = "tf-dataset/models/model02.h5"

# make sure the target folders exist; open() does not create missing parents
for target_path in (structure_path, weights_path):
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

# save structure
with open(structure_path, "w") as json_file:
    json_file.write(model.to_json())

# save weights
model.save_weights(weights_path)

In [20]:
# model output for the validation generator (reduced with argmax over axis 1 below)
# NOTE(review): rows follow the generator's iteration order; if the generator
# shuffles (flow_from_directory's default), they will not line up with
# validation_generator.classes — confirm before comparing with true labels
val_predictions = model.predict(validation_generator)



In [44]:
# position of the highest score in every prediction row
pred_indices = val_predictions.argmax(axis=1)

# class names in the order of the class_indices mapping
# (assumes that order matches the class index — TODO confirm)
labels = list(validation_generator.class_indices)
pred_labels = [labels[idx] for idx in pred_indices.tolist()]
# actual_labels = [labels[i] for i in validation_generator.classes]

In [None]:
# pred_labels

# test_df = pd.DataFrame()
# test_df['filename'] = test_images
# test_df['actual'] = label_encoder.inverse_transform(test_labels.argmax(axis=1))
# test_df['predicted'] = predicted_unnest
# test_df.loc[test_df['actual']==test_df['predicted'],'Same'] = True
# test_df.loc[test_df['actual']!=test_df['predicted'],'Same'] = False
# test_df.head(10)
# test_df.to_csv('../data/test_results.csv')