In [1]:
 # Import libraries:

import numpy as np
import pandas as pd
import os
import random
import shutil
import glob
from sklearn.utils import shuffle

# for image:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# for model:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import Conv2D, AveragePooling2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
tf.__version__


'2.13.0'

In [3]:
print(os.listdir('../input'))


['happy-whale-and-dolphin']


In [4]:
# Set paths:

train = '../input/happy-whale-and-dolphin/train_images'
test = '../input/happy-whale-and-dolphin/test_images'

In [5]:
print(os.listdir('../input/happy-whale-and-dolphin'))


['sample_submission.csv', 'train_images', 'train.csv', 'test_images']


In [6]:

print(len(os.listdir(train)))
print(len(os.listdir(test)))

51033
27956


In [7]:
print(os.listdir(train)[:5])
print(os.listdir(test)[:5])


['80b5373b87942b.jpg', 'e113b51585c677.jpg', '94eb976e25416c.jpg', '19a45862ab99cd.jpg', 'be9645065510e9.jpg']
['cd50701ae53ed8.jpg', '177269f927ed34.jpg', '9137934396d804.jpg', 'c28365a55a0dfe.jpg', '1a40b7b382923a.jpg']


In [8]:
# Set image paths:

train_jpg = tf.io.gfile.glob(train+'/*.jpg')
test_jpg = tf.io.gfile.glob(test+'/*.jpg')


In [9]:
# View train dataset:

train_data = pd.read_csv('../input/happy-whale-and-dolphin/train.csv', sep = ',')
train_data.head()

Unnamed: 0,image,species,individual_id
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9
1,000562241d384d.jpg,humpback_whale,1a71fbb72250
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392


In [10]:
train_data.sample(5)


Unnamed: 0,image,species,individual_id
47053,ec6c1e034defd3.jpg,humpback_whale,032fd951a415
47701,ef8c3866adb5dd.jpg,humpback_whale,98ff54a624f0
36248,b627f5b5525b5b.jpg,spinner_dolphin,50ee31d17a87
11452,3a0274c9881f5c.jpg,blue_whale,f163b890934d
31514,9e7b76dc661edc.jpg,beluga,9a149b8ff660


In [11]:
train_data.shape


(51033, 3)

In [12]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51033 entries, 0 to 51032
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   image          51033 non-null  object
 1   species        51033 non-null  object
 2   individual_id  51033 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [13]:
train_data.describe(include = 'all')


Unnamed: 0,image,species,individual_id
count,51033,51033,51033
unique,51033,30,15587
top,00021adfb725ed.jpg,bottlenose_dolphin,37c7aba965a5
freq,1,9664,400


In [14]:
train_data.isnull().sum()


image            0
species          0
individual_id    0
dtype: int64

In [15]:
train_data.species.value_counts()


species
bottlenose_dolphin           9664
beluga                       7443
humpback_whale               7392
blue_whale                   4830
false_killer_whale           3326
dusky_dolphin                3139
spinner_dolphin              1700
melon_headed_whale           1689
minke_whale                  1608
killer_whale                 1493
fin_whale                    1324
gray_whale                   1123
bottlenose_dolpin            1117
kiler_whale                   962
southern_right_whale          866
spotted_dolphin               490
sei_whale                     428
short_finned_pilot_whale      367
common_dolphin                347
cuviers_beaked_whale          341
pilot_whale                   262
long_finned_pilot_whale       238
white_sided_dolphin           229
brydes_whale                  154
pantropic_spotted_dolphin     145
globis                        116
commersons_dolphin             90
pygmy_killer_whale             76
rough_toothed_dolphin          60
frasie

In [16]:
sum(train_data.individual_id.duplicated())


35446

In [17]:

def Load_Image(path):
    image_path = tf.io.read_file(path)
    image_path = tf.image.decode_image(image_path, channels = 3)
    image_path = tf.image.convert_image_dtype(image_path, tf.float32)
    return image_path

In [18]:

# Fix mis-spellings from species variable:

train_data['species'] = train_data['species'].replace({'kiler_whale': 'killer_whale', 
                               'globis': 'pilot_whale', 
                               'beluga': 'beluga_whale',
                               'bottlenose_dolpin': 'bottlenose_dolphin',
                               'short_finned_pilot_whale': 'pilot_whale',
                               'long_finned_pilot_whale': 'pilot_whale'})

In [19]:
# Declare dolphin and whale variables for further analysis:

dolphin = ['bottlenose_dolphin','common_dolphin','dusky_dolphin', 'spinner_dolphin', 'spotted_dolphin', 'commersons_dolphin', 
           'white_sided_dolphin', 'rough_toothed_dolphin', 'pantropic_spotted_dolphin', 'frasiers_dolphin']


whale = ['melon_headed_whale', 'humpback_whale', 'false_killer_whale', 'belug_whale', 'minke_whale', 'fin_whale', 'blue_whale', 'gray_whale',
         'southern_right_whale', 'killer_whale', 'pilot_whale', 'sei_whale', 'cuviers_beaked_whale', 'brydes_whale', 'pygmy_killer_whale']


# Add to train dataset:
train_data['family'] = 'dolphin'

for ele in range(len(train_data)):
    if train_data.species[ele] in whale:
        train_data.family[ele] = 'whale'
        
        
train_data.sample(5)

Unnamed: 0,image,species,individual_id,family
12274,3dfa81046737ca.jpg,bottlenose_dolphin,eb0b97027ff4,dolphin
36920,b98d628823e1c3.jpg,bottlenose_dolphin,a43daee90cbc,dolphin
8311,29a742a1cd4ab2.jpg,bottlenose_dolphin,02cf681113ce,dolphin
23266,755cff817ee922.jpg,killer_whale,4b4c5f9ef032,whale
12154,3d6a01d965cb37.jpg,humpback_whale,970cdb141edc,whale


In [20]:
# Set globals:

random_state = 42
batch_size = 256
epochs = 3
seed = 42
target_size = (64, 64)
input_shape = (64, 64, 3)

In [21]:
train_data = shuffle(train_data, random_state = random_state)


In [22]:
data_norm = ImageDataGenerator(rescale = 1.0/255, validation_split = 0.20)


In [23]:
# Eet up training batching:

gen_train = data_norm.flow_from_dataframe(train_data,
                                          directory = train,
                                          x_col = 'image',
                                          y_col = 'species',
                                          subset = 'training',
                                          batch_size = batch_size,
                                          class_mode = 'categorical',
                                          seed = seed,
                                          target_size = target_size)

Found 40827 validated image filenames belonging to 25 classes.


In [24]:
# Set up testing/validation batching:

gen_valid = data_norm.flow_from_dataframe(train_data,
                                          directory = train,
                                          x_col = 'image',
                                          y_col = 'species',
                                          subset = 'validation',
                                          batch_size = batch_size,
                                          class_mode = 'categorical',
                                          seed = seed,
                                          target_size = target_size)

Found 10206 validated image filenames belonging to 25 classes.


In [25]:
# Build and Train Simple Model:

mod = Sequential()

# set up base model (simple base):
mod.add(Conv2D(filters = 32, kernel_size = (5, 5), strides = (1, 1), input_shape = input_shape, padding ='valid'))
mod.add(BatchNormalization())
mod.add(Activation(LeakyReLU()))

mod.add(Conv2D(filters = 32, kernel_size = (5, 5), strides = (1, 1), input_shape = input_shape, padding ='valid'))
mod.add(BatchNormalization())
mod.add(Activation(LeakyReLU()))
mod.add(MaxPooling2D(pool_size = (2, 2)))
mod.add(Dropout(0.1))

mod.add(Conv2D(filters = 64, kernel_size = (5, 5), strides = (1, 1), input_shape = input_shape, padding ='valid'))
mod.add(Activation(LeakyReLU()))
mod.add(BatchNormalization())

mod.add(Conv2D(filters = 128, kernel_size = (5, 5), strides = (1, 1), input_shape = input_shape, padding ='valid'))
mod.add(BatchNormalization())
mod.add(Activation(LeakyReLU()))
mod.add(AveragePooling2D(pool_size = (2, 2)))

mod.add(Conv2D(filters = 128, kernel_size = (5, 5), strides = (1, 1), input_shape = input_shape, padding ='valid'))
mod.add(BatchNormalization())
mod.add(Activation(LeakyReLU()))
mod.add(AveragePooling2D(pool_size = (2, 2)))
mod.add(Dropout(0.1))

mod.add(Flatten())

# set dense with activation as softmax:
mod.add(Dense(train_data.species.nunique(), activation = 'softmax'))

# set optimizer with small rate:
opt = Adam(learning_rate = 0.0001)

#set up loss function:
losses = tf.keras.losses.CategoricalCrossentropy() 

# compile model:
mod.compile(loss = 'categorical_crossentropy', metrics = ['accuracy'], optimizer = opt)

# view model summary:
mod.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 60, 60, 32)        2432      
                                                                 
 batch_normalization (Batch  (None, 60, 60, 32)        128       
 Normalization)                                                  
                                                                 
 activation (Activation)     (None, 60, 60, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 56, 56, 32)        25632     
                                                                 
 batch_normalization_1 (Bat  (None, 56, 56, 32)        128       
 chNormalization)                                                
                                                                 
 activation_1 (Activation)   (None, 56, 56, 32)        0

In [26]:
# Train model:

fit = mod.fit(gen_train, epochs = epochs, validation_data = gen_valid)
fit

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7e47d126f430>